1 /* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2
3 /*********************************************************************
4 * Clustal Omega - Multiple sequence alignment
5 *
6 * Copyright (C) 2010 University College Dublin
7 *
8 * Clustal-Omega is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This file is part of Clustal-Omega.
14 *
15 ********************************************************************/
16
17 /*
18 * RCS $Id: hhalignment-C.h 316 2016-12-16 16:14:39Z fabian $
19 */
20
21
22 /*
23 * Changelog: Michael Remmert made changes to hhalign stand-alone code
24 * FS implemented some of the changes on 2010-10-28 -> MR1
25 *
26 * Note: MR seems to have changed all [aijk]++ to ++[aijk],
27 * FS did not do that on 2010-10-28
28 */
29
30 // hhalignment.C
31
32 //////////////////////////////////////////////////////////////////////////////
33 //// Class Alignment
34 //////////////////////////////////////////////////////////////////////////////
35
36 // hhalignment.C
37
38 #ifndef MAIN
39 #define MAIN
40 #include <iostream> // cin, cout, cerr
41 #include <fstream> // ofstream, ifstream
42 #include <stdio.h> // printf
43 using std::cout;
44 using std::cerr;
45 using std::endl;
46 using std::ios;
47 using std::ifstream;
48 using std::ofstream;
49 #include <stdlib.h> // exit
50 #include <string> // strcmp, strstr
51 #include <math.h> // sqrt, pow
52 #include <limits.h> // INT_MIN
53 #include <float.h> // FLT_MIN
54 #include <time.h> // clock
55 #include <ctype.h> // islower, isdigit etc
56 #include "util-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
57 #include "list.h" // list data structure
58 #include "hash.h" // hash data structure
59 #include "hhdecl-C.h"
60 #include "hhutil-C.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
61 #include "hhhmm.h"
62 #endif
63
64 #ifdef HAVE_CONFIG_H
65 #include "config.h"
66 #endif
67 //#include "new_new.h" /* memory tracking */
68
69
70 enum {KEEP_NOT = 0, KEEP_CONDITIONALLY, KEEP_ALWAYS};
71
72 //////////////////////////////////////////////////////////////////////////////
73 // Class Alignment
74 //////////////////////////////////////////////////////////////////////////////
75
76
77 //////////////////////////////////////////////////////////////////////////////
78 // Object constructor
79 //////////////////////////////////////////////////////////////////////////////
Alignment(int maxseq,int maxres)80 Alignment::Alignment(int maxseq, int maxres)
81 {
82
83 //printf(">>>>>>>>%s:%s:%d: maxseq=%d, maxres=%d\n", __FUNCTION__, __FILE__, __LINE__, maxseq, maxres); /* (FS) */
84 longname = new(char[DESCLEN]);
85 sname = new char*[maxseq+2]; /* MR1 */
86 seq = new char*[maxseq+2]; /* MR1 */
87 l = new int[maxres];
88 X = new char*[maxseq+2]; /* MR1 */
89 I = new short unsigned int*[maxseq+2]; /* MR1 */
90 keep = new char[maxseq+2]; /* MR1 */
91 display = new char[maxseq+2]; /* MR1 */
92 wg = new float[maxseq+2]; /* MR1 */
93 nseqs = new int[maxres+2]; /* MR1 */
94 N_in=L=0;
95 nres=NULL; // number of residues per sequence k
96 first=NULL; // first residue in sequence k
97 last=NULL; // last residue in sequence k
98 ksort=NULL; // sequence indices sorted by descending nres[k]
99 name[0]='\0'; // no name defined yet
100 longname[0]='\0'; // no name defined yet
101 fam[0]='\0'; // no name defined yet
102 file[0]='\0'; // no name defined yet
103 readCommentLine = '0'; /* MR1 */
104 }
105
106 //////////////////////////////////////////////////////////////////////////////
107 // Object destructor
108 //////////////////////////////////////////////////////////////////////////////
~Alignment()109 Alignment::~Alignment()
110 {
111 delete[] longname; longname = NULL;
112 for(int k=0; k<N_in; k++)
113 {
114 delete[] sname[k]; sname[k] = NULL;
115 delete[] seq[k]; seq[k] = NULL;
116 delete[] X[k]; X[k] = NULL;
117 delete[] I[k]; I[k] = NULL;
118 }
119 delete[] sname; sname = NULL;
120 delete[] seq; seq = NULL;
121 delete[] X; X = NULL;
122 delete[] I; I = NULL;
123 delete[] l; l = NULL;
124 delete[] keep; keep = NULL;
125 delete[] display; display = NULL;
126 delete[] wg; wg = NULL;
127 delete[] nseqs; nseqs = NULL;
128 delete[] nres; nres = NULL;
129 delete[] first; first = NULL;
130 delete[] last; last = NULL;
131 delete[] ksort; ksort = NULL;
132 }
133
134
135 /**
136 * @brief Reads in an alignment from file into matrix seq[k][l] as ASCII
137 */
138 void
Read(FILE * inf,char infile[],char * firstline)139 Alignment::Read(FILE* inf, char infile[], char* firstline)
140 {
141 int l; // Postion in alignment incl. gaps (first=1)
142 int h; // Position in input line (first=0)
143 int k; // Index of sequence being read currently (first=0)
144 char line[LINELEN]=""; // input line
145 //char cur_seq[MAXCOL]; // Sequence currently read in
146 char *cur_seq=new char[par.maxColCnt];
147 char* cur_name; // Sequence currently read in
148 int linenr=0; // current line number in input file
149 char skip_sequence=0;
150 RemoveExtension(file,infile); //copy rootname (w/o path) of infile into file variable of class object
151
152 kss_dssp=ksa_dssp=kss_pred=kss_conf=kfirst=-1;
153 n_display=0;
154 N_in=0;
155 N_filtered=0;
156 N_ss=0;
157 cur_seq[0]=' '; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq)
158 l=1; k=-1;
159
160 // Does firstline already contain first line of file?
161 if (firstline!= NULL) strcpy(line,firstline);
162
163 /////////////////////////////////////////////////////////////////////////
164 // Read infile line by line
165 /* FIXME: not safe to use MAXSEQ, however, don't think we ever get here (FS) */
166 while(firstline || (fgetline(line,LINELEN,inf) && (k<MAXSEQ))) /* FIXME: FS introduced () around &&, precedence! MR1 */
167 {
168 linenr++;
169 firstline=NULL;
170 if (line[0]=='>') //line contains sequence name
171 {
172 if (k>=MAXSEQ-1)
173 {
174 if (v>=1 && k>=MAXSEQ)
175 cerr<<endl<<"WARNING: maximum number "<<MAXSEQ<<" of sequences exceded in file "<<infile<<"\n";
176 break;
177 }
178 cur_name=line+1; //beginning of current sequence name
179 if (k>=0) //if this is at least the second name line
180 {
181 if (strlen(cur_seq)==0)
182 {
183 cerr<<endl<<"Error: sequence "<<sname[k]<<" contains no residues."<<endl;
184 exit(1);
185 }
186
187 // Create space for residues and paste new sequence in
188 seq[k]=new char[strlen(cur_seq)+2];
189 if (!seq[k]) MemoryError("array for input sequences");
190 X[k]=new char[strlen(cur_seq)+2];
191 if (!X[k]) MemoryError("array for input sequences");
192 I[k]=new short unsigned int[strlen(cur_seq)+2];
193 if (!I[k]) MemoryError("array for input sequences");
194 strcpy(seq[k],cur_seq);
195 }
196 skip_sequence=0;
197
198 k++;
199 l=1; //position in current sequence (first=1)
200
201 // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out)
202 // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out)
203 /* {KEEP_NOT=0, KEEP_CONDITIONALLY=1, KEEP_ALWAYS=2} */
204 if (line[1]=='@') cur_name++; //skip @-character in name
205 if (!strncmp(line,">ss_dssp",8)) {
206 if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=KEEP_NOT; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
207 }
208 else if (!strncmp(line,">sa_dssp",8)) {
209 if (ksa_dssp<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;}
210 }
211 else if (!strncmp(line,">ss_pred",8)) {
212 if (kss_pred<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;}
213 }
214 else if (!strncmp(line,">ss_conf",8)) {
215 if (kss_conf<0) {display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;}
216 }
217 else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) {
218 display[k]=KEEP_ALWAYS; n_display++; keep[k]=KEEP_NOT; N_ss++;
219 }
220 else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_"
221 skip_sequence=1; k--; continue;
222 }
223 //store first real seq
224 else if (kfirst<0)
225 {
226 char word[NAMELEN];
227 strwrd(word,line); // Copies first word in ptr to str
228 if (strstr(word,"_consensus"))
229 {display[k]=2; keep[k]=0; n_display++; kfirst=k;} /* MR1 */
230 else
231 {display[k]=keep[k]=KEEP_ALWAYS; n_display++; kfirst=k;}
232 }
233 //store all sequences
234 else if (par.mark==0) {display[k]=keep[k]=KEEP_CONDITIONALLY; n_display++;}
235 //store sequences up to nseqdis
236 else if (line[1]=='@'&& n_display-N_ss<par.nseqdis) {display[k]=keep[k]=KEEP_ALWAYS; n_display++;}
237 else {display[k]=KEEP_NOT; keep[k]=KEEP_CONDITIONALLY;}
238
239 // store sequence name
240 if (v>=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]);
241 sname[k] = new char[strlen(cur_name)+1];
242 if (!sname[k]) {MemoryError("array for sequence names");}
243 strcpy(sname[k],cur_name);
244 } // end if(line contains sequence name)
245
246 else if (line[0]=='#') // Commentary line?
247 {
248 // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174)
249 if (name[0]) continue; // if already name defined: skip commentary line
250 char *ptr1, *ptr2;
251 ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number
252 strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname
253 longname[DESCLEN-1]='\0';
254 strtr(longname,""," ");
255 ptr2=strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number
256 // strcpy(fam,ptr1); // copy AC number to fam
257 // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off
258 // strcut_(ptr2); // cut after first word ...
259 strcpy(name,ptr1); // ... and copy first word into name
260 readCommentLine = '1'; /* MR1 */
261 }
262
263 //line contains sequence residues or SS information and does not belong to a >aa_ sequence
264 else if (!skip_sequence)
265 {
266 if (v>=4) cout<<line<<"\n"; //DEBUG
267 if (k==-1 && v)
268 {
269 cerr<<endl<<"WARNING: No sequence name preceding following line in "<<infile<<":\n\'"<<line<<"\'\n";
270 continue;
271 }
272
273 h=0; //counts characters in current line
274
275 // Check whether all characters are correct; store into cur_seq
276 if (keep[k] || (k == kfirst) ) // normal line containing residues /* MR1 */
277 {
278 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
279 {
280 if (aa2i(line[h])>=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1)
281 {cur_seq[l]=line[h]; l++;}
282 else if (aa2i(line[h])==-2 && v)
283 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
284 h++;
285 }
286 }
287 else if (k==kss_dssp) // lines with dssp secondary structure states (. - H E C S T G B)
288 {
289 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
290 {
291 if (ss2i(line[h])>=0 && ss2i(line[h])<=7)
292 {cur_seq[l]=ss2ss(line[h]); l++;}
293 else if (v)
294 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
295 h++;
296 }
297 }
298 else if (k==ksa_dssp) // lines with dssp solvent accessibility states (. - ???)
299 {
300 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
301 {
302 if (sa2i(line[h])>=0)
303 cur_seq[l++]=line[h];
304 else if (v)
305 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
306 h++;
307 }
308 }
309 else if (k==kss_pred) // lines with predicted secondary structure (. - H E C)
310 {
311 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
312 {
313 if (ss2i(line[h])>=0 && ss2i(line[h])<=3)
314 {cur_seq[l]=ss2ss(line[h]); l++;}
315 else if (v)
316 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<h<<" in line "<<linenr<<" of "<<infile<<"\n";
317 h++;
318 }
319 }
320 else if (k==kss_conf) // lines with confidence values should contain only 0-9, '-', or '.'
321 {
322 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
323 {
324 if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9'))
325 {cur_seq[l]=line[h]; l++;}
326 else if (v)
327 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
328 h++;
329 }
330 }
331 else if (display[k]) // other lines such as >sa_pred etc
332 {
333 while (h<LINELEN && line[h]>'\0' && l</*MAXCOL*/par.maxColCnt-1)
334 {
335 if (line[h]=='-' || line[h]=='.' || (line[h]>='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B'))
336 {cur_seq[l]=line[h]; l++;}
337 else if (v)
338 cerr<<endl<<"WARNING: invalid symbol \'"<<line[h]<<"\' at pos. "<<l<<" in line "<<linenr<<" of "<<infile<<"\n";
339 h++;
340 }
341 }
342 if (v && l>=/*MAXCOL*/par.maxColCnt-1)
343 {
344 cerr<<endl<<"WARNING: maximum number of residues "<</*MAXCOL*/par.maxColCnt-2<<" exceded in sequence "<<sname[k]<<"\n";
345 skip_sequence=1;
346 }
347 cur_seq[l]='\0'; //Ensure that cur_seq ends with a '\0' character
348 } //end else
349
350 }
351 /////////////////////////////////////////////////////////////////////////
352
353
354 if (k>=0) //if at least one sequence was read in
355 {
356 seq[k]=new char[strlen(cur_seq)+2];
357 if (!seq[k]) MemoryError("array for input sequences");
358 X[k]=new char[strlen(cur_seq)+2];
359 if (!X[k]) MemoryError("array for input sequences");
360 I[k]=new short unsigned int[strlen(cur_seq)+2];
361 if (!I[k]) MemoryError("array for input sequences");
362 strcpy(seq[k],cur_seq);
363 }
364 else
365 {cerr<<endl<<"Error: no sequences found in file "<<infile<<"\n"; exit(1);}
366
367 N_in = k+1;
368
369 // Set name, longname, fam
370 if (!*name) // longname, name and family were not set by '#...' line yet -> extract from first sequence
371 {
372 char* ptr;
373 // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst]
374 strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence
375 longname[DESCLEN-1]='\0';
376 strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname...
377 name[NAMELEN-1]='\0';
378 ptr = strcut(name); // ...until first white-space character
379 if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word?
380 {
381 lwrstr(name); // Transform upper case to lower case
382 strcut(ptr); // Non-white-space characters until next white-space character..
383 strcpy(fam,ptr); // ...are the SCOP familiy code
384 }
385 else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code
386 {
387 strcpy(fam,name); // set family name = Pfam code
388 }
389 }
390
391
392
393 delete[] cur_seq; cur_seq = NULL;
394
395 // Checking for warning messages
396 if (v==0) return;
397 if (v>=2) cout<<"Read "<<infile<<" with "<<N_in<<" sequences\n";
398 if (v>=3) cout<<"Query sequence for alignment has number "<<kfirst<<" (0 is first)\n";
399 return;
400 }
401
402 /*
403 * At this point GetSeqsFromHMM() slots in, however,
404 * only needed in hhbliys.C, so will skip it for moment, MR1
405 */
406
407
408 /////////////////////////////////////////////////////////////////////////////
409 /**
410 * @brief Convert ASCII in seq[k][l] to int (0-20) in X[k][i],
411 * throw out all insert states, record their number in I[k][i]
412 * and store sequences to be displayed in seq[k] */
413 /////////////////////////////////////////////////////////////////////////////
414 void
Compress(const char infile[])415 Alignment::Compress(const char infile[])
416 {
417 int i; // Index for match state (first=1)
418 int l; // Postion in alignment incl. gaps (first=1)
419 int k; // Index for sequences (first=0)
420 int a; // amino acid index
421 char c;
422 int unequal_lengths=0; /* k: seq k doesn't have same number
423 of match states as seq 0 => WARNING */
424 /* points to next character in seq[k] to be written */
425 /*static short unsigned int h[MAXSEQ];*/
426 /*short*/ unsigned int *h = NULL; /* short may lead to overflow for long alignments, FS, r235 -> r236 */
427
428 h = new /*short*/ unsigned int[N_in+2]; /* short -> overflow, FS, r235 -> r236 */
429 float *percent_gaps = NULL; /* FS, 2010-Nov */
430 char *match_state = NULL; /* FS, 2010-Nov */
431
432
433 // Initialize
434 for (k=0;k<N_in; k++)
435 {I[k][0]=0;}
436
437 if (v>=3)
438 {
439 if (par.M==1)
440 cout<<"Using match state assignment by capital letters (a2m format)\n";
441 else if (par.M==2) cout<<"Using percentage-rule match state assignment\n";
442 else if (par.M==3) cout<<"Using residues of first sequence as match states\n";
443 }
444
445 // Create matrices X and I with amino acids represented by integer numbers
446 switch(par.M)
447 {
448
449 /////////////////////////////////////////////////////////////////////////
450 /* a2m/a3m format: match states capital case,
451 inserts lower case, delete states '-', inserted gaps '.'
452 The confidence values for ss prediction are interpreted as follows:
453 0-9:match states(!) '-' :match state '.':insert */
454 case 1:
455 default:
456
457 // Warn if alignment is ment to be -M first or -M NN instead of A2M/A3M
458 if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ...
459 {
460 for (k=1; k<N_in; k++)
461 if (strpbrk(seq[k],"abcdefghiklmnpqrstuvwxyz.")) break;
462 if (k==N_in) // ... but alignment contains no lower case residue
463 printf("WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile);
464 }
465
466 // Remove '.' characters from seq[k]
467 for(k=0; k<N_in; k++)
468 {
469 char* ptrS=seq[k]; // pointer to source: character in seq[k]
470 char* ptrD=seq[k]; // pointer to destination: seq[k]
471 while(1) // omit '.' symbols
472 {
473 if (*ptrS!='.') {*ptrD=*ptrS; ptrD++;} //leave out '.' symbols
474 if (!*ptrS) break;
475 ptrS++;
476 }
477 }
478 L=/*MAXRES*/par.maxResLen-2; // needed because L=imin(L,i)
479 for (k=0; k<N_in; k++)
480 {
481 i=1; l=1; // start at i=1, not i=0!
482 if (keep[k]) //skip >ss_dssp, >ss_pred, >ss_conf, >aa_... sequences
483 {
484 while((c=seq[k][l++])) // assign residue to c at same time
485 {
486 if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character
487 else if (c!='.') //match state = upper case character
488 {
489 X[k][i]=aa2i(c);
490 I[k][i]=0;
491 i++;
492 }
493 }
494 }
495 else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states?
496 {
497 while((c=seq[k][l++])) // assign residue to c at same time
498 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character
499 }
500 else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values?
501 {
502 while((c=seq[k][l++])) // assign residue to c at same time
503 if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character
504 }
505 else if (k==kss_conf) // does alignment contain sequence of prediction confidence values?
506 {
507 while((c=seq[k][l++])) // assign residue to c at same time
508 if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-'
509 }
510 else if (k==kfirst) // does alignment contain sequence of prediction confidence values?
511 {
512 while((c=seq[k][l++])) // assign residue to c at same time
513 if (c!='.')
514 {
515 X[k][i]=aa2i(c);
516 I[k][i]=0;
517 ++i;
518 }
519 }
520 else continue;
521 i--;
522 if (L!=i && L!=/*MAXRES*/par.maxResLen-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths
523 L=imin(L,i);
524 } // end for (k)
525 if (unequal_lengths) break;
526
527 //Replace GAP with ENDGAP for all end gaps /* MR1 */
528 for (k=0; k<N_in; ++k)
529 {
530 if (!keep[k]) continue;
531 for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
532 for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
533 }
534
535 for (i=1; i<=L; i++) this->l[i]=i; //assign column indices to match states
536 if (L<=0)
537 {
538 cout<<"\nError: Alignment in "<<infile<<" contains no match states. Consider using -M first or -M <int> option"<<endl;
539 exit(1);
540 }
541
542 if (L==/*MAXRES*/par.maxResLen-2 && v>=2)
543 {
544 printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L);
545 break;
546 }
547 if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" match states\n";
548 break;
549
550 /////////////////////////////////////////////////////////////////////////
551 // gap-rule assignment of match states
552 case 2:
553 int nl[NAA+2]; //nl[a] = number of seq's with amino acid a at position l
554 /* Note: allocating statically is fine most of the time
555 but when the sequences/profiles get really long
556 we might run out of memory, so must really do it dynamically.
557 had to move declaration of float *percent_gaps out of switch()
558 */
559 //float percent_gaps[MAXCOL]; //percentage of gaps in column k (with weighted sequences)
560 percent_gaps = new float[par.maxColCnt];
561
562 //determine number of columns L in alignment
563 L=strlen(seq[kfirst])-1;
564
565 // Conversion to integer representation, checking for unequal lengths and initialization
566 if (nres==NULL) nres=new int[N_in];
567 #if 0
568 //#ifdef HAVE_OPENMP
569 #pragma omp parallel for schedule(static), private(l)
570 #endif
571 for (k=0; k<N_in; k++)
572 {
573 if (!keep[k]) continue;
574 int nr=0;
575 wg[k]=0; nres[k]=0;
576 for (l=1; l<=L; l++)
577 {
578 X[k][l]=aa2i(seq[k][l]);
579 if (X[k][l]<NAA) nr++;
580 }
581 nres[k]=nr;
582 if (seq[k][L+1]!='\0' && !unequal_lengths) unequal_lengths=k;
583 }
584 if (unequal_lengths) break;
585
586 // Quick and dirty calculation of the weight per sequence wg[k]
587 for (l=1; l<=L; l++) // for all positions l in alignment
588 {
589 int naa=0; //number of different amino acids
590 for (a=0; a<20; a++) nl[a]=0;
591 for (k=0; k<N_in; k++) if (keep[k]) nl[ (int)X[k][l]]++;
592 for (a=0; a<20; a++) if(nl[a]) naa++;
593 if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
594 #if 0
595 //#ifdef HAVE_OPENMP
596 #pragma omp parallel for schedule(static)
597 #endif
598 for (k=0; k<N_in; k++)
599 if (keep[k] && (X[k][l]<20) )
600 {
601 //wg[k]+=1.0/float(nl[ (int)X[k][l]]*naa*nres[k]+30.0); /* original version */
602 wg[k] += 1.0/float(nl[ (int)X[k][l]]*naa*(nres[k]+30.0)); /* MR1 */
603 // wg[k] += 1.0/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
604 // wg[k] += (naa-1.0)/float(nl[ (int)X[k][l]]*(nres[k]+30.0)); /* MR1 commented out */
605 }
606 } /* 1=l<=L*/
607
608 //Replace GAP with ENDGAP for all end gaps
609 for (k=0; k<N_in; ++k)
610 {
611 if (!keep[k]) continue;
612 for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1: NOTE i++ <- ++i */
613 for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
614 }
615
616 // Add up percentage of gaps
617 #if 0
618 //#ifdef HAVE_OPENMP
619 #pragma omp parallel for schedule(static) private(k)
620 #endif
621 for (l=1; l<=L; l++)
622 {
623 float res=0;
624 float gap=0;
625 for (k=0; k< N_in; k++){
626 if (keep[k]){
627 if ( X[k][l]<GAP) res+=wg[k]; /* MR1, AA or ANY, changed from <ANY */
628 else if ( X[k][l] != ENDGAP) gap+=wg[k]; /* MR1, else: GAP. ENDGAPs are ignored for counting percentage */
629 }
630 }
631 percent_gaps[l]=100.*gap/(res+gap);
632 if (v>=4) cout<<"percent gaps["<<l<<"]="<<percent_gaps[l]<<" first seq:"<<seq[0][l]<<"\n";
633 }
634
635 /* Insert states 'bloat' the HMM,
636 throwing them out 'slims' down the HMM.
637 A slimmer HMM takes less time to construct.
638 However, the marriage of Clustal and Hhalign
639 is particularly sensitive to residues
640 at the very end of the profile; these I call
641 'telomeres'. Telomeres must not be shed when
642 throwing out insert states, for the telomeres
643 we set the match threshold to 100%.
644 */
645 #define MGAP_LOGIC 0
646 #define TELOMERE_LOGIC 1
647 #define TELOMERE_DYNAMIC 0
648
649 #define ALWAYS_ACCEPT 101.0 /* do NOT change this parameter, must be >=100,
650 make slightly bigger than 100% -- to be sure to be sure */
651 #define DEFAULT_MGAPS 100.0 /* Soeding's default is 50, omega default prior to telomere logic was 100
652 FIXME: this used to be par.Mgaps,
653 in a later version re-introduce par.Mgaps to keep this value flexible */
654 #define TELOMER_LENGTH 10 /* this parameter must be > 0 (unless DEFAULT_MGAPS=100),
655 if it is too big (L/2) then telomere logic has no effect,
656 don't think it should be changed (much) */
657 #define TELOMER_FRACTION 0.10
658 //#define HMM_MIN_LENGTH 0.923
659 #define HMM_MIN_LENGTH 0.950
660 #define FORTRAN_OFFSET 1
661 double dDefaultMgaps;
662 dDefaultMgaps = DEFAULT_MGAPS;
663
664 #if TELOMERE_LOGIC /* turn telomere logic on (1) or off (0) */
665 int iTelomereLength;
666
667 #if TELOMERE_DYNAMIC /* keep telomere length 'dynamic' */
668 iTelomereLength = TELOMER_LENGTH > (int)(L*TELOMER_FRACTION) ? TELOMER_LENGTH : (int)(L*TELOMER_FRACTION);
669 #else
670 iTelomereLength = TELOMER_LENGTH;
671 #endif /* this was dynamic telomere */
672 #endif /* this was telomere logic */
673
674 /* if HMMs get too small (much smaller than profile length L)
675 then one is liable to get a back-tracking error.
676 So we should ensure that the DEFAULT_MGAPS parameter does NOT
677 shrink the HMM too much.
678 take percentage-gap vector, sort it, and fix dDefaultMgaps,
679 such that at least (HMM_MIN_LENGTH)*(L) are left
680 */
681 #if MGAP_LOGIC /* try to adapt Mgaps to size of final HMM */
682 {
683 float *pfPercentGaps = NULL;
684 if (NULL == (pfPercentGaps = (float *)malloc((L+1)*sizeof(float)))){
685 printf("%s:%s:%d: could not malloc %d float for sorted percent-gaps\n",
686 __FUNCTION__, __FILE__, __LINE__, L+1);
687 dDefaultMgaps = DEFAULT_MGAPS;
688 }
689 else {
690 for (l = 0; l < L; l++) {
691 pfPercentGaps[l] = percent_gaps[l+FORTRAN_OFFSET];
692 }
693 qsort(pfPercentGaps, L, sizeof(float), CompFltAsc);
694
695 dDefaultMgaps = pfPercentGaps[(int)(HMM_MIN_LENGTH*L)];
696 if (dDefaultMgaps < DEFAULT_MGAPS){
697 //printf("Mgaps = %f <- %f\n", DEFAULT_MGAPS, dDefaultMgaps);
698 dDefaultMgaps = DEFAULT_MGAPS;
699 }
700 else {
701 //printf("Mgaps = %f\n", dDefaultMgaps);
702 }
703
704 free(pfPercentGaps); pfPercentGaps = NULL;
705 }
706 }
707 #endif /* tried to adapt Mgaps to size of final HMM */
708
709 // Throw out insert states and keep only match states
710 i=0;
711 for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
712 for (l=1; l<=L; l++)
713 {
714 #if TELOMERE_LOGIC
715 float fMgaps = ALWAYS_ACCEPT;
716 if ( (l < iTelomereLength) || (L-l < iTelomereLength) ){
717 /* residue is in telomere, always retain this position */
718 fMgaps = ALWAYS_ACCEPT;
719 }
720 else if (0){
721 /* FIXME: would like to put a transition phase in here,
722 where the Mgap value gradually goes down from 100 to DEFAULT_MGAPS,
723 however, may not be necessary and will make code more clunky */
724 }
725 else {
726 /* position is in centre of sequence,
727 retain position if less than DEFAULT_MGAPS% gaps at this position,
728 for example, if DEFAULT_MGAPS=30 throw out if more than 30% gap.
729 conversely, if DEFAULT_MGAPS=100 throw out if more than 100% gaps,
730 which can never happen, so always retain */
731 fMgaps = dDefaultMgaps;
732 }
733 if (percent_gaps[l] <= fMgaps)
734 #else /* this was telomere logic */
735 if (percent_gaps[l]<=float(par.Mgaps))
736 #endif /* this was Soeding default */
737 {
738 if (i>=/*MAXRES*/par.maxResLen-2) {
739 if (v>=1)
740 printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
741 break;
742 }
743 i++;
744 this->l[i]=l;
745 #if 0
746 //#ifdef HAVE_OPENMP
747 #pragma omp parallel for schedule(static)
748 #endif
749 for (k=0; k<N_in; k++)
750 {
751 if (keep[k])
752 {
753 seq[k][h[k]++]=MatchChr(seq[k][l]);
754 X[k][i]=X[k][l];
755 I[k][i]=0;
756 }
757 else if (k==kss_dssp || k==kss_pred)
758 {
759 seq[k][h[k]++]=MatchChr(seq[k][l]);
760 X[k][i]=ss2i(seq[k][l]);
761 }
762 else if (k==ksa_dssp)
763 {
764 seq[k][h[k]++]=MatchChr(seq[k][l]);
765 X[k][i]=sa2i(seq[k][l]);
766 }
767 else if (k==kss_conf)
768 {
769 seq[k][h[k]++]=seq[k][l];
770 X[k][i]=cf2i(seq[k][l]);
771 }
772 }
773 }
774 else
775 {
776 for (k=0; k<N_in; k++)
777 if (keep[k] && X[k][l]<GAP)
778 {
779 I[k][i]++;
780 seq[k][h[k]++]=InsertChr(seq[k][l]);
781 }
782 }
783 }
784 for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
785
786 //printf("%d\t%d\t%d\tN/L/M\n", N_in, L, i); /* -------- FIXME */
787
788 if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
789 L = i; //Number of match states
790
791 delete[] percent_gaps; percent_gaps = NULL;
792 break;
793
794
795 ////////////////////////////////////////////////////////////////////////
796 // Using residues of first sequence as match states
797 case 3:
798 /* Note: allocating statically is fine most of the time
799 but when the sequences/profiles get really long
800 we might run out of memory, so must really do it dynamically.
801 had to move declaration of float *percent_gaps out of switch()
802 */
803 //char match_state[MAXCOL]; //1: column assigned to match state 0: insert state
804 match_state = new char[par.maxColCnt];
805
806 // Determine number of columns L in alignment
807 L=strlen(seq[0]+1);
808 if (v>=3) printf("Length of first seq = %i\n",L);
809 // Check for sequences with unequal lengths
810 for (k=1; k<N_in; k++)
811 if (int(strlen(seq[k]+1))!=L) {unequal_lengths=k; break;}
812 if (unequal_lengths) break;
813
814 // Determine match states: seq kfirst has residue at pos l -> match state
815 for (l=1; l<=L; l++)
816 if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0;
817 // Throw out insert states and keep only match states
818 for (k=0; k<N_in; k++) {h[k]=1; seq[k][0]='-';}
819 i=0;
820 for (l=1; l<=L; l++)
821 {
822 if (match_state[l]) // does sequence 0 have residue at position l?
823 {
824 if (i>=/*MAXRES*/par.maxResLen-2) {
825 if (v>=1)
826 printf("WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i);
827 break;
828 }
829 i++;
830 this->l[i]=l;
831 for (k=0; k<N_in; k++)
832 {
833 if (keep[k])
834 {
835 seq[k][h[k]++]=MatchChr(seq[k][l]);
836 X[k][i]=aa2i(seq[k][l]);
837 I[k][i]=0;
838 }
839 else if (k==kss_dssp || k==kss_pred)
840 {
841 seq[k][h[k]++]=MatchChr(seq[k][l]);
842 X[k][i]=ss2i(seq[k][l]);
843 }
844 else if (k==ksa_dssp)
845 {
846 seq[k][h[k]++]=MatchChr(seq[k][l]);
847 X[k][i]=sa2i(seq[k][l]);
848 }
849 else if (k==kss_conf)
850 {
851 seq[k][h[k]++]=seq[k][l];
852 X[k][i]=cf2i(seq[k][l]);
853 }
854 }
855 }
856 else
857 {
858 for (k=0; k<N_in; k++)
859 if (keep[k] && aa2i(seq[k][l])<GAP)
860 {
861 I[k][i]++;
862 seq[k][h[k]++]=InsertChr(seq[k][l]);
863 }
864 }
865 }
866 for (k=0; k<N_in; k++) seq[k][h[k]]='\0';
867
868 //Replace GAP with ENDGAP for all end gaps /* MR1 */
869 for (k=0; k<N_in; ++k)
870 {
871 if (!keep[k]) continue;
872 for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP; /* MR1, note i++ <- ++i */
873 for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; /* MR1 */
874 }
875
876 if (v>=2) cout<<"Alignment in "<<infile<<" contains "<<L<<" columns and "<<i<<" match states\n";
877 L = i; //Number of match states
878
879 delete[] match_state; match_state = NULL;
880 break;
881
882 } //end switch()
883 ///////////////////////////////////////////////////////////////////////////
884
885
886 // Error
887 if (unequal_lengths)
888 {
889 strcut(sname[unequal_lengths]);
890 cerr<<endl<<"Error: sequences in "<<infile<<" do not all have the same number of columns, \ne.g. first sequence and sequence "<<sname[unequal_lengths]<<".\n";
891 if(par.M==1) cerr<<".\nCheck input format for '-M a2m' option and consider using '-M first' or '-M 50'\n";
892 exit(1);
893 }
894
895 // Avert user about -cons option?
896 if (v>=2 && !par.cons)
897 {
898 for (i=1; i<=L; i++)
899 if (X[kfirst][i]==GAP)
900 {
901 printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n");
902 break;
903 }
904 }
905 /* MR1
906 //Replace GAP with ENDGAP for all end gaps
907 for (k=0; k<N_in; k++)
908 {
909 if (!keep[k]) continue;
910 for (i=1; i<=L && X[k][i]==GAP; i++) X[k][i]=ENDGAP;
911 for (i=L; i>=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP;
912 }*/
913
914 // DEBUG
915 if (v>=4)
916 for (k=0; k<N_in; k++)
917 {
918 if (!display[k]) continue;
919 cout<<">"<<sname[k]<<"\n";
920 if (k==kss_dssp || k==kss_pred) {for (i=1; i<=L; i++) cout<<char(i2ss(X[k][i]));}
921 else if (k==kss_conf) {for (i=1; i<=L; i++) cout<<char(i2cf(X[k][i]));}
922 else if (k==ksa_dssp) {for (i=1; i<=L; i++) cout<<char(i2sa(X[k][i]));}
923 else
924 {
925 for (i=1; i<=L; i++) cout<<char(i2aa(X[k][i]));
926 cout<<"\n";
927 for (i=1; i<=L; i++)
928 if (I[k][i]==0) cout<<"-"; else if (I[k][i]>9) cout<<"X"; else cout<<I[k][i];
929 }
930 cout<<"\n";
931 }
932
933 delete[](h); h = NULL;
934 }
935
936
937 /**
938 * @brief Remove sequences with seq. identity larger than seqid percent
939 *(remove the shorter of two) or coverage<cov_thr
940 *
941 * FIXME: originally max_seqid is a variable that is the cutoff
942 * above which sequences are thrown out. We want to throw out sequences
943 * when building the HMM but not for display, there we want to keep all.
944 * This should be really easy, but there is some hidden stuff going on
945 * in this function, so I did a minimal-invasive change and just stuck
946 * (effectively) a hard-wired 100 instead of the variable.
947 * At a later stage we should get rid of this function alltogether
948 * as it does gobble up some time (and is quadratic in noof sequences, I think)
949 * FS, 2010-10-04
950 */
951 ////////////////////////////////////////////////////////////////////////////
952 /*
953 */
954 inline int
FilterForDisplay(int max_seqid,int coverage,int qid,float qsc,int N)955 Alignment::FilterForDisplay(int max_seqid, int coverage, int qid, float qsc, int N)
956 {
957
958 /* FIXME
959 * by just returning n_display and not doing anything
960 * I think we display everything and not do any work for it
961 */
962 return n_display; /* FS, 2010-10-04*/
963
964
965 if (par.mark) return n_display;
966 char *dummy = new char[N_in+1];
967 int vtmp=v, seqid;
968 v=0;
969 n_display=0;
970 if (kss_dssp>=0) display[kss_dssp]=KEEP_NOT;
971 if (ksa_dssp>=0) display[ksa_dssp]=KEEP_NOT;
972 if (kss_pred>=0) display[kss_pred]=KEEP_NOT;
973 if (kss_conf>=0) display[kss_conf]=KEEP_NOT;
974 for (seqid=imin(10,max_seqid); n_display<N && seqid<=max_seqid; seqid++)
975 {
976 for (int k=0; k<N_in; k++) dummy[k]=display[k];
977 n_display = Filter2(dummy,coverage,qid,qsc,20,seqid,0);
978 // printf("Seqid=%3i n_display=%4i\n",seqid,n_display);
979 }
980 if (n_display>N)
981 {
982 for (int k=0; k<N_in; k++) dummy[k]=display[k];
983 n_display = Filter2(dummy,coverage,qid,qsc,20,--(--seqid),0);
984 }
985 v=vtmp;
986 for (int k=0; k<N_in; k++) display[k]=dummy[k];
987 if (kss_dssp>=0) {display[kss_dssp]=KEEP_CONDITIONALLY; n_display++;}
988 if (ksa_dssp>=0) {display[ksa_dssp]=KEEP_CONDITIONALLY; n_display++;}
989 if (kss_pred>=0) {display[kss_pred]=KEEP_CONDITIONALLY; n_display++;}
990 if (kss_conf>=0) {display[kss_conf]=KEEP_CONDITIONALLY; n_display++;}
991 delete[] dummy; dummy = NULL;
992 return n_display;
993 }
994
995 /////////////////////////////////////////////////////////////////////////////////////
996 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage<cov_thr
997 /////////////////////////////////////////////////////////////////////////////////////
Filter(int max_seqid,int coverage,int qid,float qsc,int N)998 inline int Alignment::Filter(int max_seqid, int coverage, int qid, float qsc, int N)
999 {
1000 return Filter2(keep,coverage,qid,qsc,20,max_seqid,N);
1001 }
1002
1003 /////////////////////////////////////////////////////////////////////////////
1004 /*
1005 * @brief Select set of representative sequences in the multiple sequence alignment
1006 *
1007 * Filter criteria:
1008 * Remove sequences with coverage of query less than "coverage" percent
1009 * Remove sequences with sequence identity to query of less than "qid" percent
1010 * If Ndiff==0, remove sequences with seq. identity larger than seqid2(=max_seqid) percent
1011 * If Ndiff>0, remove sequences with minimum-sequence-identity filter of between seqid1
1012 * and seqid2 (%), where the minimum seqid threshold is determined such that,
1013 * in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left.
1014 * This ensures that in multi-domain proteins sequences covering one domain are not
1015 * removed completely because sequences covering other domains are more diverse.
1016 *
1017 * Allways the shorter of two compared sequences is removed (=> sort sequences by length first).
1018 * Please note: sequence identity of sequence x with y when filtering x is calculated as
1019 * number of residues in sequence x that are identical to an aligned residue in y / number of residues in x
1020 * Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its
1021 * length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x.
1022 */
1023 //////////////////////////////////////////////////////////////////////////////
1024 int
Filter2(char keep[],int coverage,int qid,float qsc,int seqid1,int seqid2,int Ndiff)1025 Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, int seqid2, int Ndiff)
1026 {
1027 // In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...)
1028 // In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others
1029 char* in=new char[N_in+1]; // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid
1030 char* inkk=new char[N_in+1]; // inkk[k]=1 iff in[ksort[k]]=1 else 0;
1031 int* Nmax=new int[L+2]; // position-dependent maximum-sequence-identity threshold for filtering /* MR1, used to be called idmax*/
1032 int* idmaxwin=new int[L+2]; // minimum value of idmax[i-WFIL,i+WFIL]
1033 int* seqid_prev=new int[N_in+1]; // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid)
1034 int* N=new int[L+2]; // N[i] number of already accepted sequences at position i
1035 const int WFIL=25; // see previous line
1036
1037 int diffNmax=Ndiff; // current maximum difference of Nmax[i] and Ndiff /* MR1 */
1038 int diffNmax_prev=0; // previous maximum difference of Nmax[i] and Ndiff /* MR1 */
1039
1040 int seqid; // current maximum value for the position-dependent maximum-sequence-identity thresholds in idmax[]
1041 int seqid_step=0; // previous increment of seqid /* MR1 */
1042
1043 float diff_min_frac; // minimum fraction of differing positions between sequence j and k needed to accept sequence k
1044 float qdiff_max_frac=0.9999-0.01*qid; // maximum allowable number of residues different from query sequence
1045 int diff; // number of differing positions between sequences j and k (counted so far)
1046 int diff_suff; // number of differing positions between sequences j and k that would be sufficient
1047 int qdiff_max; // maximum number of residues required to be different from query
1048 int cov_kj; // upper limit of number of positions where both sequence k and j have a residue
1049 int first_kj; // first non-gap position in sequence j AND k
1050 int last_kj; // last non-gap position in sequence j AND k
1051 int kk, jj; // indices for sequence from 1 to N_in
1052 int k, j; // kk=ksort[k], jj=ksort[j]
1053 int i; // counts residues
1054 int n; // number of sequences accepted so far
1055
1056 // Initialize in[k]
1057 for (n=k=0; k<N_in; k++) if (keep[k]==KEEP_ALWAYS) {in[k]=2/*KEEP_ALWAYS??*/; n++;} else in[k]=0;
1058
1059 // Determine first[k], last[k]?
1060 if (first==NULL)
1061 {
1062 first=new int[N_in];// first non-gap position in sequence k
1063 last =new int[N_in];// last non-gap position in sequence k
1064 for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1065 {
1066 for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1067 first[k]=i;
1068 for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1069 last[k]=i;
1070 }
1071 }
1072
1073 // Determine number of residues nres[k]?
1074 // KB: memory leak as sizeof(nres) just gives the size of an int pointer
1075 //if ( (nres==NULL) || (sizeof(nres)<N_in*sizeof(int)) )
1076 if (nres==NULL)
1077 {
1078 nres=new int[N_in];
1079 for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1080 {
1081 int nr=0;
1082 for (i=first[k]; i<=last[k]; i++)
1083 if (X[k][i]<NAA) nr++;
1084 nres[k]=nr;
1085 // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,first[k],last[k]);
1086 }
1087 }
1088
1089 // Sort sequences according to length; afterwards, nres[ksort[kk]] is sorted by size
1090 if (ksort==NULL)
1091 {
1092 ksort=new int[N_in]; // never reuse alignment object for new alignment with more sequences
1093 for (k=0; k<N_in; k++) ksort[k]=k;
1094 QSortInt(nres,ksort,kfirst+1,N_in-1,-1); //Sort sequences after kfirst (query) in descending order
1095 }
1096 for (kk=0; kk<N_in; kk++) inkk[kk]=in[ksort[kk]];
1097
1098 // Initialize N[i], idmax[i], idprev[i]
1099 for (i=1; i<first[kfirst]; i++) N[i]=0;
1100 for (i=first[kfirst]; i<=last[kfirst]; i++) N[i]=1;
1101 for (i=last[kfirst]+1; i<=L; i++) N[i]=0;
1102 //for (i=1; i<=L; i++) {idmax[i]=seqid1; idmaxwin[i]=-1;}
1103 for (i=1; i<=L; ++i) {Nmax[i]=0; idmaxwin[i]=-1;} /* MR1 */
1104 for (k=0; k<N_in; k++) seqid_prev[k]=-1;
1105 if (Ndiff<=0 || Ndiff>=N_in) {seqid1=seqid2; Ndiff=N_in; diffNmax=Ndiff;}
1106
1107 // Check coverage and sim-to-query criteria for each sequence k
1108 for (k=0; k<N_in; k++)
1109 {
1110 if (keep[k]==KEEP_NOT || keep[k]==KEEP_ALWAYS) continue; // seq k not regular sequence OR is marked sequence
1111 if (100*nres[k]<coverage*L) {keep[k]=KEEP_NOT; continue;} // coverage too low? => reject once and for all
1112
1113 float qsc_sum=0.0;
1114
1115 // Check if score-per-column with query is at least qsc
1116 if (qsc>-10)
1117 {
1118 float qsc_min = qsc*nres[k]; // minimum total score of seq k with query
1119
1120 int gapq=0, gapk=0; // number of consecutive gaps in query or k'th sequence at position i
1121 for (int i=first[k]; i<=last[k]; i++)
1122 {
1123 if (X[k][i]<20)
1124 {
1125 gapk=0;
1126 if (X[kfirst][i]<20)
1127 {
1128 gapq=0;
1129 qsc_sum += S[(int)X[kfirst][i]][(int)X[k][i]];
1130 }
1131 else if (gapq++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1132 }
1133 else if (X[kfirst][i]<20)
1134 {
1135 gapq=0;
1136 if (gapk++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN;
1137 }
1138 }
1139 // printf("k=%3i qsc=%6.2f\n",k,qsc_sum);
1140 if (qsc_sum<qsc_min) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1141 }
1142
1143 //Check if sequence similarity with query at least qid?
1144 if (qdiff_max_frac<0.999)
1145 {
1146 qdiff_max=int(qdiff_max_frac*nres[k]+0.9999);
1147 // printf("k=%-4i nres=%-4i qdiff_max=%-4i first=%-4i last=%-4i",k,nres[k],qdiff_max,first[k],last[k]);
1148 diff=0;
1149 for (int i=first[k]; i<=last[k]; i++)
1150 // enough different residues to reject based on minimum qid with query? => break
1151 if (X[k][i]<20 && X[k][i]!=X[kfirst][i] && ++diff>=qdiff_max) break;
1152 // printf(" diff=%4i\n",diff);
1153 if (diff>=qdiff_max) {keep[k]=KEEP_NOT; continue;} // too different from query? => reject once and for all
1154 }
1155 // printf(" qsc=%6.2f qid=%6.2f \n",qsc_sum/nres[k],100.0*(1.0-(float)(diff)/nres[k]));
1156 } // end for (k)
1157
1158 if (seqid1>seqid2)
1159 {
1160 for (n=k=0; k<N_in; k++) if (keep[k]>KEEP_NOT) n++;
1161 return n;
1162 }
1163
1164 // Successively increment idmax[i] at positons where N[i]<Ndiff
1165 //for (seqid=seqid1; seqid<=seqid2; seqid+=1+(seqid>=50)) /* MR1 */
1166 seqid=seqid1;
1167 while (seqid<=seqid2)
1168 {
1169 /*
1170 // Update idmax[i]
1171 for (i=1; i<=L; i++) if (N[i]<Ndiff) idmax[i]=seqid;
1172
1173 // Update idmaxwin[i] as minimum of idmax[i-WFIL,i+WFIL]. If idmaxwin[] has not changed then stop
1174 char stop=1;
1175 for (i=1; i<=L; i++)
1176 {
1177 int idmax_min=seqid2;
1178 for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); j++)
1179 if (idmax[j]<idmax_min) idmax_min=idmax[j];
1180 if (idmax_min>idmaxwin[i]) stop=0; // idmaxwin[i] has changed => do not stop
1181 idmaxwin[i]=idmax_min;
1182 }
1183 */
1184 char stop=1;
1185 // Update Nmax[i]
1186 diffNmax_prev = diffNmax;
1187 diffNmax = 0;
1188 for (i=1; i<=L; ++i)
1189 {
1190 int max=0;
1191 for (j=imax(1,imin(L-2*WFIL+1,i-WFIL)); j<=imin(L,imax(2*WFIL,i+WFIL)); ++j)
1192 if (N[j]>max) max=N[j];
1193 if (Nmax[i]<max) Nmax[i]=max;
1194 if (Nmax[i]<Ndiff)
1195 {
1196 stop=0;
1197 idmaxwin[i]=seqid;
1198 if (diffNmax<Ndiff-Nmax[i]) diffNmax=Ndiff-Nmax[i];
1199 }
1200
1201 }
1202
1203 //printf("seqid=%3i diffNmax_prev= %-4i diffNmax= %-4i n=%-5i N_in-N_ss=%-5i\n",seqid,diffNmax_prev,diffNmax,n,N_in-N_ss);
1204
1205 if (stop) break;
1206
1207 // // DEBUG
1208 // printf("idmax ");
1209 // for (i=1; i<=L; i++) printf("%2i ",idmax[i]);
1210 // printf("\n");
1211 // printf("idmaxwin ");
1212 // for (i=1; i<=L; i++) printf("%2i ",idmaxwin[i]);
1213 // printf("\n");
1214 // printf("N[i] ");
1215 // for (i=1; i<=L; i++) printf("%2i ",N[i]);
1216 // printf("\n");
1217
1218 // Loop over all candidate sequences kk (-> k)
1219 #if 0
1220 //#ifdef HAVE_OPENMP
1221 #pragma omp parallel for schedule(static) private(k, i, diff_min_frac, jj, j, first_kj, last_kj, cov_kj, diff_suff, diff)
1222 #endif
1223 for (kk=0; kk<N_in; kk++)
1224 {
1225 if (inkk[kk]) continue; // seq k already accepted
1226 k=ksort[kk];
1227 if (!keep[k]) continue; // seq k is not regular aa sequence or already suppressed by coverage or qid criterion
1228 if (keep[k]==KEEP_ALWAYS) {inkk[kk]=2; continue;} // accept all marked sequences (no n++, since this has been done already)
1229
1230 // Calculate max-seq-id threshold seqidk for sequence k (as maximum over idmaxwin[i])
1231 // if (seqid>=100) {in[k]=inkk[kk]=1; n++; continue;}
1232 if (seqid>=100) {
1233 in[k]=inkk[kk]=1;
1234 #if 0
1235 //#ifdef HAVE_OPENMP
1236 #pragma omp atomic
1237 #endif
1238 n++;
1239 continue;
1240 }
1241 float seqidk=seqid1;
1242 for (i=first[k]; i<=last[k]; i++)
1243 if (idmaxwin[i]>seqidk) seqidk=idmaxwin[i];
1244 if (seqid==seqid_prev[k]) continue; // sequence has already been rejected at this seqid threshold => reject this time
1245 seqid_prev[k]=seqid;
1246 diff_min_frac =0.9999-0.01*seqidk; // min fraction of differing positions between sequence j and k needed to accept sequence k
1247
1248 // Loop over already accepted sequences
1249 for (jj=0; jj<kk; jj++)
1250 {
1251 if (!inkk[jj]) continue;
1252 j=ksort[jj];
1253 first_kj=imax(first[k],first[j]);
1254 last_kj =imin(last[k],last[j]);
1255 cov_kj = last_kj-first_kj+1;
1256 diff_suff=int(diff_min_frac*imin(nres[k],cov_kj)+0.999); // nres[j]>nres[k] anyway because of sorting /* MR1 0.999 */
1257 diff=0;
1258 for (int i=first_kj; i<=last_kj; i++)
1259 {
1260 // enough different residues to accept? => break
1261 if (X[k][i]>=NAA || X[j][i]>=NAA)
1262 cov_kj--;
1263 else
1264 if (X[k][i]!=X[j][i] && ++diff>=diff_suff) break; // accept (k,j)
1265 }
1266 // // DEBUG
1267 // printf("%20.20s with %20.20s: diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1268 // printf("%s\n%s\n\n",seq[k],seq[j]);
1269
1270 //if (float(diff)<fmin(diff_min_frac*cov_kj,diff_suff)) break; //similarity > acceptace threshold? Reject! /* MR1 */
1271 if (diff<diff_suff && float(diff)<=diff_min_frac*cov_kj) break; //dissimilarity < acceptace threshold? Reject! /* MR1 */
1272
1273
1274 }
1275 if (jj>=kk) // did loop reach end? => accept k. Otherwise reject k (the shorter of the two)
1276 {
1277 in[k]=inkk[kk]=1;
1278 #if 0
1279 //#ifdef HAVE_OPENMP
1280 #pragma omp atomic
1281 #endif
1282 n++;
1283 for (i=first[k]; i<=last[k]; i++) {
1284 #if 0
1285 //#ifdef HAVE_OPENMP
1286 #pragma omp atomic
1287 #endif
1288 N[i]++; // update number of sequences at position i
1289 }
1290 // printf("%i %20.20s accepted\n",k,sname[k]);
1291 }
1292 // else
1293 // {
1294 // printf("%20.20s rejected: too similar with seq %20.20s diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj);
1295 // printf("%s\n%s\n\n",seq[k],seq[j]);
1296 // }
1297
1298 } // End Loop over all candidate sequences kk
1299
1300 // // DEBUG
1301 // printf("\n");
1302 // printf("seqid_prev[k]= \n");
1303 // for (k=0; k<N_in; k++) printf("%2i ",seqid_prev[k]);
1304 // printf("\n");
1305
1306 // Increment seqid /* MR1 */
1307 seqid_step = imax(1,imin(5,diffNmax/(diffNmax_prev-diffNmax+1)*seqid_step/2));
1308 seqid += seqid_step;
1309
1310 } // End Loop over seqid
1311
1312 if (v>=2)
1313 {
1314 printf("%i out of %i sequences passed filter (",n,N_in-N_ss);
1315 if (par.coverage)
1316 printf("%i%% min coverage, ",coverage);
1317 if (qid)
1318 printf("%i%% min sequence identity to query, ",qid);
1319 if (qsc>-10)
1320 printf("%.2f bits min score per column to query, ",qsc);
1321 if (Ndiff<N_in && Ndiff>0)
1322 printf("up to %i%% position-dependent max pairwise sequence identity)\n",seqid);
1323 else
1324 printf("%i%% max pairwise sequence identity)\n",seqid1);
1325 }
1326
1327 for (k=0; k<N_in; k++) keep[k]=in[k];
1328 delete[] in; in = NULL;
1329 delete[] inkk; inkk = NULL;
1330 //delete[] idmax; idmax = NULL;
1331 delete[] Nmax; /* MR1 */
1332 delete[] idmaxwin; idmaxwin = NULL;
1333 delete[] seqid_prev; seqid_prev = NULL;
1334 delete[] N; N = NULL;
1335 #if 0
1336 printf("%s:%s:%d: sequences accepted = %d/%d\n", __FUNCTION__, __FILE__, __LINE__, n, N_in-N_ss);
1337 #endif
1338 return n;
1339 }
1340
1341
1342
1343 /* MR1: the Alignment::HomologyFilter is no longer needed in hhalign-stand-alone */
1344 /////////////////////////////////////////////////////////////////////////////
1345 /**
1346 * @brief Filter for min score per column coresc with core query profile,
1347 * defined by coverage_core and qsc_core
1348 */
1349 /////////////////////////////////////////////////////////////////////////////
1350 int
HomologyFilter(int coverage_core,float qsc_core,float coresc)1351 Alignment::HomologyFilter(int coverage_core, float qsc_core, float coresc)
1352 {
1353 const int seqid_core=90; //maximum sequence identity in core alignment
1354 const int qid_core=0;
1355 const int Ndiff_core=0;
1356 int n;
1357 HMM qcore;
1358 char* coreseq=new char[N_in]; // coreseq[k]=1 if sequence belongs to core of alignment (i.e. it is very similar to query)
1359 for (int k=0; k<N_in; k++) coreseq[k]=keep[k]; // Copy keep[] into coreseq[]
1360
1361 // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
1362 int v1=v; v=1;
1363 n = Filter2(coreseq,coverage_core,qid_core,qsc_core,seqid_core,seqid_core,Ndiff_core);
1364 v=v1;
1365 if (v>=2)
1366 {
1367 printf("%i out of %i core alignment sequences passed filter (",n,N_in-N_ss);
1368 if (par.coverage_core)
1369 printf("%i%% min coverage, ",coverage_core);
1370 if (qid_core)
1371 printf("%i%% min sequence identity to query, ",qid_core);
1372 if (qsc_core>-10)
1373 printf("%.2f bits min score per column to query, ",qsc_core);
1374 printf("%i%% max pairwise sequence identity)\n",seqid_core);
1375 }
1376
1377 // Calculate bare AA frequencies and transition probabilities -> qcore.f[i][a], qcore.tr[i][a]
1378 FrequenciesAndTransitions(qcore,coreseq);
1379
1380 // Add transition pseudocounts to query -> q.p[i][a] (gapd=1.0, gape=0.333, gapf=gapg=1.0, gaph=gapi=1.0, gapb=1.0
1381 qcore.AddTransitionPseudocounts(1.0,0.333,1.0,1.0,1.0,1.0,1.0);
1382
1383 // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
1384 qcore.PreparePseudocounts();
1385
1386 // Add amino acid pseudocounts to query: qcore.p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
1387 qcore.AddAminoAcidPseudocounts(2,1.5,2.0,1.0); // pcm=2, pca=1.0, pcb=2.5, pcc=1.0
1388
1389 // Filter out all sequences below min score per column with qcore
1390 n=FilterWithCoreHMM(keep, coresc, qcore);
1391
1392 if (v>=2) cout<<n<<" out of "<<N_in-N_ss<<" sequences filtered by minimum score-per-column threshold of "<<qsc_core<<"\n";
1393 delete[] coreseq; coreseq = NULL;
1394 return n;
1395 }
1396
1397
1398 /////////////////////////////////////////////////////////////////////////////////////
1399 /**
1400 * @brief Filter out all sequences below a minimum score per column with profile qcore
1401 */
1402 int
FilterWithCoreHMM(char in[],float coresc,HMM & qcore)1403 Alignment::FilterWithCoreHMM(char in[], float coresc, HMM& qcore)
1404 {
1405 int k; // count sequences in alignment
1406 int i; // column in query alignment
1407 int a; // amino acid (0..19)
1408 int n=1; // number of sequences that passed filter
1409 float** logodds=new float*[L+1]; // log-odds ratios for HMM qcore
1410 char gap; // 1: previous state in seq k was a gap 0: previous state in seq k was an amino acid
1411 float score; // score of sequence k aligned with qcore
1412
1413 for (i=1; i<=L; i++) logodds[i]=new(float[21]);
1414
1415 // Determine first[k], last[k]?
1416 if (first==NULL)
1417 {
1418 first=new int[N_in];// first non-gap position in sequence k
1419 last =new int[N_in];// last non-gap position in sequence k
1420 for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1421 {
1422 for (i=1; i<=L; i++) if (X[k][i]<NAA) break;
1423 first[k]=i;
1424 for (i=L; i>=1; i--) if (X[k][i]<NAA) break;
1425 last[k]=i;
1426 }
1427 }
1428
1429 // Determine number of residues nres[k]?
1430 if (nres==NULL)
1431 {
1432 nres=new int[N_in];
1433 for (k=0; k<N_in; k++) // do this for ALL sequences, not only those with in[k]==1 (since in[k] may be display[k])
1434 {
1435 int nr=0;
1436 for (i=first[k]; i<=last[k]; i++)
1437 if (X[k][i]<NAA) nr++;
1438 nres[k]=nr;
1439 // printf("%20.20s nres=%3i first=%3i last=%3i\n",sname[k],nr,f,l);
1440 }
1441 }
1442
1443 // Precalculate the log-odds for qcore
1444 for (i=1; i<=L; i++)
1445 {
1446 for (a=0; a<NAA; a++)
1447 logodds[i][a]=fast_log2(qcore.p[i][a]/pb[a]);
1448 logodds[i][ANY]=-0.5; // half a bit penalty for X
1449
1450 // printf(" A R N D C Q E G H I L K M F P S T W Y V\n");
1451 // printf("%6i ",i);
1452 // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.f[i][a]);
1453 // printf("\n");
1454 // printf(" ");
1455 // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.g[i][a]);
1456 // printf("\n");
1457 // printf(" ");
1458 // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*qcore.p[i][a]);
1459 // printf("\n");
1460 // printf(" ");
1461 // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*pb[a]);
1462 // printf("\n");
1463 // printf(" ");
1464 // for (a=0; a<20; ++a) fprintf(stdout,"%5.2f ",fast_log2(qcore.p[i][a]/pb[a]));
1465 // printf("\n");
1466 }
1467
1468 // Main loop: test all sequences k
1469 for (k=kfirst+1; k<N_in; k++)
1470 {
1471 if (!in[k]) continue; // if in[k]==0 sequence k will be suppressed directly
1472
1473 float score_M=0.0;
1474 float score_prev=0.0;
1475
1476 // Calculate score of sequence k with core HMM
1477 score=0; gap=0;
1478 for (i=first[k]; i<=last[k]; i++)
1479 {
1480 score_M=0.0;
1481 if (X[k][i]<=ANY) // current state is Match
1482 {
1483 score_M=logodds[i][ (int)X[k][i]];
1484 score+=logodds[i][ (int)X[k][i]];
1485 if (gap) score+=qcore.tr[i][D2M]; else score+=qcore.tr[i][M2M];
1486 gap=0;
1487 }
1488 else if (X[k][i]==GAP) // current state is Delete (ignore ENDGAPs)
1489 {
1490 if (gap) score+=qcore.tr[i][D2D]; else score+=qcore.tr[i][M2D];
1491 gap=1;
1492 }
1493 if (I[k][i]) score+=qcore.tr[i][M2I]+(I[k][i]-1)*qcore.tr[i][I2I]+qcore.tr[i][I2M];
1494 // if (k==2) printf("i=%3i %c:%c score_M=%6.2f score=%6.2f score_sum=%6.2f \n",i,i2aa(X[kfirst][i]),i2aa(X[k][i]),score_M,score-score_prev,score);
1495 score_prev=score;
1496 }
1497
1498 printf("k=%3i score=%6.2f\n",k,score);
1499 if (score<nres[k]*coresc) in[k]=0; else n++;// reject sequence k?
1500 }
1501 for (i=1; i<=L; i++){
1502 delete[] logodds[i]; logodds[i] = NULL;
1503 }
1504 delete[] logodds; logodds = NULL;
1505 return n;
1506 }
1507
1508
1509 /* MR1 */
1510 #if 0
1511 /////////////////////////////////////////////////////////////////////////////////////
1512 /**
1513 * @brief Filter alignment to given diversity/Neff
1514 */
1515 bool
1516 Alignment::FilterNeff()
1517 {
1518 int v1=v;
1519 v=v1-1;
1520 const float TOLX=0.001;
1521 const float TOLY=0.02;
1522 char dummy[N_in+1];
1523 for (int k=0; k<N_in; ++k) dummy[k]=keep[k];
1524 float x=0.0,y=0.0;
1525 float x0=-1.0;
1526 float x1=+2.0;
1527 float y0=filter_by_qsc(x0,dummy);
1528 float y1=filter_by_qsc(x1,dummy);
1529 int i=2;
1530 while (y0-par.Neff>0 && par.Neff-y1>0)
1531 {
1532 x = x0 + (par.Neff-y0)*(x1-x0)/(y1-y0); // linear interpolation between (x0,y0) and (x1,y1)
1533 y = filter_by_qsc(x,dummy);
1534 if (v>=2) printf(" %3i x0=%6.3f -> %6.3f x=%6.3f -> %6.3f x1=%6.3f -> %6.3f \n",++i,x0,y0,x,y,x1,y1);
1535 if (y>par.Neff) {x0=x; y0=y;} else {x1=x; y1=y;}
1536 if (fabs(par.Neff-y)<TOLY || x1-x0<TOLX) break;
1537 }
1538 v=v1;
1539
1540 if (y0>=par.Neff && y1<=par.Neff)
1541 {
1542 // Write filtered alignment WITH insert states (lower case) to alignment file
1543 if (v>=2) printf("Found Neff=%6.3f at filter threshold qsc=%6.3f\n",y,x);
1544 return true;
1545 }
1546 else if (v>=1)
1547 printf("Diversity of unfiltered alignment %.2f is below target diversity %.2f. No alignment written\n",y0,par.Neff);
1548
1549 return false;
1550 }
1551
1552 float Alignment::filter_by_qsc(float qsc, char* dummy)
1553 {
1554 HMM q;
1555 for (int k=0; k<N_in; ++k) keep[k]=dummy[k];
1556 Filter2(keep,par.coverage,0,qsc,par.max_seqid+1,par.max_seqid,0);
1557 FrequenciesAndTransitions(q);
1558 // printf("qsc=%4.1f N_filtered=%-3i Neff=%6.3f\n",qsc,n,q.Neff_HMM);
1559 return q.Neff_HMM;
1560 }
1561 #endif
1562
1563 /////////////////////////////////////////////////////////////////////////////////////
1564 /**
1565 * @brief Calculate AA frequencies q.p[i][a] and transition probabilities q.tr[i][a] from alignment
1566 */
1567 void
FrequenciesAndTransitions(HMM & q,char * in)1568 Alignment::FrequenciesAndTransitions(HMM& q, char* in)
1569 {
1570 int k; // index of sequence
1571 int i; // position in alignment
1572 int a; // amino acid (0..19)
1573 int ni[NAA+3]; // number of times amino acid a occurs at position i
1574 int naa; // number of different amino acids
1575
1576 if (v>=3)
1577 cout<<"Calculating position-dependent weights on subalignments\n";
1578
1579 if (in==NULL) in=keep; // what's this good for?
1580
1581 if (N_filtered>1)
1582 {
1583 for (k=0; k<N_in; k++) wg[k]=0.0; // initialized wg[k]
1584 // Calculate global weights
1585 for (i=1; i<=L; i++) // for all positions i in alignment
1586 {
1587 for (a=0; a<20; a++) ni[a]=0;
1588 for (k=0; k<N_in; k++) if (in[k]) ni[ (int)X[k][i]]++;
1589 naa=0; for (a=0; a<20; a++) if(ni[a]) naa++;
1590 if (!naa) naa=1; //naa=0 when column consists of only gaps and Xs (=ANY)
1591 for (k=0; k<N_in; k++)
1592 if (in[k] && X[k][i]<20)
1593 wg[k] += 1.0/float(ni[ (int)X[k][i]]*naa*(nres[k]+30));
1594 // ensure that each residue of a short sequence contributes as much as a residue of a long sequence:
1595 // contribution is proportional to one over sequence length nres[k] plus 30.
1596 }
1597 NormalizeTo1(wg,N_in);
1598
1599
1600 // Do pos-specific sequence weighting and calculate amino acid frequencies and transitions
1601 for (k=0; k<N_in; k++) X[k][0]=ENDGAP; // make sure that sequences ENTER subalignment j for j=1
1602 for (k=0; k<N_in; k++) X[k][L+1]=ENDGAP; // does it have an influence?
1603
1604 #if 0
1605 //#ifdef HAVE_OPENMP
1606 if(par.wg != 1)
1607 {
1608 #pragma omp parallel sections
1609 {
1610 #pragma omp section
1611 Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1612 #pragma omp section
1613 Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1614 #pragma omp section
1615 Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1616 }
1617 }
1618 else
1619 {
1620 #pragma omp parallel sections
1621 {
1622 #pragma omp section
1623 Amino_acid_frequencies_and_transitions_from_M_state(q,in); // use subalignments of seqs with residue in i
1624 #pragma omp section
1625 Transitions_from_I_state(q,in); // use subalignments of seqs with insert in i
1626 }
1627 Transitions_from_D_state(q,in); // use subalignments of seqs with delete in i. Must be last of these three calls if par.wg==1!
1628 }
1629 #else
1630 Amino_acid_frequencies_and_transitions_from_M_state(q,in);
1631 Transitions_from_I_state(q,in);
1632 Transitions_from_D_state(q,in);
1633 #endif
1634 }
1635 else // N_filtered==1
1636 {
1637 X[kfirst][0]=X[kfirst][L+1]=ANY; // (to avoid anallowed access within loop)
1638 q.Neff_HMM=1.0f;
1639 for (i=0; i<=L+1; i++) // for all positions i in alignment
1640 {
1641 q.Neff_M[i]=1.0f;
1642 q.Neff_I[i]=q.Neff_D[i]=0.0f;
1643 for (a=0; a<20; a++) q.f[i][a]=0.0;
1644 /* this is the crucial change that makes terminal-X work */
1645 //q.f[i][ (int)(X[kfirst][i]) ] = 1.0; /* MR1 */
1646 if (X[kfirst][i] < ANY) /* MR1 */
1647 q.f[i][(unsigned int) X[kfirst][i] ] = 1.0;
1648 else
1649 for (a=0; a<20; ++a) q.f[i][a]=pb[a];
1650 q.tr[i][M2M]=0;
1651 q.tr[i][M2I]=-100000.0;
1652 q.tr[i][M2D]=-100000.0;
1653 q.tr[i][I2M]=-100000.0;
1654 q.tr[i][I2I]=-100000.0;
1655 q.tr[i][D2M]=-100000.0;
1656 q.tr[i][D2D]=-100000.0;
1657 }
1658 q.tr[0][I2M]=0;
1659 q.tr[L][I2M]=0;
1660 q.tr[0][D2M]=0;
1661 q.Neff_M[0]=q.Neff_I[0]=q.Neff_D[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
1662 }
1663
1664 if (v>=3)
1665 {
1666 printf("\nMatches:\n");
1667 printf("col Neff nseqs\n");
1668 for (i=1; i<=imin(L,100); i++)
1669 printf("%3i %5.2f %3i\n",i,q.Neff_M[i],nseqs[i]);
1670
1671 printf("\nInserts:\n");
1672 printf("col Neff nseqs\n");
1673 for (i=1; i<=imin(L,100); i++)
1674 printf("%3i %5.2f %3i\n",i,q.Neff_I[i],nseqs[i]);
1675
1676 printf("\nDeletes:\n");
1677 printf("col Neff nseqs\n");
1678 for (i=1; i<=imin(L,100); i++)
1679 printf("%3i %5.2f %3i\n",i,q.Neff_D[i],nseqs[i]);
1680 }
1681
1682 // Copy column information into HMM q
1683 q.L=L;
1684 q.N_in=N_in;
1685 q.N_filtered=N_filtered;
1686 for (i=1; i<=L; i++) q.l[i]=l[i];
1687
1688 // Set names in HMM q
1689 if (strlen(q.name)==0) strcpy(q.name,name);
1690 if (strlen(q.longname)==0) strcpy(q.longname,longname);
1691 if (strlen(q.fam)==0) strcpy(q.fam,fam);
1692 ScopID(q.cl,q.fold,q.sfam,q.fam); // derive superfamily, fold and class code from family name
1693 strcpy(q.file,file); // Store basename of alignment file name in q.file
1694
1695 // Copy sequences to be displayed into HMM
1696 q.nss_dssp=q.nsa_dssp=q.nss_pred=q.nss_conf=q.nfirst=-1;
1697 int n=0;
1698 if (kss_dssp>=0) q.nss_dssp=n++; // copy dssp sequence?
1699 if (ksa_dssp>=0) q.nsa_dssp=n++; // copy dssp sequence?
1700 if (kss_pred>=0) q.nss_pred=n++; // copy psipred sequence?
1701 if (kss_conf>=0) q.nss_conf=n++; // copy confidence value sequence?
1702
1703 // Calculate consensus sequence?
1704 if (par.showcons || par.cons)
1705 {
1706 float maxw;
1707 int maxa;
1708 if (par.showcons)
1709 {
1710 // Reserve space for consensus/conservation sequence as Q-T alignment mark-up
1711 q.ncons=n++;
1712 q.sname[q.ncons]=new(char[10]);
1713 if (!q.sname[q.ncons]) {MemoryError("array of names for displayed sequences");}
1714 strcpy(q.sname[q.ncons],"Consensus");
1715 q.seq[q.ncons]=new char[L+2];
1716 if (!q.seq[q.ncons]) {MemoryError("array of names for displayed sequences");}
1717 }
1718 if (par.cons)
1719 {
1720 // Reserve space for consensus sequence as first sequence in alignment
1721 q.nfirst=n++; kfirst=-1;
1722 q.sname[q.nfirst]=new char[strlen(name)+11];
1723 if (!q.sname[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1724 strcpy(q.sname[q.nfirst],name);
1725 strcat(q.sname[q.nfirst],"_consensus");
1726 q.seq[q.nfirst]=new char[L+2];
1727 if (!q.seq[q.nfirst]) {MemoryError("array of names for displayed sequences");}
1728 }
1729 // Calculate consensus amino acids using similarity matrix
1730 for (i=1; i<=L; i++)
1731 {
1732 maxw=0.0; maxa=0;
1733 for (a=0; a<20; a++)
1734 if (q.f[i][a]-pb[a]>maxw) {maxw = q.f[i][a]-pb[a]; maxa = a;}
1735
1736 if (par.showcons)
1737 {
1738 maxw =0.0;
1739 for (int b=0; b<20; b++) maxw += q.f[i][b]*Sim[maxa][b]*Sim[maxa][b];
1740 maxw *= q.Neff_M[i]/(q.Neff_HMM+1); // columns with many gaps don't get consensus symbol
1741 if (maxw>0.6) q.seq[q.ncons][i] = uprchr(i2aa(maxa));
1742 else if (maxw>0.4) q.seq[q.ncons][i] = lwrchr(i2aa(maxa));
1743 else q.seq[q.ncons][i] = 'x';
1744 }
1745 if (par.cons) q.seq[q.nfirst][i] = uprchr(i2aa(maxa));
1746 }
1747 if (par.showcons)
1748 {
1749 q.seq[q.ncons][0]='-';
1750 q.seq[q.ncons][L+1]='\0';
1751 }
1752 if (par.cons)
1753 {
1754 q.seq[q.nfirst][0]='-';
1755 q.seq[q.nfirst][L+1]='\0';
1756 }
1757 }
1758
1759 // Copy sequences to be displayed from alignment to HMM
1760 for (k=0; k<N_in; k++)
1761 {
1762 int nn;
1763 if (display[k])
1764 {
1765 if (0 && (n>=MAXSEQDIS)) {
1766 /* FIXME: the test was if(n>=MAXSEQDIS),
1767 this test was necessary because alignment memory was static,
1768 now it should be dynamic, and should always have the right size,
1769 there are at least number-of-sequences plus a 'bit' more
1770 however, I do not know what that 'bit' is likely to be (in the future).
1771 at the moment it is 1 for the consnseus and 1 for structure,
1772 but this might change (FS)
1773 */
1774 if (par.mark) cerr<<"WARNING: maximum number "<<MAXSEQDIS<<" of sequences for display of alignment exceeded\n";
1775 break;
1776 }
1777 if (k==kss_dssp) nn=q.nss_dssp; // copy dssp sequence to nss_dssp
1778 else if (k==ksa_dssp) nn=q.nsa_dssp;
1779 else if (k==kss_pred) nn=q.nss_pred;
1780 else if (k==kss_conf) nn=q.nss_conf;
1781 else if (k==kfirst) nn=q.nfirst=n++;
1782 else nn=n++;
1783 // strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain
1784 q.sname[nn]=new char[strlen(sname[k])+1];
1785 if (!q.sname[nn]) {MemoryError("array of names for displayed sequences");}
1786 strcpy(q.sname[nn],sname[k]);
1787 q.seq[nn]=new char[strlen(seq[k])+1];
1788 if (!q.seq[nn]) {MemoryError("array of names for displayed sequences");}
1789 strcpy(q.seq[nn],seq[k]);
1790 }
1791 }
1792 q.n_display=n; // how many sequences to be displayed in alignments?
1793
1794 // Copy secondary structure information into HMM
1795 if (kss_dssp>=0)
1796 for (i=1; i<=L; i++) q.ss_dssp[i]=X[kss_dssp][i];
1797 if (ksa_dssp>=0)
1798 for (i=1; i<=L; i++) q.sa_dssp[i]=X[ksa_dssp][i];
1799 if (kss_pred>=0)
1800 {
1801 for (i=1; i<=L; i++) q.ss_pred[i]=X[kss_pred][i];
1802 if (kss_conf>=0)
1803 for (i=1; i<=L; i++) q.ss_conf[i]=X[kss_conf][i];
1804 else
1805 for (i=1; i<=L; i++) q.ss_conf[i]=5;
1806 }
1807
1808 q.lamda=0.0;
1809 q.mu=0.0;
1810
1811 // Debug: print occurence of amino acids for each position i
1812 if (v>=2) printf("Effective number of sequences exp(entropy) = %-4.1f\n",q.Neff_HMM); //PRINT
1813 if (v>=3)
1814 {
1815 cout<<"\nMatr: ";
1816 for (a=0; a<20; a++) printf("%4.1f ",100*pb[a]);
1817 cout<<"\nAmino acid frequencies without pseudocounts:\n";
1818 cout<<" A R N D C Q E G H I L K M F P S T W Y V\n";
1819 for (i=1; i<=L; i++)
1820 {
1821 printf("%3i: ",i);
1822 for (a=0; a<20; a++) printf("%4.0f ",100*q.f[i][a]);
1823 cout<<endl;
1824 }
1825 cout<<"\n";
1826
1827 printf("\nListing transition probabilities without pseudocounts:\n");
1828 printf(" i M->M M->I M->D I->M I->I D->M D->D Neff_M Neff_I Neff_D\n");
1829 for (i=0; i<=L; i++)
1830 {
1831 printf("%4i %6.3f %6.3f %6.3f ",i,pow(2.0,q.tr[i][M2M]),pow(2.0,q.tr[i][M2I]),pow(2.0,q.tr[i][M2D]));
1832 printf("%6.3f %6.3f ",pow(2.0,q.tr[i][I2M]),pow(2.0,q.tr[i][I2I]));
1833 printf("%6.3f %6.3f ",pow(2.0,q.tr[i][D2M]),pow(2.0,q.tr[i][D2D]));
1834 printf("%6.3f %6.3f %6.3f\n",q.Neff_M[i],q.Neff_I[i],q.Neff_D[i]);
1835 }
1836 }
1837 q.trans_lin=0;
1838 q.has_pseudocounts=false; /* MR1 */
1839
1840 return;
1841 }
1842
1843
1844 /////////////////////////////////////////////////////////////////////////////////////
1845 /*
1846 * FIXME: one of the most time consuming routines (according to gprof on r112)
1847 */
1848 /**
1849 * @brief Calculate freqs q.f[i][a] and transitions q.tr[i][a] (a=MM,MI,MD) with pos-specific subalignments
1850 * Pos-specific weights are calculated like in "GetPositionSpecificWeights()"
1851 */
1852 void
Amino_acid_frequencies_and_transitions_from_M_state(HMM & q,char * in)1853 Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM& q, char* in)
1854 {
1855 // Calculate position-dependent weights wi[k] for each i.
1856 // For calculation of weights in column i use sub-alignment
1857 // over sequences which have a *residue* in column i (no gap, no end gap)
1858 // and over columns where none of these sequences has an end gap.
1859 // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
1860 // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
1861 // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
1862 // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
1863 // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
1864 // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
1865
1866 int k; // index of sequence
1867 int i,j; // position in alignment
1868 int a; // amino acid (0..19)
1869 int naa; // number of different amino acids
1870 int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
1871 //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
1872 float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
1873 //float Neff[MAXRES]; // diversity of subalignment i
1874 float *Neff = new float[par.maxResLen]; // diversity of subalignment i
1875 int nseqi=0; // number of sequences in subalignment i
1876 int ncol=0; // number of columns j that contribute to Neff[i]
1877 char change; // has the set of sequences in subalignment changed? 0:no 1:yes
1878 float fj[NAA+3]; // to calculate entropy
1879 float sum;
1880
1881 wi = new float[N_in+2];
1882
1883 // Global weights?
1884 if (par.wg==1)
1885 for (k=0; k<N_in; k++)
1886 wi[k]=wg[k];
1887
1888 // Initialization
1889 q.Neff_HMM=0.0f;
1890 Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0]
1891 n = new int*[L+2];
1892 //#if 0
1893 #ifdef HAVE_OPENMP
1894 #pragma omp parallel for schedule(static) private(a)
1895 #endif
1896 for (j=1; j<=L; j++) {
1897 n[j]=new(int[NAA+3]);
1898 for (a=0; a<NAA+3; a++)
1899 n[j][a]=0;
1900 }
1901
1902
1903 //////////////////////////////////////////////////////////////////////////////////////////////
1904 // Main loop through alignment columns
1905 for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
1906 {
1907 if (par.wg==0)
1908 {
1909 change=0;
1910 // Check all sequences k and update n[j][a] and ri[j] if necessary
1911 for (k=0; k<N_in; k++)
1912 {
1913 if (!in[k]) continue;
1914 if (X[k][i-1]>=ANY && X[k][i]<ANY)
1915 { // ... if sequence k was NOT included in i-1 and has to be included for column i
1916 change=1;
1917 nseqi++;
1918 //#if 0
1919 #ifdef HAVE_OPENMP
1920 #pragma omp parallel for schedule(static)
1921 #endif
1922 for (int j=1; j<=L; j++)
1923 n[j][ (int)X[k][j]]++;
1924 }
1925 else if (X[k][i-1]<ANY && X[k][i]>=ANY)
1926 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
1927 change=1;
1928 nseqi--;
1929 //#if 0
1930 #ifdef HAVE_OPENMP
1931 #pragma omp parallel for schedule(static)
1932 #endif
1933 for (int j=1; j<=L; j++)
1934 n[j][ (int)X[k][j]]--;
1935 }
1936 } //end for (k)
1937 nseqs[i]=nseqi;
1938
1939 // If subalignment changed: update weights wi[k] and Neff[i]
1940 if (change)
1941 {
1942 // Initialize weights and numbers of residues for subalignment i
1943 ncol=0;
1944 for (k=0; k<N_in; k++)
1945 wi[k]=1E-8; // for pathological alignments all wi[k] can get 0; /* MR1 */
1946
1947 // sum wi[k] over all columns j and sequences k of subalignment
1948 //#if 0
1949 #ifdef HAVE_OPENMP
1950 #pragma omp parallel for schedule(static) private(naa, a, k)
1951 #endif
1952 for (j=1; j<=L; j++)
1953 {
1954 // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
1955 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1956 naa=0;
1957 for (a=0; a<20; a++)
1958 if(n[j][a])
1959 naa++;
1960 if (naa==0) continue;
1961 //#if 0
1962 #ifdef HAVE_OPENMP
1963 #pragma omp atomic
1964 #endif
1965 ncol++;
1966 for (k=0; k<N_in; k++)
1967 {
1968 if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
1969 {
1970 // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
1971 //#if 0
1972 #ifdef HAVE_OPENMP
1973 #pragma omp atomic
1974 #endif
1975 wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
1976 }
1977 }
1978 } // end for (j)
1979
1980 // Check whether number of columns in subalignment is sufficient
1981 if (ncol<NCOLMIN)
1982 // Take global weights
1983 for (k=0; k<N_in; k++)
1984 if(in[k] && X[k][i]<ANY)
1985 wi[k]=wg[k];
1986 else wi[k]=0.0;
1987
1988 // Calculate Neff[i]
1989 Neff[i]=0.0;
1990 //#if 0
1991 #ifdef HAVE_OPENMP
1992 #pragma omp parallel for schedule(static) private(a, k, fj)
1993 #endif
1994 for (j=1; j<=L; j++)
1995 {
1996 // do at least a fraction MAXENDGAPFRA of sequences in subalignment contain an end gap in j?
1997 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
1998 for (a=0; a<20; a++) fj[a]=0;
1999 for (k=0; k<N_in; k++)
2000 if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
2001 fj[ (int)X[k][j] ]+=wi[k];
2002 NormalizeTo1(fj,NAA);
2003 for (a=0; a<20; a++)
2004 if (fj[a]>1E-10)
2005 //#if 0
2006 #ifdef HAVE_OPENMP
2007 #pragma omp atomic
2008 #endif
2009 Neff[i]-=fj[a]*fast_log2(fj[a]);
2010 }
2011 if (ncol>0)
2012 Neff[i]=pow(2.0,Neff[i]/ncol);
2013 else
2014 Neff[i]=1.0;
2015 } // end change
2016
2017 else //no update was necessary; copy values for i-1
2018 {
2019 Neff[i]=Neff[i-1];
2020 }
2021 } // end par.wg==0
2022
2023
2024 // Calculate amino acid frequencies q.f[i][a] from weights wi[k]
2025 for (a=0; a<20; a++)
2026 q.f[i][a]=0;
2027 for (k=0; k<N_in; k++)
2028 if (in[k])
2029 q.f[i][ (int)X[k][i] ]+=wi[k];
2030 NormalizeTo1(q.f[i],NAA,pb);
2031
2032 // Calculate transition probabilities from M state
2033 q.tr[i][M2M]=q.tr[i][M2D]=q.tr[i][M2I]=0.0;
2034 for (k=0; k<N_in; k++) //for all sequences
2035 {
2036 if (!in[k]) continue;
2037 //if input alignment is local ignore transitions from and to end gaps
2038 if (X[k][i]<ANY) //current state is M
2039 {
2040 if (I[k][i]) //next state is I
2041 q.tr[i][M2I]+=wi[k];
2042 else if (X[k][i+1]<=ANY) //next state is M
2043 q.tr[i][M2M]+=wi[k];
2044 else if (X[k][i+1]==GAP) //next state is D
2045 q.tr[i][M2D]+=wi[k];
2046 }
2047 } // end for(k)
2048 // Normalize and take log
2049 sum = q.tr[i][M2M]+q.tr[i][M2I]+q.tr[i][M2D]+FLT_MIN;
2050 q.tr[i][M2M]=Log2(q.tr[i][M2M]/sum);
2051 q.tr[i][M2I]=Log2(q.tr[i][M2I]/sum);
2052 q.tr[i][M2D]=Log2(q.tr[i][M2D]/sum);
2053
2054 // for (k=0; k<N_in; k++) if (in[k]) w[k][i]=wi[k];
2055 } // end loop through alignment columns i
2056
2057 // DD TODO:fill in all the missing Neff values
2058
2059 //////////////////////////////////////////////////////////////////////////////////////////////
2060
2061 delete[](wi); wi=NULL;
2062 // delete n[][]
2063 for (j=1; j<=L; j++){
2064 delete[](n[j]);
2065 (n[j]) = NULL;
2066 }
2067 delete[](n); (n) = NULL;
2068
2069 q.tr[0][M2M]=0;
2070 q.tr[0][M2I]=-100000;
2071 q.tr[0][M2D]=-100000;
2072 q.tr[L][M2M]=0;
2073 q.tr[L][M2I]=-100000;
2074 q.tr[L][M2D]=-100000;
2075 q.Neff_M[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state
2076
2077 // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities
2078 for (a=0; a<20; a++)
2079 q.f[0][a]=q.f[L+1][a]=pb[a];
2080
2081 // Assign Neff_M[i] and calculate average over alignment, Neff_M[0]
2082 if (par.wg==1)
2083 {
2084 //#if 0
2085 #ifdef HAVE_OPENMP
2086 #pragma omp parallel for schedule(static) private(a)
2087 #endif
2088 for (i=1; i<=L; i++)
2089 {
2090 float sum=0.0f;
2091 for (a=0; a<20; a++)
2092 if (q.f[i][a]>1E-10)
2093 sum -= q.f[i][a]*fast_log2(q.f[i][a]);
2094 //#if 0
2095 #ifdef HAVE_OPENMP
2096 #pragma omp atomic
2097 #endif
2098 q.Neff_HMM+=pow(2.0,sum);
2099 }
2100 q.Neff_HMM/=L;
2101 float Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2102 float scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2103 //#if 0
2104 #ifdef HAVE_OPENMP
2105 #pragma omp parallel for schedule(static) private(k)
2106 #endif
2107 for (i=1; i<=L; i++)
2108 {
2109 float w_M=-1.0/N_filtered;
2110 for (k=0; k<N_in; k++)
2111 if (in[k] && X[k][i]<=ANY)
2112 w_M+=wg[k];
2113 if (w_M<0)
2114 q.Neff_M[i]=1.0;
2115 else
2116 q.Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M);
2117 // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q.Neff_HMM,Nlim,w_M,q.Neff_M[i]);
2118 }
2119 }
2120 else
2121 {
2122 for (i=1; i<=L; i++)
2123 {
2124 q.Neff_HMM+=Neff[i];
2125 q.Neff_M[i]=Neff[i];
2126 if (q.Neff_M[i] == 0) {
2127 q.Neff_M[i] = 1;
2128 } /* MR1 */
2129 }
2130 q.Neff_HMM/=L;
2131 } // end par.wg==1
2132
2133 delete[] Neff; Neff = NULL;
2134
2135 return;
2136
2137 } /* this is the end of Alignment::Amino_acid_frequencies_and_transitions_from_M_state() */
2138
2139
2140 /////////////////////////////////////////////////////////////////////////////////////
2141 /**
2142 * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2143 */
2144 void
Transitions_from_I_state(HMM & q,char * in)2145 Alignment::Transitions_from_I_state(HMM& q, char* in)
2146 {
2147 // Calculate position-dependent weights wi[k] for each i.
2148 // For calculation of weights in column i use sub-alignment
2149 // over sequences which have a INSERT in column i
2150 // and over columns where none of these sequences has an end gap.
2151 // This is done by calculating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2152 // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2153 // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2154 // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2155 // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2156 // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2157
2158 int k; // index of sequence
2159 int i,j; // position in alignment
2160 int a; // amino acid (0..19)
2161 int naa; // number of different amino acids
2162 int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2163 //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2164 float *wi = NULL; // weight of sequence k in column i, calculated from subalignment i
2165 //float Neff[MAXRES]; // diversity of subalignment i
2166 float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2167 int nseqi; // number of sequences in subalignment i
2168 int ncol; // number of columns j that contribute to Neff[i]
2169 float fj[NAA+3]; // to calculate entropy
2170 float sum;
2171 float Nlim=0.0; // only for global weights
2172 float scale=0.0; // only for global weights
2173
2174 wi = new float[N_in+2];
2175
2176 // Global weights?
2177 if (par.wg==1)
2178 {
2179 for (k=0; k<N_in; k++) wi[k]=wg[k];
2180 Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2181 scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos
2182 }
2183
2184 // Initialization
2185 n = new int*[L+2];
2186 for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2187
2188 //////////////////////////////////////////////////////////////////////////////////////////////
2189 // Main loop through alignment columns
2190 for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2191 {
2192 if (par.wg==0) // local weights?
2193 {
2194
2195 // Calculate n[j][a] and ri[j]
2196 nseqi=0;
2197 for (k=0; k<N_in; k++)
2198 {
2199 if (in[k] && I[k][i]>0)
2200 {
2201 if (nseqi==0) // Initialize only if inserts present! Otherwise O(L*L) even for single sequences!
2202 {
2203 // Initialization of n[j][a]
2204 for (j=1; j<=L; j++)
2205 for (a=0; a<NAA+3; a++) n[j][a]=0;
2206 }
2207 nseqi++;
2208 for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2209 }
2210 } //end for (k)
2211 nseqs[i]=nseqi;
2212
2213 // If there is no sequence in subalignment j ...
2214 if (nseqi==0)
2215 {
2216 ncol=0;
2217 Neff[i]=0.0; // effective number of sequence = 0!
2218 q.tr[i][I2M]=-100000;
2219 q.tr[i][I2I]=-100000;
2220 continue;
2221 }
2222
2223 // update weights wi[k] and Neff[i]
2224 // if (1)
2225 {
2226 // Initialize weights and numbers of residues for subalignment i
2227 ncol=0;
2228 for (k=0; k<N_in; k++) wi[k]=0.0;
2229
2230 // sum wi[k] over all columns j and sequences k of subalignment
2231 for (j=1; j<=L; j++)
2232 {
2233 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2234 naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2235 if (naa==0) continue;
2236 ncol++;
2237 for (k=0; k<N_in; k++)
2238 {
2239 if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2240 {
2241 if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Ii=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2242 wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2243 }
2244 }
2245 }
2246
2247 // Check whether number of columns in subalignment is sufficient
2248 if (ncol>=NCOLMIN)
2249 // Take global weights
2250 for (k=0; k<N_in; k++)
2251 if(in[k] && I[k][i]>0) wi[k]=wg[k]; else wi[k]=0.0;
2252
2253 // Calculate Neff[i]
2254 Neff[i]=0.0;
2255 for (j=1; j<=L; j++)
2256 {
2257 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2258 for (a=0; a<20; a++) fj[a]=0;
2259 for (k=0; k<N_in; k++)
2260 if (in[k] && I[k][i]>0 && X[k][j]<ANY)
2261 fj[ (int)X[k][j] ]+=wi[k];
2262 NormalizeTo1(fj,NAA);
2263 for (a=0; a<20; a++)
2264 if (fj[a]>1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]);
2265 }
2266 if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2267
2268 }
2269 // Calculate transition probabilities from I state
2270 q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2271 for (k=0; k<N_in; k++) //for all sequences
2272 {
2273 if (in[k] && I[k][i]>0) //current state is I
2274 {
2275 q.tr[i][I2M]+=wi[k];
2276 q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2277 }
2278 } // end for(k)
2279 }
2280
2281 else // fast global weights?
2282 {
2283 float w_I=-1.0/N_filtered;
2284 ncol=0;
2285 q.tr[i][I2M]=q.tr[i][I2I]=0.0;
2286 // Calculate amino acid frequencies fj[a] from weights wg[k]
2287 for (k=0; k<N_in; k++)
2288 if (in[k] && I[k][i]>0)
2289 {
2290 ncol++;
2291 w_I+=wg[k];
2292 q.tr[i][I2M]+=wi[k];
2293 q.tr[i][I2I]+=wi[k]*(I[k][i]-1);
2294 }
2295 if (ncol>0)
2296 {
2297 if (w_I<0) Neff[i]=1.0;
2298 else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_I);
2299 // fprintf(stderr,"I i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_I=%5.3f Neff_I=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_I,Neff[i]);
2300 }
2301 else
2302 {
2303 Neff[i]=0.0;
2304 q.tr[i][I2M]=-100000;
2305 q.tr[i][I2I]=-100000;
2306 continue;
2307 }
2308 }
2309
2310 // Normalize and take log
2311 sum = q.tr[i][I2M]+q.tr[i][I2I];
2312 q.tr[i][I2M]=Log2(q.tr[i][I2M]/sum);
2313 q.tr[i][I2I]=Log2(q.tr[i][I2I]/sum);
2314
2315 }
2316 // end loop through alignment columns i
2317 //////////////////////////////////////////////////////////////////////////////////////////////
2318
2319 delete[](wi); wi = NULL;
2320 // delete n[][]
2321 for (j=1; j<=L; j++){
2322 delete[](n[j]); (n[j]) = NULL;
2323 }
2324 delete[](n); (n) = NULL;
2325
2326 q.tr[0][I2M]=0;
2327 q.tr[0][I2I]=-100000;
2328 q.tr[L][I2M]=0;
2329 q.tr[L][I2I]=-100000;
2330 q.Neff_I[0]=99.999;
2331
2332 // Assign Neff_I[i]
2333 for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i] and Neff[i]
2334 q.Neff_I[i]=Neff[i];
2335
2336 delete[] Neff; Neff = NULL;
2337 return;
2338
2339 } /* this is the end of Alignment::Transitions_from_I_state() */
2340
2341
2342 /////////////////////////////////////////////////////////////////////////////////////
2343 /**
2344 * @brief Calculate transitions q.tr[i][a] (a=DM,DD) with pos-specific subalignments
2345 */
2346 void
Transitions_from_D_state(HMM & q,char * in)2347 Alignment::Transitions_from_D_state(HMM& q, char* in)
2348 {
2349 // Calculate position-dependent weights wi[k] for each i.
2350 // For calculation of weights in column i use sub-alignment
2351 // over sequences which have a DELETE in column i
2352 // and over columns where none of these sequences has an end gap.
2353 // This is done by updating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L.
2354 // n[j][a] = number of occurences of index a at column j of the subalignment,
2355 // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2356 // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2357 // then the old values wi[k], Neff[i-1], and ncol are used for the new position i.
2358 // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2359
2360 int k; // index of sequence
2361 int i,j; // position in alignment
2362 int a; // amino acid (0..19)
2363 int naa; // number of different amino acids
2364 int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2365 //float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i
2366 float *wi=NULL; // weight of sequence k in column i, calculated from subalignment i
2367 //float Neff[MAXRES]; // diversity of subalignment i
2368 float *Neff = new float[par.maxResLen]; // diversity of subalignment i
2369 int nseqi=0; // number of sequences in subalignment i (for DEBUGGING)
2370 int ncol=0; // number of columns j that contribute to Neff[i]
2371 char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2372 float fj[NAA+3]; // to calculate entropy
2373 float sum;
2374 float Nlim=0.0; // only for global weights
2375 float scale=0.0; // only for global weights
2376
2377 wi = new float[N_in+2]; /* FIXME: FS */
2378 // Global weights?
2379 if (par.wg==1)
2380 {
2381 for (k=0; k<N_in; k++)
2382 wi[k]=wg[k];
2383 Nlim=fmax(10.0,q.Neff_HMM+1.0); // limiting Neff
2384 scale=Log2((Nlim-q.Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with dels at specific pos
2385 }
2386
2387 // Initialization
2388 n = new int*[L+2];
2389 //#if 0
2390 #ifdef HAVE_OPENMP
2391 #pragma omp parallel for schedule(static) private(a)
2392 #endif
2393 for (j=1; j<=L; j++) {
2394 n[j]=new(int[NAA+3]);
2395 for (a=0; a<NAA+3; a++)
2396 n[j][a]=0;
2397 }
2398
2399 //////////////////////////////////////////////////////////////////////////////////////////////
2400 // Main loop through alignment columns
2401 for (i=1; i<=L; i++) // Calculate wi[k] at position i as well as Neff[i]
2402 {
2403 if (par.wg==0) // if local weights
2404 {
2405 change=0;
2406 // Check all sequences k and update n[j][a] and ri[j] if necessary
2407 for (k=0; k<N_in; k++)
2408 {
2409 if (!in[k]) continue;
2410 if (X[k][i-1]!=GAP && X[k][i]==GAP)
2411 { // ... if sequence k was NOT included in i-1 and has to be included for column i
2412 change=1;
2413 nseqi++;
2414 //#if 0
2415 #ifdef HAVE_OPENMP
2416 #pragma omp parallel for schedule(static)
2417 #endif
2418 for (int j=1; j<=L; j++)
2419 n[j][ (int)X[k][j]]++;
2420 }
2421 else if (X[k][i-1]==GAP && X[k][i]!=GAP)
2422 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2423 change=1;
2424 nseqi--;
2425 //#if 0
2426 #ifdef HAVE_OPENMP
2427 #pragma omp parallel for schedule(static)
2428 #endif
2429 for (int j=1; j<=L; j++)
2430 n[j][ (int)X[k][j]]--;
2431 }
2432 } //end for (k)
2433 nseqs[i]=nseqi;
2434
2435 // If there is no sequence in subalignment j ...
2436 if (nseqi==0)
2437 {
2438 ncol=0;
2439 Neff[i]=0.0; // effective number of sequences = 0!
2440 q.tr[i][D2M]=-100000;
2441 q.tr[i][D2D]=-100000;
2442 continue;
2443 }
2444
2445 // If subalignment changed: update weights wi[k] and Neff[i]
2446 if (change)
2447 {
2448 // Initialize weights and numbers of residues for subalignment i
2449 ncol=0;
2450 for (k=0; k<N_in; k++)
2451 wi[k]=0.0;
2452
2453 // sum wg[k][i] over all columns j and sequences k of subalignment
2454 //#if 0
2455 #ifdef HAVE_OPENMP
2456 #pragma omp parallel for schedule(static) private(naa, a, k)
2457 #endif
2458 for (j=1; j<=L; j++)
2459 {
2460 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2461 naa=0;
2462 for (a=0; a<20; a++)
2463 if(n[j][a])
2464 naa++;
2465 if (naa==0) continue;
2466 //#if 0
2467 #ifdef HAVE_OPENMP
2468 #pragma omp atomic
2469 #endif
2470 ncol++;
2471 for (k=0; k<N_in; k++)
2472 {
2473 if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2474 {
2475 if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Di=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2476 //#if 0
2477 #ifdef HAVE_OPENMP
2478 #pragma omp atomic
2479 #endif
2480 wi[k]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2481 }
2482 }
2483 } // end for (j)
2484 // Check whether number of columns in subalignment is sufficient
2485 if (ncol<NCOLMIN)
2486 // Take global weights
2487 for (k=0; k<N_in; k++)
2488 if(in[k] && X[k][i]==GAP)
2489 wi[k]=wg[k];
2490 else
2491 wi[k]=0.0;
2492
2493 // Calculate Neff[i]
2494 Neff[i]=0.0;
2495 //#if 0
2496 #ifdef HAVE_OPENMP
2497 #pragma omp parallel for schedule(static) private(a, k, fj)
2498 #endif
2499 for (j=1; j<=L; j++)
2500 {
2501 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2502 for (a=0; a<20; a++) fj[a]=0;
2503 for (k=0; k<N_in; k++)
2504 if (in[k] && X[k][i]==GAP && X[k][j]<ANY)
2505 fj[ (int)X[k][j] ]+=wi[k];
2506 NormalizeTo1(fj,NAA);
2507 for (a=0; a<20; a++)
2508 if (fj[a]>1E-10)
2509 //#if 0
2510 #ifdef HAVE_OPENMP
2511 #pragma omp atomic
2512 #endif
2513 Neff[i]-=fj[a]*fast_log2(fj[a]);
2514 }
2515 if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0;
2516
2517 }
2518
2519 else //no update was necessary; copy values for i-1
2520 {
2521 Neff[i]=Neff[i-1];
2522 }
2523
2524 // Calculate transition probabilities from D state
2525 q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2526 for (k=0; k<N_in; k++) //for all sequences
2527 {
2528 if (in[k] && X[k][i]==GAP) //current state is D
2529 {
2530 if (X[k][i+1]==GAP) //next state is D
2531 q.tr[i][D2D]+=wi[k];
2532 else if (X[k][i+1]<=ANY) //next state is M
2533 q.tr[i][D2M]+=wi[k];
2534 }
2535 } // end for(k)
2536 }
2537
2538 else // fast global weights?
2539 {
2540 float w_D=-1.0/N_filtered;
2541 ncol=0;
2542 q.tr[i][D2M]=q.tr[i][D2D]=0.0;
2543 // Calculate amino acid frequencies fj[a] from weights wg[k]
2544 for (k=0; k<N_in; k++) //for all sequences
2545 if (in[k] && X[k][i]==GAP) //current state is D
2546 {
2547 ncol++;
2548 w_D+=wg[k];
2549 if (X[k][i+1]==GAP) //next state is D
2550 q.tr[i][D2D]+=wi[k];
2551 else if (X[k][i+1]<=ANY) //next state is M
2552 q.tr[i][D2M]+=wi[k];
2553 }
2554 if (ncol>0)
2555 {
2556 if (w_D<0) Neff[i]=1.0;
2557 else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D);
2558 // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q.Neff_HMM,Nlim,w_D,Neff[i]);
2559 }
2560 else
2561 {
2562 Neff[i]=0.0; // effective number of sequences = 0!
2563 q.tr[i][D2M]=-100000;
2564 q.tr[i][D2D]=-100000;
2565 continue;
2566 }
2567 }
2568
2569 // Normalize and take log
2570 sum = q.tr[i][D2M]+q.tr[i][D2D];
2571 q.tr[i][D2M]=Log2(q.tr[i][D2M]/sum);
2572 q.tr[i][D2D]=Log2(q.tr[i][D2D]/sum);
2573
2574 }
2575 // end loop through alignment columns i
2576 //////////////////////////////////////////////////////////////////////////////////////////////
2577
2578 q.tr[0][D2M]=0;
2579 q.tr[0][D2D]=-100000;
2580 q.Neff_D[0]=99.999;
2581
2582 // Assign Neff_D[i]
2583 for (i=1; i<=L; i++) {
2584 q.Neff_D[i]=Neff[i];
2585 delete[](n[i]);
2586 (n[i]) = NULL;
2587 }
2588
2589 delete[](n); (n) = NULL;
2590 delete[](wi); wi = NULL;/* FIXME: FS */
2591
2592 delete[] Neff; Neff = NULL;
2593 return;
2594
2595 } /* this is the end of Alignment::Transitions_from_D_state() */
2596
2597
2598
2599 /////////////////////////////////////////////////////////////////////////////////////
2600 /**
2601 * @brief Write alignment without insert states (lower case) to alignment file?
2602 */
2603 void
WriteWithoutInsertsToFile(char * alnfile)2604 Alignment::WriteWithoutInsertsToFile(char* alnfile)
2605 {
2606 if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2607 FILE* alnf;
2608 if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2609 if (!alnf) OpenFileError(alnfile);
2610 // If alignment name is different from that of query: write name into commentary line
2611 if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2612 if (v>=2) cout<<"Writing alignment to "<<alnfile<<"\n";
2613 for (int k=0; k<N_in; k++)
2614 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display is obligatory (display[k]==2)
2615 {
2616 fprintf(alnf,">%s\n",sname[k]);
2617 for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2618 fprintf(alnf,"\n");
2619 }
2620 fclose(alnf);
2621 }
2622
2623 /////////////////////////////////////////////////////////////////////////////////////
2624 // Write stored,filtered sequences WITH insert states (lower case) to alignment file?
2625 /////////////////////////////////////////////////////////////////////////////////////
WriteToFile(char * alnfile,const char format[])2626 void Alignment::WriteToFile(char* alnfile, const char format[])
2627 {
2628 FILE* alnf;
2629 if (!par.append) alnf = fopen(alnfile,"w"); else alnf = fopen(alnfile,"a");
2630 if (!alnf) OpenFileError(alnfile);
2631 // If alignment name is different from that of query: write name into commentary line
2632 if (strncmp(longname,sname[kfirst],DESCLEN-1)) fprintf(alnf,"#%s\n",longname);
2633 if (!format || !strcmp(format,"a3m"))
2634 {
2635 if (v>=2) cout<<"Writing A3M alignment to "<<alnfile<<"\n";
2636 for (int k=0; k<N_in; k++)
2637 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2638 fprintf(alnf,">%s\n%s\n",sname[k],seq[k]+1);
2639 }
2640 else // PSI-BLAST format
2641 {
2642 if (v>=2) cout<<"Writing PSI-BLAST-formatted alignment to "<<alnfile<<"\n";
2643 for (int k=kfirst; k<N_in; k++) // skip sequences before kfirst!!
2644 if (keep[k] || display[k]==KEEP_ALWAYS) // print if either in profile (keep[k]>0) or display obligatory (display[k]==2)
2645 {
2646 strcut(sname[k]);
2647 fprintf(alnf,"%-20.20s ",sname[k]);
2648 // for (int i=1; i<=L; i++) fprintf(alnf,"%c",i2aa(X[k][i]));
2649 // fprintf(alnf,"\n");
2650 char* ptr=seq[k];
2651 for (; *ptr!='\0'; ptr++)
2652 if (*ptr==45 || (*ptr>=65 && *ptr<=90)) fprintf(alnf,"%c",*ptr);
2653 fprintf(alnf,"\n");
2654 }
2655 }
2656
2657 fclose(alnf);
2658 }
2659
2660
2661
2662 /*
2663 * FIXME: this function contains a reference to MAXSEQ & MAXCOL
2664 * however, this may not be accessed (FS)
2665 */
2666 /////////////////////////////////////////////////////////////////////////////////////
2667 /**
2668 * @brief Read a3m slave alignment of hit from file and merge into (query) master alignment
2669 */
2670 void
MergeMasterSlave(Hit & hit,char ta3mfile[])2671 Alignment::MergeMasterSlave(Hit& hit, char ta3mfile[])
2672 {
2673 Alignment Tali;
2674 char* cur_seq = new(char[MAXCOL]); // Sequence currently read in
2675 int maxcol=MAXCOL;
2676 int l,ll; // position in unaligned template (T) sequence Tali.seq[l]
2677 int i; // counts match states in query (Q) HMM
2678 int j; // counts match states in T sequence Tali.seq[l]
2679 int h; // position in aligned T sequence cur_seq[h]
2680 int k; // sequence index
2681 char c; //
2682 printf("****************%s:%s:%d: did get into MergeMasterSlave\n", __FUNCTION__, __FILE__, __LINE__);
2683 if (v>=3) printf("Merging %s to query alignment\n",ta3mfile);
2684
2685 // If par.append==1 do not print query alignment
2686 if (par.append) for (k=0; k<N_in; k++) keep[k]=display[k]=KEEP_NOT;
2687
2688 // Read template alignment into Tali
2689 FILE* ta3mf=fopen(ta3mfile,"r");
2690 if (!ta3mf) OpenFileError(ta3mfile);
2691 Tali.Read(ta3mf,ta3mfile);
2692 fclose(ta3mf);
2693
2694 // Filter Tali alignment
2695 Tali.Compress(ta3mfile);
2696 N_filtered = Tali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
2697
2698 // Record imatch[j]
2699 int* imatch=new int[hit.j2+1];
2700 int step = hit.nsteps;
2701 for (j=hit.j1; j<=hit.j2; j++)
2702 {
2703 // Advance to position of next T match state j
2704 while (hit.j[step]<j) step--;
2705 imatch[j] = hit.i[step];
2706 // printf("step=%-3i i=%-3i j=%-3i\n",step,imatch[j],j);
2707 }
2708
2709 // Determine number of match states of Qali
2710 for (L=0,l=1; seq[kfirst][l]>'\0'; l++)
2711 if ((seq[kfirst][l]>='A' && seq[kfirst][l]<='Z') || seq[kfirst][l]=='-') L++;
2712
2713 // For each sequence in T alignment: align to Qali
2714 for (k=0; k<Tali.N_in; k++)
2715 {
2716 if (!Tali.keep[k]) continue;
2717 if (N_in>=MAXSEQ)
2718 {
2719 fprintf(stderr,"WARNING in %s: maximum number of %i sequences exceeded while reading %s. Skipping all following sequences\n",program_name,MAXSEQ,ta3mfile);
2720 break;
2721 }
2722 cur_seq[0]=' '; // 0'th position not used
2723
2724 // Add the hit.i1-1 left end gaps to aligned sequence
2725 for (h=1; h<hit.i1; h++) cur_seq[h]='-';
2726
2727 // Advance to match state hit.j1 of Tali.seq[k]
2728 for (j=0, l=1; (c=Tali.seq[k][l])>'\0'; l++)
2729 if ((c>='A' && c<='Z') || c=='-') // match state at position l?
2730 if ((++j)==hit.j1) break; // yes: increment j. Reached hit,j1? yes: break
2731
2732 if (j<hit.j1)
2733 {printf("Error: did not find %i match states in sequence %i of %s. Sequence:\n%s\n",hit.j1,k,Tali.name,Tali.seq[k]); exit(1);}
2734
2735 // Write first match state to cur_seq
2736 int iprev=hit.i1; // index of previous query match state
2737 int lprev=l; // previous T match state in Tali.seq[k][l]
2738 cur_seq[h++] = Tali.seq[k][l]; // first column of alignment is Match-Match state
2739
2740 // For each further match state j in alignment
2741 step = hit.nsteps;
2742 for (j=hit.j1+1; j<=hit.j2; j++)
2743 {
2744 // Advance to position of next T match state j
2745 i=imatch[j];
2746
2747 // Advance to position of next T match state j
2748 while ((c=Tali.seq[k][++l])>'\0' && ((c>='a' && c<='z') || c=='.')) ;
2749
2750 int di=i-iprev; // number of Match states in Q between T match state j-1 and j
2751 int dl=l-lprev; // 1 + number of inserted residues in T sequence between T match state j-1 and j
2752 if (di==1)
2753 {
2754 // One Q match state for one T match state (treated as special case for speed reasons)
2755 // i: i-1 i di=1
2756 // Q: XXXXXX.....XXXXXX
2757 // T: YYYYYYyyyyyYYYYYY
2758 // j: j-1 j
2759 // l: lprev l dl=6
2760
2761 // Inserts in lower case
2762 for (ll=lprev+1; ll<l; ll++)
2763 if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2764
2765 // Template Match state -> upper case
2766 cur_seq[h++] = Tali.seq[k][ll];
2767 }
2768 else if (di==0)
2769 {
2770 // Gap in query: no Q match state for on T match state (special case for speed reasons)
2771 // i: i-1 i-1 di=0
2772 // Q: XXXXXX.....~~~XXX
2773 // T: YYYYYYyyyyyYYYYYY
2774 // j: j-1 j
2775 // l: lprev l dl=6
2776
2777 // All T residues (including T match state) in lower case
2778 for (ll=lprev+1; ll<=l; ll++)
2779 if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2780 }
2781 else if (di>=dl)
2782 {
2783 // More Match states in Q than Inserts in the T sequence
2784 // => half T inserts y left, half right-aligned in uc, gaps to fill up
2785 // Number of T insert residues to be left-aligned: (int)(dl/2)
2786 // i: iprev i di=7
2787 // Q: XXXXXXXXXXXXXXXXXX
2788 // T: YYYYYYYyyy-yyYYYYY
2789 // j: j-1 j
2790 // l: lprev l dl=6
2791
2792 // Add left-bounded template residues
2793 for (ll=lprev+1; ll<=lprev+(int)(dl/2); ll++)
2794 cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2795
2796 // Add central gaps
2797 for (int gap=1; gap<=di-dl; gap++) cur_seq[h++]='-';
2798
2799 // Add right-bounded residues
2800 for (; ll<=l; ll++)
2801 cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2802 }
2803 else if (di<dl)
2804 {
2805 // Fewer Match states in Q than inserts in T sequence
2806 // => half of available space di for left- half for right-aligned T inserts, rest in lc
2807 // number of T inserts to be left-aligned in uc: (int)(di/2),
2808 // i: iprev i di=5
2809 // Q: XXXXXXXXX.XXXXXXX
2810 // T: YYYYYYYyyyyyYYYYY
2811 // j: j-1 j
2812 // l: lprev l dl=6
2813
2814 // Add left-bounded template residues
2815 for (ll=lprev+1; ll<=lprev+(int)(di/2); ll++)
2816 cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2817
2818 // Add central inserts
2819 for (int ins=1; ins<=dl-di; ins++,ll++)
2820 if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]);
2821
2822 // Add right-bounded residues
2823 for (; ll<=l; ll++)
2824 cur_seq[h++]=uprchr(Tali.seq[k][ll]);
2825 }
2826 // printf("i=%-3i j=%-3i l=%-3i cur_seq=%s\n",i,j,l,cur_seq);
2827
2828 iprev=i; lprev=l;
2829 if (h>=maxcol-1000) // too few columns? Reserve double space
2830 {
2831 char* new_seq=new char[2*maxcol];
2832 strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ????
2833 delete[](cur_seq); (cur_seq) = NULL;
2834 cur_seq=new_seq;
2835 maxcol*=2;
2836 }
2837 }
2838
2839 // Add the remaining gaps '-' to the end of the template sequence
2840 for (i=hit.i2+1; i<=L; i++) cur_seq[h++]='-';
2841 cur_seq[h++]='\0';
2842
2843 keep[N_in] = display[N_in] = KEEP_CONDITIONALLY;
2844 seq[N_in]=new char[h];
2845 if (!seq[N_in]) MemoryError("array for input sequences");
2846 strcpy(seq[N_in],cur_seq);
2847 X[N_in]=new char[h];
2848 if (!X[N_in]) MemoryError("array for input sequences");
2849 I[N_in]=new short unsigned int[h];
2850 if (!I[N_in]) MemoryError("array for input sequences");
2851 sname[N_in]=new char[strlen(Tali.sname[k])+1];
2852 if (!sname[N_in]) MemoryError("array for input sequences");
2853 strcpy(sname[N_in],Tali.sname[k]);
2854 N_in++;
2855
2856 // printf("k=%-3i %s\n",k,Tali.seq[k]);
2857 // printf("Query %s\n",seq[kfirst]);
2858 // printf("k=%-3i %s\n\n",k,cur_seq);
2859
2860 } // end for (k)
2861
2862 // printf("N_in=%-5i HMM=%s with %i sequences\n",N_in,ta3mfile,N_filtered);
2863
2864 delete[] cur_seq; cur_seq = NULL;
2865 delete[] imatch; imatch = NULL;
2866 delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment
2867 delete[] first; first=NULL; // if first already existed it will be to short for merged alignment
2868 delete[] last; last=NULL; // if last already existed it will be to short for merged alignment
2869
2870 } /* this is the end of Alignment::MergeMasterSlave() */
2871
2872
2873 /////////////////////////////////////////////////////////////////////////////////////
2874 /**
2875 * @brief Add a sequence to Qali
2876 */
2877 void
AddSequence(char Xk[],int Ik[])2878 Alignment::AddSequence(char Xk[], int Ik[])
2879 {
2880 int i; // position in query and target
2881 if (L<=0) InternalError("L is not set in AddSequence()");
2882 X[N_in]=new char[L+2];
2883 for (i=0; i<=L+1; i++) X[N_in][i]=Xk[i];
2884 if (Ik==NULL)
2885 for (i=0; i<=L+1; i++) I[N_in][i]=0;
2886 else
2887 for (i=0; i<=L+1; i++) I[N_in][i]=Ik[i];
2888 N_in++;
2889 }
2890
2891
2892 /////////////////////////////////////////////////////////////////////////////////////
2893 /**
2894 * @brief Determine matrix of position-specific weights w[k][i] for multiple alignment
2895 * Pos-specific weights are calculated like in "Amino_acid_frequencies_and_transitions_from_M_state()"
2896 */
2897 void
GetPositionSpecificWeights(float * w[])2898 Alignment::GetPositionSpecificWeights(float* w[])
2899 {
2900 // Calculate position-dependent weights wi[k] for each i.
2901 // For calculation of weights in column i use sub-alignment
2902 // over sequences which have a *residue* in column i (no gap, no end gap)
2903 // and over columns where none of these sequences has an end gap.
2904 // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L.
2905 // n[j][a] = number of occurences of amino acid a at column j of the subalignment,
2906 // => only columns with n[j][ENDGAP]=0 are contained in the subalignment!
2907 // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0)
2908 // then the old values w[k][i] and ncol are used for the new position i.
2909 // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22
2910
2911 char* in=keep; // to keep the code similar to Amino_acid_frequencies_and_transitions_from_M_state()
2912 int k; // index of sequence
2913 int i,j; // position in alignment
2914 int a; // amino acid (0..19)
2915 int naa; // number of different amino acids
2916 int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j
2917 int nseqi=0; // number of sequences in subalignment i
2918 int ncol=0; // number of columns j that contribute to Neff[i]
2919 char change; // has the set of sequences in subalignment changed? 0:no 1:yes
2920
2921
2922 // Global weights?
2923 if (par.wg==1)
2924 {
2925 for (k=0; k<N_in; k++)
2926 for (i=1; i<=L; i++) w[k][i]=wg[k];
2927 }
2928 else
2929 {
2930
2931 // Initialization
2932 n = new int*[L+2];
2933 for (j=1; j<=L; j++) n[j]=new(int[NAA+3]);
2934 for (j=1; j<=L; j++)
2935 for (a=0; a<NAA+3; a++) n[j][a]=0;
2936
2937 //////////////////////////////////////////////////////////////////////////////////////////////
2938 // Main loop through alignment columns
2939 for (i=1; i<=L; i++) // Calculate w[k][i]
2940 {
2941 change=0;
2942 // Check all sequences k and update n[j][a] and ri[j] if necessary
2943 for (k=0; k<N_in; k++)
2944 {
2945 if (!in[k]) continue;
2946 if (X[k][i-1]>=ANY && X[k][i]<ANY)
2947 { // ... if sequence k was NOT included in i-1 and has to be included for column i
2948 change=1;
2949 nseqi++;
2950 for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]++;
2951 }
2952 else if (X[k][i-1]<ANY && X[k][i]>=ANY)
2953 { // ... if sequence k WAS included in i-1 and has to be thrown out for column i
2954 change=1;
2955 nseqi--;
2956 for (int j=1; j<=L; j++) n[j][ (int)X[k][j]]--;
2957 }
2958 } //end for (k)
2959 nseqs[i]=nseqi;
2960
2961 // If subalignment changed: update weights w[k][i] and Neff[i]
2962 if (change)
2963 {
2964 // Initialize weights and numbers of residues for subalignment i
2965 ncol=0;
2966 for (k=0; k<N_in; k++) w[k][i]=0.0;
2967
2968 // sum wi[k] over all columns j and sequences k of subalignment
2969 for (j=1; j<=L; j++)
2970 {
2971 // do at least a fraction MAXENDGAPFRAC of sequences in subalignment contain an end gap in j?
2972 if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue;
2973 naa=0; for (a=0; a<20; a++) if(n[j][a]) naa++;
2974 if (naa==0) continue;
2975 ncol++;
2976 for (k=0; k<N_in; k++)
2977 {
2978 if (in[k] && X[k][i]<ANY && X[k][j]<ANY)
2979 {
2980 // if (!n[j][ (int)X[k][j]]) {fprintf(stderr,"Error: Mi=%i: n[%i][X[%i]]=0! (X[%i]=%i)\n",i,j,k,k,X[k][j]);}
2981 w[k][i]+=1.0/float(n[j][ (int)X[k][j] ]*naa);
2982 }
2983 }
2984 }
2985
2986 // Check whether number of columns in subalignment is sufficient
2987 if (ncol<NCOLMIN)
2988 // Take global weights
2989 for (k=0; k<N_in; k++)
2990 if(in[k]) {if(X[k][i]<ANY) w[k][i]=wg[k]; else w[k][i]=0.0;}
2991 }
2992 }
2993 // end loop through alignment columns i
2994 ///////////////////////////////////////////////////////////////////////
2995
2996 // delete n[][]
2997 for (j=1; j<=L; j++){
2998 delete[](n[j]); (n[j]) = NULL;
2999 }
3000 delete[](n); (n) = NULL;
3001
3002 }
3003 return;
3004 }
3005
3006 #ifdef CLUSTALO
3007 /* @* Transfer
3008 *
3009 * take sequence data from Clustal and transfer it into
3010 * hhalign accessible information (structure/class)
3011 *
3012 * Note that hhalign does not see all sequences/profiles
3013 * but only sequences that are elements of the 2 profiles
3014 * to be aligned.
3015 *
3016 * References to the required sequences are passed into hhalign
3017 * through auxilliary pointers that are shallow copies of the
3018 * sequence/profile data available to Clustal.
3019 *
3020 * Re-allocating memory for these auxilliary pointers
3021 * would be desaterous, as it might detach the memory
3022 * seen by Clustal.
3023 */
3024 void
Transfer(char ** ppcProf,int iCnt)3025 Alignment::Transfer(char **ppcProf, int iCnt){
3026
3027 /* @<variables local to Transfer@> */
3028 int iLen; /* length of profile */
3029 int k; /* generic iterator */
3030
3031 /* @<initialisation@> */
3032 N_in = iCnt;
3033 N_filtered = N_ss = 0;
3034 kss_dssp = ksa_dssp = kss_pred = kss_conf = -1;
3035 kfirst = 0;
3036 strcpy(longname, "unknown_long_seq_name");
3037 strcpy(name, "unknown_seq_name");
3038 strcpy(file, "unknown_file_name");
3039 n_display = iCnt;
3040
3041 /* @<determine length of profile@>
3042 all sequences in profile should have same length,
3043 so only do it for 1st */
3044 for (iLen = 0; '\0' != ppcProf[0][iLen]; iLen++);
3045
3046 /* @<allocate memory for sequences etc@> */
3047 for (k = 0; k < iCnt; k++){
3048 #define GOOD_MEASURE 1000 /* Temporary -- can be removed once rest in place */
3049 I[k] = new short unsigned int[iLen+2+GOOD_MEASURE];
3050 X[k] = new char[iLen+2+GOOD_MEASURE];
3051 seq[k] = new char[iLen+2+GOOD_MEASURE];
3052 seq[k][0] = ' ';
3053 seq[k][1] = '\0';
3054 if (NULL == ppcProf[k]){
3055 printf("%s:%d: Arena[%d]=NULL, cnt=%d\n", __FILE__, __LINE__, k, iCnt);
3056 exit(-1);
3057 }
3058 strcat(seq[k], ppcProf[k]);
3059 keep[k] = KEEP_CONDITIONALLY;
3060 display[k] = KEEP_CONDITIONALLY;
3061 sname[k] = new(char[GOOD_MEASURE]);
3062 strcpy(sname[k], "unknown_sname");
3063 } /* (0 <= k < iCnt) */
3064 /* FIXME: Soeding always makes 1st sequence permanent */
3065 /*keep[0] = KEEP_ALWAYS;
3066 display[k] = KEEP_ALWAYS;*/
3067 #if 1
3068 /* Believe that the first and last positions are
3069 most important in stability of this algorithm.
3070 Must make sure that at least 2 sequences with
3071 residues in these positions are kept.
3072 Think any sequence will do, but better to keep
3073 the one with the longest 'contig'
3074 */
3075 int iSeq; /* sequence iterator */
3076 int iHeadLen = 0, iHeadID = -1; /* length & ID of longest head contig */
3077 int iTailLen = 0, iTailID = -1; /* length & ID of longest head contig */
3078 int iCont = -1;
3079 char *pcFind = NULL;
3080
3081 #if 0
3082 printf("%s:%s:%d: NEW PROFILE (%d seq) ================\n",
3083 __FUNCTION__, __FILE__, __LINE__, iCnt);
3084 #endif
3085 for (iSeq = 0; iSeq < iCnt; iSeq++){
3086 #if 0
3087 printf("%s:%s:%d: consider seq %d ------------------\n",
3088 __FUNCTION__, __FILE__, __LINE__, iSeq);
3089 #endif
3090 pcFind = strchr(&seq[iSeq][1], '-');
3091 if (NULL == pcFind){
3092 /* no gap at all in this sequences, spans entire profile */
3093 iHeadID = iTailID = iSeq;
3094 iHeadLen = iTailLen = iLen;
3095 break;
3096 }
3097 iCont = (int)(pcFind - &seq[iSeq][1]);
3098 if (iCont > iHeadLen){
3099 iHeadLen = iCont;
3100 iHeadID = iSeq;
3101 }
3102 pcFind = strrchr(seq[iSeq], '-');
3103 iCont = iLen - (int)(pcFind - seq[iSeq]);
3104 if (iCont > iTailLen){
3105 iTailLen = iCont;
3106 iTailID = iSeq;
3107 }
3108
3109 #if 0
3110 printf("%s:%s:%d: seq %3d: len = %d(%d) %s\n",
3111 __FUNCTION__, __FILE__, __LINE__, iSeq, iCont, iLen, seq[iSeq]);
3112 #endif
3113 } /* 0 <= iSeq < iCnt */
3114 #if 0
3115 printf("%s:%s:%d: seq %d is winner with head contig of %d, seq %d tail contig of %d\n"
3116 , __FUNCTION__, __FILE__, __LINE__, iHeadID, iHeadLen, iTailID, iTailLen);
3117 #endif
3118 if ( (-1 == iHeadID) || (-1 == iTailID) ){
3119 printf("%s:%s:%d: profile has no leading and/or trailing residues (h=%d:t=%d:#=%d)\n",
3120 __FUNCTION__, __FILE__, __LINE__, iHeadID, iTailID, iCnt);
3121 }
3122 else{
3123 keep[iHeadID] = KEEP_ALWAYS;
3124 keep[iTailID] = KEEP_ALWAYS;
3125 }
3126 #endif
3127 /* @= */
3128 return;
3129
3130 } /* this is the end of Transfer() */
3131 #endif
3132
3133 #ifdef CLUSTALO
3134 /* @* Alignment::ClobberGlobal (eg: qali)
3135 *
3136 * Note: originally hhalign() was stand-alone code,
3137 * there are a couple of GLOBAL (!) variables,
3138 * which would have been destroyed on exit.
3139 * However, now there is no 'exit' from hhalign(),
3140 * and on re-entry the global variable must be clean again.
3141 */
3142 void
ClobberGlobal(void)3143 Alignment::ClobberGlobal(void){
3144
3145 /* @<essentials@>
3146 these are essential to re-set (as some of them are used as flags) */
3147 for(int k=0; k<N_in; k++)
3148 {
3149 delete[] sname[k]; sname[k] = NULL;
3150 delete[] seq[k]; seq[k] = NULL;
3151 delete[] X[k]; X[k] = NULL;
3152 delete[] I[k]; I[k] = NULL;
3153 }
3154 delete[] nres; nres = NULL;
3155 delete[] first; first = NULL;
3156 delete[] last; last = NULL;
3157 delete[] ksort; ksort = NULL;
3158 N_in = N_filtered = n_display = 0;
3159 L = 0;
3160 kss_dssp = ksa_dssp = kss_pred = kss_conf = kfirst = -1;
3161
3162 /* @<re-set but keep memory@>
3163 do not free the memory but re-set content */
3164 longname[0] = '\0'; //delete[] longname; longname = NULL;
3165 keep[0] = '\0'; //delete[] keep; keep = NULL;
3166 display[0] = '\0'; //delete[] display; display = NULL;
3167 wg[0] = 0; //delete[] wg; wg = NULL;
3168 nseqs[0] = 0; //delete[] nseqs; nseqs = NULL;
3169 name[0]='\0';
3170 fam[0]='\0';
3171 file[0]='\0';
3172 //delete[] sname; sname = NULL;
3173 //delete[] seq; seq = NULL;
3174 //delete[] X; X = NULL;
3175 //delete[] I; I = NULL;
3176 //delete[] l; l = NULL;
3177
3178 /* @= */
3179 return;
3180 }
3181 #endif
3182