1 // hhfullalignment.C
2 
3 #include "hhhalfalignment.h"
4 
5 /////////////////////////////////////////////////////////////////////////////////////
6 /////////////////////////////////////////////////////////////////////////////////////
7 // Methods of class HalfAlignment
8 /////////////////////////////////////////////////////////////////////////////////////
9 /////////////////////////////////////////////////////////////////////////////////////
10 
11 /////////////////////////////////////////////////////////////////////////////////////
12 // Constructor
HalfAlignment(int maxseqdis)13 HalfAlignment::HalfAlignment(int maxseqdis) {
14   n = 0;
15   sname = seq = NULL;
16   nss_dssp = nss_pred = nss_conf = nsa_dssp = ncons = -1;
17   h = new int[maxseqdis];   //h[k] = next position of sequence k to be written
18   s = new char*[maxseqdis];  //s[k][h] = character in column h, sequence k of output alignment
19   l = new int*[maxseqdis];  //counts non-gap residues: l[k][i] = index of last residue AT OR BEFORE match state i in seq k
20   m = new int*[maxseqdis];  //counts positions:        m[k][i] = position of match state i in string seq[k]
21   if (!h || !s || !l || !m)
22     MemoryError("space for formatting HMM-HMM alignment", __FILE__, __LINE__,
23                 __func__);
24 }
25 
26 /////////////////////////////////////////////////////////////////////////////////////
27 // Destructor
~HalfAlignment()28 HalfAlignment::~HalfAlignment() {
29   Unset();
30   delete[] h;
31   delete[] s;
32   delete[] l;
33   delete[] m;
34 }
35 
36 /////////////////////////////////////////////////////////////////////////////////////
37 // Free memory in HalfAlignment arrays s[][], l[][], and m[][]
Unset()38 void HalfAlignment::Unset() {
39   // Free memory for alignment characters and residue counts
40   for (int k = 0; k < n; k++) {
41     delete[] s[k];
42     delete[] l[k];
43     delete[] m[k];
44   }
45   n = 0;
46   sname = seq = NULL;
47   nss_dssp = nss_pred = nss_conf = nsa_dssp = ncons = -1;
48 }
49 
50 /////////////////////////////////////////////////////////////////////////////////////
51 // Prepare a2m/a3m alignment: Calculate l[k][i] (residue indices) and m[k][i] (position in seq[k])
Set(char * name,char ** seq_in,char ** sname_in,int n_in,int L_in,int n1,int n2,int n3,int n4,int nc)52 void HalfAlignment::Set(char* name, char** seq_in, char** sname_in, int n_in,
53                         int L_in, int n1, int n2, int n3, int n4, int nc) {
54   int i;     //counts match states in seq[k]
55   int ll;    //counts residues LEFT from or at current position in seq[k]
56   int mm;    //counts postions in string seq[k]
57   int k;     //counts sequences
58   char c;
59   char warned = 0;
60 
61   nss_dssp = n1;
62   nss_pred = n2;
63   nss_conf = n3;
64   nsa_dssp = n4;
65   ncons = nc;
66   seq = seq_in;     //flat copy of sequences
67   sname = sname_in;  //flat copy of sequence names
68   n = n_in;
69   L = L_in;
70   pos = 0;
71 
72   // Allocate memory for alignment characters and residue counts
73   for (k = 0; k < n; k++) {
74     s[k] = new char[LINELEN];
75     l[k] = new int[L + 10];
76     m[k] = new int[L + 10];
77     if (!s[k] || !l[k] || !m[k])
78       MemoryError("space for formatting HMM-HMM alignment", __FILE__, __LINE__,
79                   __func__);
80     h[k] = 0;  //starting positions in alignment = 0
81   }
82 
83   for (k = 0; k < n; k++) {
84     m[k][0] = 0;  // 0'th match state (virtual) is begin state at 0
85     //if k is consensus sequence
86     if (k == nc) {
87       for (i = 1; i <= L; i++)
88         m[k][i] = l[k][i] = i;
89       m[k][L + 1] = l[k][L + 1] = L;
90       continue;
91     }
92     i = 1;
93     mm = 1;
94     ll = 1;
95     while ((c = seq[k][mm])) {
96       if (MatchChr(c) == c && i <= L)    //count match/delete states
97       {
98         l[k][i] = ll;
99         m[k][i] = mm;
100         i++;
101       }
102       if (WordChr(c))
103         ll++;  //index of next residue
104       mm++;
105     }
106     l[k][i] = ll - 1;  //set l[k][L+1] eq number of residues in seq k (-1 since there is no residue at L+1st match state)
107     m[k][i] = mm;   //set m[k][L+1]
108     if ((i - 1) != L && !warned) {
109       HH_LOG(WARNING) << "Sequence " << sname[k] << " in HMM " << name
110                 << " has " << i << " match states but should have " << L
111                 << "\n";
112       warned = 1;
113     }
114   }
115 
116   HH_LOG(DEBUG2) << "  i chr   m   l\n";
117   for (i = 0; i <= L + 1; i++)
118     HH_LOG(DEBUG2) << i << "   " << seq[0][m[0][i]] << " " << m[0][i]
119                              << " " << l[0][i] << std::endl;
120   HH_LOG(DEBUG2) << std::endl;
121 }
122 
123 /////////////////////////////////////////////////////////////////////////////////////
124 // Fill in insert states following match state i (without inserting '.' to fill up)
AddInserts(int i)125 void HalfAlignment::AddInserts(int i) {
126   for (int k = 0; k < n; k++)                        // for all sequences...
127     for (int mm = m[k][i] + 1; mm < m[k][i + 1]; mm++)  // for all inserts between match state i and i+1...
128       s[k][h[k]++] = seq[k][mm];      // fill inserts into output alignment s[k]
129 }
130 
131 /////////////////////////////////////////////////////////////////////////////////////
132 // Fill up alignment with gaps '.' to generate flush end (all h[k] equal)
FillUpGaps()133 void HalfAlignment::FillUpGaps() {
134   int k;      //counts sequences
135   pos = 0;
136 
137   // Determine max position h[k]
138   for (k = 0; k < n; k++)
139     pos = imax(h[k], pos);
140 
141   // Fill in gaps up to pos
142   for (k = 0; k < n; k++) {
143     for (int hh = h[k]; hh < pos; hh++)
144       s[k][hh] = '.';
145     h[k] = pos;
146   }
147 }
148 
149 /////////////////////////////////////////////////////////////////////////////////////
150 // Fill in insert states following match state i and fill up gaps with '.'
AddInsertsAndFillUpGaps(int i)151 void HalfAlignment::AddInsertsAndFillUpGaps(int i) {
152   AddInserts(i);
153   FillUpGaps();
154 }
155 
156 /////////////////////////////////////////////////////////////////////////////////////
157 // Add gap column '.'
AddChar(char c)158 void HalfAlignment::AddChar(char c) {
159   for (int k = 0; k < n; k++)
160     s[k][h[k]++] = c;
161   pos++;
162 }
163 
164 /////////////////////////////////////////////////////////////////////////////////////
165 // Add match state column i as is
AddColumn(int i)166 void HalfAlignment::AddColumn(int i) {
167   for (int k = 0; k < n; k++)
168     s[k][h[k]++] = seq[k][m[k][i]];
169   pos++;
170 }
171 
172 /////////////////////////////////////////////////////////////////////////////////////
173 // Add match state column i as insert state
AddColumnAsInsert(int i)174 void HalfAlignment::AddColumnAsInsert(int i) {
175   char c;
176   for (int k = 0; k < n; k++)
177     if ((c = seq[k][m[k][i]]) != '-' && (c < '0' || c > '9'))
178       s[k][h[k]++] = InsertChr(c);
179   pos++;
180 }
181 
182 /////////////////////////////////////////////////////////////////////////////////////
183 // Remove all characters c from template sequences
RemoveChars(char c)184 void HalfAlignment::RemoveChars(char c) {
185   int k, h, hh;
186   for (k = 0; k < n; k++) {
187     for (h = hh = 0; h < pos; h++)
188       if (s[k][h] != c)
189         s[k][hh++] = s[k][h];
190     s[k][++hh] = '\0';
191   }
192 }
193 
194 /////////////////////////////////////////////////////////////////////////////////////
195 // Transform alignment sequences from A3M to A2M (insert ".")
BuildFASTA()196 void HalfAlignment::BuildFASTA() {
197   AddInserts(0);
198   FillUpGaps();
199   for (int i = 1; i <= L; i++) {
200     AddColumn(i);
201     AddInserts(i);
202     FillUpGaps();
203   }
204   ToFASTA();
205 }
206 
207 /////////////////////////////////////////////////////////////////////////////////////
208 // Transform alignment sequences from A3M to A2M (insert ".")
BuildA2M()209 void HalfAlignment::BuildA2M() {
210   AddInserts(0);
211   FillUpGaps();
212   for (int i = 1; i <= L; i++) {
213     AddColumn(i);
214     AddInserts(i);
215     FillUpGaps();
216   }
217   AddChar('\0');
218 }
219 
220 /////////////////////////////////////////////////////////////////////////////////////
221 // Transform alignment sequences from A3M to A2M (insert ".")
BuildA3M()222 void HalfAlignment::BuildA3M() {
223   AddInserts(0);
224   for (int i = 1; i <= L; i++) {
225     AddColumn(i);
226     AddInserts(i);
227   }
228   AddChar('\0');
229 }
230 
231 /////////////////////////////////////////////////////////////////////////////////////
232 // Transform alignment sequences from A2M to FASTA ( lowercase to uppercase and '.' to '-')
ToFASTA()233 void HalfAlignment::ToFASTA() {
234   for (int k = 0; k < n; k++) {
235     uprstr(s[k]);
236     strtr(s[k], ".", "-");
237   }
238 }
239 
240 /////////////////////////////////////////////////////////////////////////////////////
241 // Align query (HalfAlignment) to template (i.e. hit) match state structure
242 /////////////////////////////////////////////////////////////////////////////////////
AlignToTemplate(Hit & hit,const char outformat)243 void HalfAlignment::AlignToTemplate(Hit& hit, const char outformat) {
244   int i, j;
245   int step;    // column of the HMM-HMM alignment (first:nstep, last:1)
246   char state;
247 
248   if (0) {  //par.loc==0) { //////////////////////////////////////////// STILL NEEDED??
249     // If in global mode: Add part of alignment before first MM state
250     AddInserts(0);  // Fill in insert states before first match state
251     for (i = 1; i < hit.i[hit.nsteps]; i++) {
252       AddColumnAsInsert(i);
253       AddInserts(i);
254       if (outformat <= 2)
255         FillUpGaps();
256     }
257   }
258 
259   // Add endgaps (First state must be an MM state!!)
260   for (j = 1; j < hit.j[hit.nsteps]; j++) {
261     AddChar('-');
262   }
263 
264   // Add alignment between first and last MM state
265   for (step = hit.nsteps; step >= 1; step--) {
266     state = hit.states[step];
267     i = hit.i[step];
268 
269     switch (state) {
270       case MM:  //MM pair state (both query and template in Match state)
271         AddColumn(i);
272         AddInserts(i);
273         break;
274       case DG:  //D- state
275       case MI:  //MI state
276         AddColumnAsInsert(i);
277         AddInserts(i);
278         break;
279       case GD:  //-D state
280       case IM:  //IM state
281         AddChar('-');
282         break;
283     }
284     if (outformat <= 2)
285       FillUpGaps();
286 
287   }
288 
289   if (0) {  //par.loc==0) { //////////////////////////////////////////// STILL NEEDED??
290 
291     // If in global mode: Add part of alignment after last MM state
292     for (i = hit.i[1] + 1; i <= L; i++) {
293       AddColumnAsInsert(i);
294       AddInserts(i);
295       if (outformat == 2)
296         FillUpGaps();
297     }
298   }
299 
300   // Add endgaps
301   for (j = hit.j[1] + 1; j <= hit.L; j++) {
302     AddChar('-');
303   }
304 
305   // Add end-of-string character
306   AddChar('\0');
307 }
308 
309 /////////////////////////////////////////////////////////////////////////////////////
310 // Write the a2m/a3m alignment into alnfile
311 /////////////////////////////////////////////////////////////////////////////////////
Print(char * alnfile,const char append,char * commentname,const char format[])312 void HalfAlignment::Print(char* alnfile, const char append, char* commentname,
313                           const char format[]) {
314   int k;      //counts sequences
315   int omitted = 0;  // counts number of sequences with no residues in match states
316   FILE *outf;
317   char* tmp_name = new char[NAMELEN];
318   if (strcmp(alnfile, "stdout")) {
319     if (append)
320       outf = fopen(alnfile, "a");
321     else
322       outf = fopen(alnfile, "w");
323     if (!outf)
324       OpenFileError(alnfile, __FILE__, __LINE__, __func__);
325   } else
326     outf = stdout;
327 
328   HH_LOG(DEBUG) << "Writing alignment to " << alnfile << "\n";
329 
330   if (!format || strcmp(format, "psi")) {
331     if (commentname != NULL)
332       fprintf(outf, "#%s\n", commentname);
333 
334     for (k = 0; k < n; k++)
335       if (k == nss_pred || k == nss_conf || k == nss_dssp || k == nsa_dssp) {
336         fprintf(outf, ">%s\n", sname[k]);
337         fprintf(outf, "%s\n", s[k]);
338       }
339     for (k = 0; k < n; k++) {
340       if (!(k == nss_pred || k == nss_conf || k == nss_dssp || k == nsa_dssp)) {
341         // Print sequence only if it contains at least one residue in a match state
342         if (1)  //strpbrk(s[k],"ABCDEFGHIKLMNPQRSTUVWXYZ1234567890"))
343         {
344           fprintf(outf, ">%s\n", sname[k]);
345           fprintf(outf, "%s\n", s[k]);
346         } else {
347           omitted++;
348           HH_LOG(DEBUG)
349               << sname[k]
350               << " contains no residue in match state. Omitting sequence\n";
351         }
352       }
353     }
354     if (omitted) {
355       HH_LOG(INFO) << "Omitted " << omitted << " sequences in "
356                              << alnfile
357                              << " which contained no residue in match state\n";
358     }
359   } else {
360     for (k = 0; k < n; k++) {
361       strwrd(tmp_name, sname[k], NAMELEN);
362       fprintf(outf, "%-20.20s ", tmp_name);
363       char* ptr = s[k];
364       for (; *ptr != '\0'; ptr++)
365         if (*ptr == 45 || (*ptr >= 65 && *ptr <= 90))
366           fprintf(outf, "%c", *ptr);
367       fprintf(outf, "\n");
368     }
369   }
370   fclose(outf);
371   delete[] tmp_name;
372 }
373