1 /* konings.c
2  * 1.0: SRE, Wed Jul  7 08:53:37 1993
3  * adapted for 2.0: SRE, Thu Sep  9 13:38:13 1993
4  *
5  * Representation of secondary structure and secondary structural
6  * alignments using Danielle Konings' string notation, and Hogeweg's
7  * mountain notation.
8  *
9  * See: Konings and Hogeweg, J. Mol. Biol. 207:597-614 1989
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 
17 #include "structs.h"
18 #include "funcs.h"
19 
20 #ifdef MEMDEBUG
21 #include "dbmalloc.h"
22 #endif
23 
24 
25 /* Function: Align2kh()
26  *
27  * Purpose:  Convert an alignment (align_s linked list) into
28  *           a secondary structure string. The symbols used
29  *           are > and < for the two sides of a MATP; '.'
30  *           for other symbols.
31  *
32  *           Also may return the "aligned" sequence; this
33  *           sequence is only aligned in the sense that deleted
34  *           consensus positions are represented as '.'
35  *           and insert positions are lower case.
36  *
37  *           Either ret_aseq or ret_khseq may be passed as NULL
38  *           if you don't want one of them.
39  */
40 int
Align2kh(struct align_s * ali,char ** ret_aseq,char ** ret_khseq)41 Align2kh(struct align_s  *ali,
42 	 char           **ret_aseq,
43 	 char           **ret_khseq)
44 {
45   struct align_s *curr;         /* ptr into alignment list     */
46   char           *aseq;         /* RETURN: aligned seq         */
47   char           *khseq;        /* RETURN: structure string    */
48   int             len;		/* length of alignment, khseq  */
49   int             pos;		/* position in khseq           */
50 
51 
52   /* Count the length of the list and malloc for khseq.
53    */
54   len = 0;
55   for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
56     len++;
57   if ((khseq = (char *) malloc ((len+1) * sizeof(char))) == NULL ||
58       (aseq  = (char *) malloc ((len+1) * sizeof(char))) == NULL)
59     Die("Memory allocation failed, line %d of %s", __LINE__, __FILE__);
60 
61   /* This used to be harder. Now align_s already has a field for
62    * secondary structure annotation, and we just have to copy it.
63    */
64   pos  = 0;
65   for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
66     {
67       switch (curr->type)
68 	{
69 	case uBEGIN_ST:
70 	case uBIFURC_ST:
71 	  break;		/* neither should appear in an align_s! */
72 
73 	case uDEL_ST:
74 	  khseq[pos] = ' ';
75 	  aseq[pos] = '-';
76 	  break;
77 
78 	case uINSL_ST:
79         case uINSR_ST:
80 	  khseq[pos] = curr->ss;
81 	  aseq[pos] = sre_tolower(curr->sym);
82 	  break;
83 
84         case uMATL_ST:
85         case uMATR_ST:
86 	case uMATP_ST:
87 	  khseq[pos] = curr->ss;
88 	  aseq[pos] = sre_toupper(curr->sym);
89 	  break;
90 
91 	default:
92 	  Die("unrecognized state type %d in Align2kh()", curr->type);
93 	}
94       pos++;
95     }
96 
97   khseq[pos] = '\0';
98   aseq[pos]  = '\0';
99 
100   if (ret_khseq == NULL) free(khseq); else *ret_khseq = khseq;
101   if (ret_aseq  == NULL) free(aseq);  else *ret_aseq  = aseq;
102   return 1;
103 }
104 
105 
106 
107 
108 /* Function: PrintAliLandscape()
109  *
110  * Purpose:  Print an alignment of sequence to model in the form
111  *           of a Konings/Hogeweg "mountain".
112  */
113 int
PrintAliLandscape(FILE * fp,struct cm_s * cm,struct align_s * ali)114 PrintAliLandscape(FILE           *fp,
115 		  struct cm_s    *cm,
116 		  struct align_s *ali)
117 {
118   int   altitude;		/* current height on the mountain */
119   struct align_s *curr;         /* ptr to current ali element     */
120   struct align_s *prev;         /* ptr to previous ali element    */
121   int   i;
122 
123   altitude = 0;
124   prev     = NULL;
125   for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
126     {
127       if (curr->pos >= 0)
128 	fprintf(fp, "%4d  %c ", curr->pos+1, curr->sym);
129       else
130 	fprintf(fp, "      %c ", curr->sym);
131 
132       for (i = 0; i < altitude; i++)
133 	fputc(' ', fp);
134 
135       switch (curr->type)
136 	{
137 	case uBEGIN_ST:
138 	case uBIFURC_ST: break;
139 	case uDEL_ST:    fputs("   DEL ", fp);  break;
140 	case uINSL_ST:   fputs("`  INSL", fp);  break;
141         case uINSR_ST:   fputs("\'  INSR", fp); break;
142 	case uMATL_ST:   fputs("\\  MATL", fp); break;
143 	case uMATR_ST:   fputs("/  MATR", fp);  break;
144 
145 	case uMATP_ST:
146 	  if (prev == NULL || curr->nodeidx > prev->nodeidx)
147 	    {
148 	      fputs(" v  MATP", fp);
149 	      altitude++;
150 	    }
151 	  else
152 	    {
153 	      fputs("^  MATP", fp);
154 	      altitude--;
155 	    }
156 	  break;
157 
158 
159 	default:
160 	  Die("unrecognized state type %d in PrintAliLandscape()", curr->type);
161 	}
162 
163       printf("  %d\n", curr->nodeidx);
164       prev = curr;
165     }
166 
167   return 1;
168 }
169 
170 
171 
172 /* Function: Trace2KHS()
173  *
174  * Purpose:  From a traceback tree of seq, produce a
175  *           secondary structure string. ">" and "<" are
176  *           used for pairwise emissions; "." for single-stranded stuff.
177  *           Note that structure is defined by pairwise emissions,
178  *           not by Watson-Crick-isms and stacking rules.
179  *
180  * Args:     tr          - traceback structure
181  *           seq         - sequence, 0..rlen-1
182  *           rlen        - length of seq and returned ss string
183  *           watsoncrick - TRUE to annotate canonical pairs only
184  *           ret_ss      - RETURN: alloc'ed secondary structure string
185  *
186  * Return:   ret_ss contains a string 0..rlen-1 containing the
187  *           secondary structure. Must be free'd by caller.
188  */
189 void
Trace2KHS(struct trace_s * tr,char * seq,int rlen,int watsoncrick,char ** ret_ss)190 Trace2KHS(struct trace_s *tr, char *seq, int rlen, int watsoncrick,
191           char **ret_ss)
192 {
193   struct tracestack_s *dolist;
194   struct trace_s      *curr;
195   char                *ss;
196 
197   if ((ss = (char *) malloc (sizeof(char) * rlen+1)) == NULL)
198     Die("malloc failed");
199   memset(ss, '.', rlen);
200   ss[rlen] = '\0';
201 
202   dolist = InitTracestack();
203   PushTracestack(dolist, tr->nxtl);
204 
205   while ((curr = PopTracestack(dolist)) != NULL)
206     {
207       if ( curr->type      == uMATP_ST )
208 	{
209 	  if (! watsoncrick  ||
210 	      IsRNAComplement(seq[curr->emitl], seq[curr->emitr], TRUE))
211 	    {
212 	      ss[curr->emitl] = '>';
213 	      ss[curr->emitr] = '<';
214 	    }
215 	}
216 
217       if (curr->nxtr) PushTracestack(dolist, curr->nxtr);
218       if (curr->nxtl) PushTracestack(dolist, curr->nxtl);
219     }
220   FreeTracestack(dolist);
221   *ret_ss = ss;
222 }
223 
224 /* Function: IsRNAComplement()
225  *
226  * Purpose:  Returns TRUE if sym1, sym2 are Watson-Crick complementary.
227  *           If allow_gu is TRUE, GU pairs also return TRUE.
228  */
229 int
IsRNAComplement(char sym1,char sym2,int allow_gu)230 IsRNAComplement(char sym1, char sym2, int allow_gu)
231 {
232   sym1 = toupper(sym1);
233   sym2 = toupper(sym2);
234   if (sym1 == 'T') sym1 = 'U';
235   if (sym2 == 'T') sym2 = 'U';
236 
237   if ((sym1 == 'A' && sym2 == 'U') ||
238       (sym1 == 'C' && sym2 == 'G') ||
239       (sym1 == 'G' && sym2 == 'C') ||
240       (sym1 == 'U' && sym2 == 'A') ||
241       (allow_gu && sym1 == 'G' && sym2 == 'U') ||
242       (allow_gu && sym1 == 'U' && sym2 == 'G'))
243     return TRUE;
244   else
245     return FALSE;
246 }
247 
248 
249 /* Function: KHS2ct()
250  *
251  * Purpose:  Convert a secondary structure string to an array of integers
252  *           representing what position each position is base-paired
253  *           to (0..len-1), or -1 if none. This is off-by-one from a
254  *           Zuker .ct file representation.
255  *
256  *           The .ct representation can accomodate pseudoknots but the
257  *           secondary structure string cannot easily; the string contains
258  *           "Aa", "Bb", etc. pairs as a limited representation of
259  *           pseudoknots. The string contains "><" for base pairs.
260  *           Other symbols are ignored. If allow_pseudoknots is FALSE,
261  *           the pseudoknot symbols will be ignored and these positions
262  *           will be treated as single stranded.
263  *
264  * Return:   ret_ct is allocated here and must be free'd by caller.
265  *           Returns 1 on success, 0 if ss is somehow inconsistent.
266  */
267 int
KHS2ct(char * ss,int len,int allow_pseudoknots,int ** ret_ct)268 KHS2ct(char *ss, int len, int allow_pseudoknots, int **ret_ct)
269 {
270   struct intstack_s *dolist[27];
271   int *ct;
272   int  i;
273   int  pos, pair;
274   int  status = 1;		/* success or failure return status */
275 
276   for (i = 0; i < 27; i++)
277     dolist[i] = InitIntStack();
278 
279   if ((ct = (int *) malloc (len * sizeof(int))) == NULL)
280     Die("malloc failed");
281   for (pos = 0; pos < len; pos++)
282     ct[pos] = -1;
283 
284   for (pos = 0; ss[pos] != '\0'; pos++)
285     {
286       if (ss[pos] > 127) status = 0; /* bulletproof against SGI buggy ctype.h */
287 
288       else if (ss[pos] == '>')	/* left side of a pair: push onto stack 0 */
289 	PushIntStack(dolist[0], pos);
290       else if (ss[pos] == '<')	/* right side of a pair; resolve pair */
291 	{
292 	  if (! PopIntStack(dolist[0], &pair))
293 	    { status = 0; }
294 	  else
295 	    {
296 	      ct[pos]  = pair;
297 	      ct[pair] = pos;
298 	    }
299 	}
300 				/* same stuff for pseudoknots */
301       else if (allow_pseudoknots && isupper((int) ss[pos]))
302 	PushIntStack(dolist[ss[pos] - 'A' + 1], pos);
303       else if (allow_pseudoknots && islower((int) ss[pos]))
304 	{
305 	  if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair))
306 	    { status = 0; }
307 	  else
308 	    {
309 	      ct[pos]  = pair;
310 	      ct[pair] = pos;
311 	    }
312 	}
313       else if (allow_pseudoknots && !isgap(ss[pos])) status = 0; /* bad character */
314     }
315 
316   for (i = 0; i < 27; i++)
317     if ( FreeIntStack(dolist[i]) > 0)
318       status = 0;
319 
320   *ret_ct = ct;
321   return status;
322 }
323 
324 
325