1 /* konings.c
2 * 1.0: SRE, Wed Jul 7 08:53:37 1993
3 * adapted for 2.0: SRE, Thu Sep 9 13:38:13 1993
4 *
5 * Representation of secondary structure and secondary structural
6 * alignments using Danielle Konings' string notation, and Hogeweg's
7 * mountain notation.
8 *
9 * See: Konings and Hogeweg, J. Mol. Biol. 207:597-614 1989
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16
17 #include "structs.h"
18 #include "funcs.h"
19
20 #ifdef MEMDEBUG
21 #include "dbmalloc.h"
22 #endif
23
24
25 /* Function: Align2kh()
26 *
27 * Purpose: Convert an alignment (align_s linked list) into
28 * a secondary structure string. The symbols used
29 * are > and < for the two sides of a MATP; '.'
30 * for other symbols.
31 *
32 * Also may return the "aligned" sequence; this
33 * sequence is only aligned in the sense that deleted
34 * consensus positions are represented as '.'
35 * and insert positions are lower case.
36 *
37 * Either ret_aseq or ret_khseq may be passed as NULL
38 * if you don't want one of them.
39 */
40 int
Align2kh(struct align_s * ali,char ** ret_aseq,char ** ret_khseq)41 Align2kh(struct align_s *ali,
42 char **ret_aseq,
43 char **ret_khseq)
44 {
45 struct align_s *curr; /* ptr into alignment list */
46 char *aseq; /* RETURN: aligned seq */
47 char *khseq; /* RETURN: structure string */
48 int len; /* length of alignment, khseq */
49 int pos; /* position in khseq */
50
51
52 /* Count the length of the list and malloc for khseq.
53 */
54 len = 0;
55 for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
56 len++;
57 if ((khseq = (char *) malloc ((len+1) * sizeof(char))) == NULL ||
58 (aseq = (char *) malloc ((len+1) * sizeof(char))) == NULL)
59 Die("Memory allocation failed, line %d of %s", __LINE__, __FILE__);
60
61 /* This used to be harder. Now align_s already has a field for
62 * secondary structure annotation, and we just have to copy it.
63 */
64 pos = 0;
65 for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
66 {
67 switch (curr->type)
68 {
69 case uBEGIN_ST:
70 case uBIFURC_ST:
71 break; /* neither should appear in an align_s! */
72
73 case uDEL_ST:
74 khseq[pos] = ' ';
75 aseq[pos] = '-';
76 break;
77
78 case uINSL_ST:
79 case uINSR_ST:
80 khseq[pos] = curr->ss;
81 aseq[pos] = sre_tolower(curr->sym);
82 break;
83
84 case uMATL_ST:
85 case uMATR_ST:
86 case uMATP_ST:
87 khseq[pos] = curr->ss;
88 aseq[pos] = sre_toupper(curr->sym);
89 break;
90
91 default:
92 Die("unrecognized state type %d in Align2kh()", curr->type);
93 }
94 pos++;
95 }
96
97 khseq[pos] = '\0';
98 aseq[pos] = '\0';
99
100 if (ret_khseq == NULL) free(khseq); else *ret_khseq = khseq;
101 if (ret_aseq == NULL) free(aseq); else *ret_aseq = aseq;
102 return 1;
103 }
104
105
106
107
108 /* Function: PrintAliLandscape()
109 *
110 * Purpose: Print an alignment of sequence to model in the form
111 * of a Konings/Hogeweg "mountain".
112 */
113 int
PrintAliLandscape(FILE * fp,struct cm_s * cm,struct align_s * ali)114 PrintAliLandscape(FILE *fp,
115 struct cm_s *cm,
116 struct align_s *ali)
117 {
118 int altitude; /* current height on the mountain */
119 struct align_s *curr; /* ptr to current ali element */
120 struct align_s *prev; /* ptr to previous ali element */
121 int i;
122
123 altitude = 0;
124 prev = NULL;
125 for (curr = ali->nxt; curr->nxt != NULL; curr = curr->nxt)
126 {
127 if (curr->pos >= 0)
128 fprintf(fp, "%4d %c ", curr->pos+1, curr->sym);
129 else
130 fprintf(fp, " %c ", curr->sym);
131
132 for (i = 0; i < altitude; i++)
133 fputc(' ', fp);
134
135 switch (curr->type)
136 {
137 case uBEGIN_ST:
138 case uBIFURC_ST: break;
139 case uDEL_ST: fputs(" DEL ", fp); break;
140 case uINSL_ST: fputs("` INSL", fp); break;
141 case uINSR_ST: fputs("\' INSR", fp); break;
142 case uMATL_ST: fputs("\\ MATL", fp); break;
143 case uMATR_ST: fputs("/ MATR", fp); break;
144
145 case uMATP_ST:
146 if (prev == NULL || curr->nodeidx > prev->nodeidx)
147 {
148 fputs(" v MATP", fp);
149 altitude++;
150 }
151 else
152 {
153 fputs("^ MATP", fp);
154 altitude--;
155 }
156 break;
157
158
159 default:
160 Die("unrecognized state type %d in PrintAliLandscape()", curr->type);
161 }
162
163 printf(" %d\n", curr->nodeidx);
164 prev = curr;
165 }
166
167 return 1;
168 }
169
170
171
172 /* Function: Trace2KHS()
173 *
174 * Purpose: From a traceback tree of seq, produce a
175 * secondary structure string. ">" and "<" are
176 * used for pairwise emissions; "." for single-stranded stuff.
177 * Note that structure is defined by pairwise emissions,
178 * not by Watson-Crick-isms and stacking rules.
179 *
180 * Args: tr - traceback structure
181 * seq - sequence, 0..rlen-1
182 * rlen - length of seq and returned ss string
183 * watsoncrick - TRUE to annotate canonical pairs only
184 * ret_ss - RETURN: alloc'ed secondary structure string
185 *
186 * Return: ret_ss contains a string 0..rlen-1 containing the
187 * secondary structure. Must be free'd by caller.
188 */
189 void
Trace2KHS(struct trace_s * tr,char * seq,int rlen,int watsoncrick,char ** ret_ss)190 Trace2KHS(struct trace_s *tr, char *seq, int rlen, int watsoncrick,
191 char **ret_ss)
192 {
193 struct tracestack_s *dolist;
194 struct trace_s *curr;
195 char *ss;
196
197 if ((ss = (char *) malloc (sizeof(char) * rlen+1)) == NULL)
198 Die("malloc failed");
199 memset(ss, '.', rlen);
200 ss[rlen] = '\0';
201
202 dolist = InitTracestack();
203 PushTracestack(dolist, tr->nxtl);
204
205 while ((curr = PopTracestack(dolist)) != NULL)
206 {
207 if ( curr->type == uMATP_ST )
208 {
209 if (! watsoncrick ||
210 IsRNAComplement(seq[curr->emitl], seq[curr->emitr], TRUE))
211 {
212 ss[curr->emitl] = '>';
213 ss[curr->emitr] = '<';
214 }
215 }
216
217 if (curr->nxtr) PushTracestack(dolist, curr->nxtr);
218 if (curr->nxtl) PushTracestack(dolist, curr->nxtl);
219 }
220 FreeTracestack(dolist);
221 *ret_ss = ss;
222 }
223
224 /* Function: IsRNAComplement()
225 *
226 * Purpose: Returns TRUE if sym1, sym2 are Watson-Crick complementary.
227 * If allow_gu is TRUE, GU pairs also return TRUE.
228 */
229 int
IsRNAComplement(char sym1,char sym2,int allow_gu)230 IsRNAComplement(char sym1, char sym2, int allow_gu)
231 {
232 sym1 = toupper(sym1);
233 sym2 = toupper(sym2);
234 if (sym1 == 'T') sym1 = 'U';
235 if (sym2 == 'T') sym2 = 'U';
236
237 if ((sym1 == 'A' && sym2 == 'U') ||
238 (sym1 == 'C' && sym2 == 'G') ||
239 (sym1 == 'G' && sym2 == 'C') ||
240 (sym1 == 'U' && sym2 == 'A') ||
241 (allow_gu && sym1 == 'G' && sym2 == 'U') ||
242 (allow_gu && sym1 == 'U' && sym2 == 'G'))
243 return TRUE;
244 else
245 return FALSE;
246 }
247
248
249 /* Function: KHS2ct()
250 *
251 * Purpose: Convert a secondary structure string to an array of integers
252 * representing what position each position is base-paired
253 * to (0..len-1), or -1 if none. This is off-by-one from a
254 * Zuker .ct file representation.
255 *
256 * The .ct representation can accomodate pseudoknots but the
257 * secondary structure string cannot easily; the string contains
258 * "Aa", "Bb", etc. pairs as a limited representation of
259 * pseudoknots. The string contains "><" for base pairs.
260 * Other symbols are ignored. If allow_pseudoknots is FALSE,
261 * the pseudoknot symbols will be ignored and these positions
262 * will be treated as single stranded.
263 *
264 * Return: ret_ct is allocated here and must be free'd by caller.
265 * Returns 1 on success, 0 if ss is somehow inconsistent.
266 */
267 int
KHS2ct(char * ss,int len,int allow_pseudoknots,int ** ret_ct)268 KHS2ct(char *ss, int len, int allow_pseudoknots, int **ret_ct)
269 {
270 struct intstack_s *dolist[27];
271 int *ct;
272 int i;
273 int pos, pair;
274 int status = 1; /* success or failure return status */
275
276 for (i = 0; i < 27; i++)
277 dolist[i] = InitIntStack();
278
279 if ((ct = (int *) malloc (len * sizeof(int))) == NULL)
280 Die("malloc failed");
281 for (pos = 0; pos < len; pos++)
282 ct[pos] = -1;
283
284 for (pos = 0; ss[pos] != '\0'; pos++)
285 {
286 if (ss[pos] > 127) status = 0; /* bulletproof against SGI buggy ctype.h */
287
288 else if (ss[pos] == '>') /* left side of a pair: push onto stack 0 */
289 PushIntStack(dolist[0], pos);
290 else if (ss[pos] == '<') /* right side of a pair; resolve pair */
291 {
292 if (! PopIntStack(dolist[0], &pair))
293 { status = 0; }
294 else
295 {
296 ct[pos] = pair;
297 ct[pair] = pos;
298 }
299 }
300 /* same stuff for pseudoknots */
301 else if (allow_pseudoknots && isupper((int) ss[pos]))
302 PushIntStack(dolist[ss[pos] - 'A' + 1], pos);
303 else if (allow_pseudoknots && islower((int) ss[pos]))
304 {
305 if (! PopIntStack(dolist[ss[pos] - 'a' + 1], &pair))
306 { status = 0; }
307 else
308 {
309 ct[pos] = pair;
310 ct[pair] = pos;
311 }
312 }
313 else if (allow_pseudoknots && !isgap(ss[pos])) status = 0; /* bad character */
314 }
315
316 for (i = 0; i < 27; i++)
317 if ( FreeIntStack(dolist[i]) > 0)
318 status = 0;
319
320 *ret_ct = ct;
321 return status;
322 }
323
324
325