1 /*
2  *  ViennaRNA/utils/structure_tree.c
3  *
4  *  Various functions for tree representation of secondary strucures
5  *
6  *  c  Ivo L Hofacker, Walter Fontana, Ronny Lorenz
7  *     ViennaRNA package
8  */
9 
10 #ifdef HAVE_CONFIG_H
11 #include "config.h"
12 #endif
13 
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 #include <ctype.h>
18 #include <limits.h>
19 
20 #include "ViennaRNA/utils/basic.h"
21 #include "ViennaRNA/datastructures/char_stream.h"
22 #include "ViennaRNA/utils/strings.h"
23 #include "ViennaRNA/utils/structures.h"
24 
25 /*
26  #################################
27  # PRIVATE FUNCTION DECLARATIONS #
28  #################################
29  */
30 
31 PRIVATE char *
32 annotate_enclosing_pairs(const char *structure);
33 
34 
35 PRIVATE char *
36 db2HIT(const char *structure);
37 
38 
39 PRIVATE char *
40 db2Shapiro(const char *structure,
41            int        with_stems,
42            int        with_weights,
43            int        with_external);
44 
45 
46 PRIVATE char *
47 db2ExpandedTree(const char *structure);
48 
49 
50 /*
51  #################################
52  # BEGIN OF FUNCTION DEFINITIONS #
53  #################################
54  */
55 PUBLIC char *
vrna_db_to_tree_string(const char * structure,unsigned int type)56 vrna_db_to_tree_string(const char   *structure,
57                        unsigned int type)
58 {
59   char *tree = NULL;
60 
61   if (structure) {
62     switch (type) {
63       case VRNA_STRUCTURE_TREE_HIT:
64         tree = db2HIT(structure);
65         break;
66 
67       case VRNA_STRUCTURE_TREE_SHAPIRO_SHORT:
68         tree = db2Shapiro(structure, 0, 0, 0);
69         break;
70 
71       case VRNA_STRUCTURE_TREE_SHAPIRO:
72         tree = db2Shapiro(structure, 1, 0, 0);
73         break;
74 
75       case VRNA_STRUCTURE_TREE_SHAPIRO_EXT:
76         tree = db2Shapiro(structure, 1, 0, 1);
77         break;
78 
79       case VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT:
80         tree = db2Shapiro(structure, 1, 1, 1);
81         break;
82 
83       case VRNA_STRUCTURE_TREE_EXPANDED:
84         tree = db2ExpandedTree(structure);
85         break;
86 
87       default:
88         break;
89     }
90   }
91 
92   return tree;
93 }
94 
95 
96 PUBLIC char *
vrna_tree_string_unweight(const char * structure)97 vrna_tree_string_unweight(const char *structure)
98 {
99   unsigned int  i, l;
100   char          *tree;
101 
102   tree = NULL;
103 
104   if (structure) {
105     tree = (char *)vrna_alloc(strlen(structure) + 1);
106 
107     i = l = 0;
108     while (structure[i]) {
109       if (!isdigit((int)structure[i]))
110         tree[l++] = structure[i];
111 
112       i++;
113     }
114     tree[l] = '\0';
115     /* resize to actual length */
116     tree = (char *)vrna_realloc(tree, sizeof(char) * (l + 1));
117   }
118 
119   return tree;
120 }
121 
122 
123 PUBLIC char *
vrna_tree_string_to_db(const char * structure)124 vrna_tree_string_to_db(const char *structure)
125 {
126   unsigned int  i, j, k, l, n, w, *match_paren;
127   char          id[10], *db;
128   const char    *temp;
129   int           o;
130   vrna_cstr_t   tmp_struct;
131 
132   db = NULL;
133 
134   if (structure) {
135     n           = strlen(structure);
136     tmp_struct  = vrna_cstr(4 * n, NULL);
137     match_paren = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
138 
139     i     = n - 1;
140     o     = 0;
141     k     = 9;
142     id[k] = '\0';
143 
144     while (1) {
145       switch (structure[i]) {
146         case '(':
147           if (o < 0) {
148             vrna_message_warning("vrna_tree_string_to_db(): "
149                                  "Unbalanced parenthesis detected in tree string!"
150                                  "Can't convert back to dot-bracket notation");
151             goto tree_string_to_db_exit;
152           }
153 
154           for (j = 0; j < match_paren[o]; j++)
155             vrna_cstr_printf(tmp_struct, "(");
156 
157           match_paren[o--] = 0;
158           break;
159 
160         case 'U':
161           w = 1;
162           sscanf(id + k, "%9u", &w);
163           for (j = 0; j < w; j++)
164             vrna_cstr_printf(tmp_struct, ".");
165 
166           k = 9;
167           break;
168 
169         case 'P':
170           w = 1;
171           sscanf(id + k, "%9u", &w);
172           for (j = 0; j < w; j++)
173             vrna_cstr_printf(tmp_struct, ")");
174 
175           match_paren[o]  = w;
176           k               = 9;
177           break;
178 
179         case 'R':
180           break;
181 
182         case ')':
183           o++;
184           break;
185 
186         default:
187           if (!isdigit((int)structure[i])) {
188             vrna_message_warning("vrna_tree_string_to_db(): "
189                                  "Unsupported character \"%c\" detected in tree string! "
190                                  "Can't convert back to dot-bracket notation",
191                                  structure[i]);
192             goto tree_string_to_db_exit;
193           }
194 
195           if (k == 0) {
196             vrna_message_warning("vrna_tree_string_unexpand(): "
197                                  "Node weight too large! "
198                                  "Can't convert back to dot-bracket notation");
199             goto tree_string_to_db_exit;
200           }
201 
202           id[--k] = structure[i];
203           break;
204       }
205 
206       if (i == 0)
207         break;
208 
209       i--;
210     }
211 
212     temp  = vrna_cstr_string(tmp_struct);
213     l     = strlen(temp);
214     db    = (char *)vrna_alloc(sizeof(char) * (l + 1));
215 
216     for (i = 0; i < l; i++)
217       db[i] = temp[l - i - 1];
218 
219     db[l] = '\0';
220 
221 tree_string_to_db_exit:
222 
223     vrna_cstr_free(tmp_struct);
224     free(match_paren);
225   }
226 
227   return db;
228 }
229 
230 
231 /*
232  #####################################
233  # BEGIN OF STATIC HELPER FUNCTIONS  #
234  #####################################
235  */
236 PRIVATE char *
annotate_enclosing_pairs(const char * structure)237 annotate_enclosing_pairs(const char *structure)
238 {
239   int   *match;
240   int   i, n, o, p;
241   char  *annot;
242 
243   annot = NULL;
244 
245   if (structure) {
246     n     = (int)strlen(structure);
247     annot = strdup(structure);
248     match = (int *)vrna_alloc(sizeof(int) * (n / 2 + 1));
249 
250     for (i = o = 0; i < n; i++)
251       switch (annot[i]) {
252         case '.':
253           break;
254 
255         case '(':
256           match[++o] = i;
257           break;
258 
259         case ')':
260           p = i;
261           /* seek for outer-most pair in current helix */
262           while ((annot[p + 1] == ')') && (match[o - 1] == match[o] - 1)) {
263             p++;
264             o--;
265           }
266           /* annotate enclosing pair */
267           annot[p]        = ']';
268           annot[match[o]] = '[';
269           i               = p;
270           o--;
271           break;
272 
273         default:
274           vrna_message_warning("annotate_enclosing_pairs: "
275                                "Dot-braket string contains junk character \"%c\"",
276                                annot[i]);
277           free(annot);
278           free(match);
279           return NULL;
280       }
281 
282     free(match);
283   }
284 
285   return annot;
286 }
287 
288 
289 PRIVATE char *
db2HIT(const char * structure)290 db2HIT(const char *structure)
291 {
292   unsigned int  i, p, u, n;
293   char          *HIT, *annot_struct;
294   vrna_cstr_t   tmp_struct;
295 
296   HIT           = NULL;
297   annot_struct  = annotate_enclosing_pairs(structure);
298   if (annot_struct) {
299     n           = strlen(structure);
300     tmp_struct  = vrna_cstr(4 * n, NULL);
301 
302     /* start of tree */
303     vrna_cstr_printf(tmp_struct, "(");
304 
305     for (i = p = u = 0; i < n; i++) {
306       switch (annot_struct[i]) {
307         case '.':
308           u++;
309           break;
310 
311         case '[':
312           if (u > 0) {
313             vrna_cstr_printf(tmp_struct, "(U%d)", u);
314             u = 0;
315           }
316 
317           vrna_cstr_printf(tmp_struct, "(");
318           break;
319 
320         case ')':
321           if (u > 0) {
322             vrna_cstr_printf(tmp_struct, "(U%d)", u);
323             u = 0;
324           }
325 
326           p++;
327           break;
328 
329         case ']':
330           if (u > 0) {
331             vrna_cstr_printf(tmp_struct, "(U%d)", u);
332             u = 0;
333           }
334 
335           vrna_cstr_printf(tmp_struct, "P%d)", p + 1);
336           p = 0;
337           break;
338       }
339     }
340 
341     if (u > 0)
342       vrna_cstr_printf(tmp_struct, "(U%d)", u);
343 
344     /* add root and end of tree */
345     vrna_cstr_printf(tmp_struct, "R)");
346 
347     HIT = strdup(vrna_cstr_string(tmp_struct));
348 
349     vrna_cstr_free(tmp_struct);
350     free(annot_struct);
351   }
352 
353   return HIT;
354 }
355 
356 
357 PRIVATE char *
db2Shapiro(const char * structure,int with_stems,int with_weights,int with_external)358 db2Shapiro(const char *structure,
359            int        with_stems,
360            int        with_weights,
361            int        with_external)
362 {
363   unsigned int  i, n, p, lp, loops, pairs, unpaired, *loop_size, *loop, *bulge,
364                 *loop_degree, *helix_size;
365   char          *tree, *annot_struct;
366   vrna_cstr_t   tmp_struct;
367 
368   tree          = NULL;
369   annot_struct  = annotate_enclosing_pairs(structure);
370 
371   if (annot_struct) {
372     n           = strlen(structure);
373     tmp_struct  = vrna_cstr(4 * n, NULL);
374 
375     loop_size   = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
376     helix_size  = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
377     loop        = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
378     bulge       = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
379     loop_degree = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (n / 2 + 1));
380 
381     p = lp = loops = pairs = unpaired = 0;
382 
383     for (i = 0; i < n; i++) {
384       switch (annot_struct[i]) {
385         case '.':
386           unpaired++;
387           loop_size[loop[lp]]++;
388           break;
389 
390         case '[':
391           vrna_cstr_printf(tmp_struct, "(");
392 
393           if (with_stems)
394             vrna_cstr_printf(tmp_struct, "(");
395 
396           if ((i > 0) && (annot_struct[i - 1] == '(' || annot_struct[i - 1] == '['))
397             bulge[lp] = 1;
398 
399           lp++;
400           loop_degree[++loops]  = 1;
401           loop[lp]              = loops;
402           bulge[lp]             = 0;
403           break;
404 
405         case ')':
406           if (annot_struct[i - 1] == ']')
407             bulge[lp] = 1;
408 
409           p++;
410           break;
411 
412         case ']':
413           if (annot_struct[i - 1] == ']')
414             bulge[lp] = 1;
415 
416           switch (loop_degree[loop[lp]]) {
417             case 1:
418               vrna_cstr_printf(tmp_struct, "H");    /* hairpin */
419               break;
420 
421             case 2:
422               if (bulge[lp] == 1)
423                 vrna_cstr_printf(tmp_struct, "B");  /* bulge */
424               else
425                 vrna_cstr_printf(tmp_struct, "I");  /* internal loop */
426 
427               break;
428 
429             default:
430               vrna_cstr_printf(tmp_struct, "M");    /* multiloop */
431           }
432 
433           helix_size[loop[lp]] = p + 1;
434 
435           if (with_weights)
436             vrna_cstr_printf(tmp_struct,
437                              "%d",
438                              loop_size[loop[lp]]);
439 
440           vrna_cstr_printf(tmp_struct, ")");
441 
442           if (with_stems) {
443             vrna_cstr_printf(tmp_struct, "S");
444             if (with_weights)
445               vrna_cstr_printf(tmp_struct,
446                                "%d",
447                                helix_size[loop[lp]]);
448 
449             vrna_cstr_printf(tmp_struct, ")");
450           }
451 
452           pairs += p + 1;
453           p     = 0;
454           loop_degree[loop[--lp]]++;
455           break;
456       }
457     }
458 
459     if ((with_external) && (loop_size[0])) {
460       if (with_weights)
461         tree = vrna_strdup_printf("((%sE%d)R)",
462                                   vrna_cstr_string(tmp_struct),
463                                   loop_size[0]);
464       else
465         tree = vrna_strdup_printf("((%sE)R)",
466                                   vrna_cstr_string(tmp_struct));
467     } else {
468       /* add root and end of tree */
469       tree = vrna_strdup_printf("(%sR)",
470                                 vrna_cstr_string(tmp_struct));
471     }
472 
473     vrna_cstr_free(tmp_struct);
474     free(loop_degree);
475     free(loop_size);
476     free(helix_size);
477     free(loop);
478     free(bulge);
479     free(annot_struct);
480   }
481 
482   return tree;
483 }
484 
485 
486 PRIVATE char *
db2ExpandedTree(const char * structure)487 db2ExpandedTree(const char *structure)
488 {
489   unsigned int  i, n;
490   char          *tree;
491   vrna_cstr_t   tmp_struct;
492 
493   n           = strlen(structure);
494   tmp_struct  = vrna_cstr(4 * n, NULL);
495 
496   for (i = 0; i < n; i++) {
497     if (structure[i] == '(')
498       vrna_cstr_printf(tmp_struct, "(");
499     else if (structure[i] == ')')
500       vrna_cstr_printf(tmp_struct, "P)");
501     else
502       vrna_cstr_printf(tmp_struct, "(U)");
503   }
504 
505   tree = vrna_strdup_printf("(%sR)",
506                             vrna_cstr_string(tmp_struct));
507 
508   vrna_cstr_free(tmp_struct);
509 
510   return tree;
511 }
512