1 #ifndef VIENNA_RNA_PACKAGE_STRUCT_UTILS_H
2 #define VIENNA_RNA_PACKAGE_STRUCT_UTILS_H
3 
/*
 * DEPRECATED(func, msg) -- declare `func`, optionally flagged as deprecated.
 *
 * Deprecation warnings are opt-in: unless VRNA_WARN_DEPRECATED is defined,
 * the macro expands to the bare declaration `func`.  When it is defined and
 * the compiler understands GNU attribute syntax, `msg` is attached as the
 * text of the deprecation warning.
 *
 * Clang interprets the first string argument of `deprecated` as the warning
 * message and the second one as a Fix-It replacement for the identifier.
 * `msg` is a message, so it must go into the first slot; passing it second
 * leaves the warning text empty and offers `msg` as a replacement name.
 * Clang also defines __GNUC__, hence one branch serves both compilers.
 */
#ifdef VRNA_WARN_DEPRECATED
# if defined(__clang__) || defined(__GNUC__)
#  define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg)))
# else
#  define DEPRECATED(func, msg) func
# endif
#else
# define DEPRECATED(func, msg) func
#endif
15 
16 /**
17  *  @file     ViennaRNA/utils/structures.h
18  *  @ingroup  struct_utils
19  *  @brief    Various utility- and helper-functions for secondary structure parsing, converting, etc.
20  */
21 
22 /**
23  *  @addtogroup   struct_utils
24  *  @{
25  *  @brief  Functions to create, parse, convert, manipulate, and compare secondary structure representations
26  */
27 
28 
/**
 *  @brief Shorthand type name for the helix list entry structure #vrna_hx_s
 *  @ingroup  struct_utils_helix_list
 */
typedef struct vrna_hx_s vrna_hx_t;
34 
35 
/**
 *  @brief Shorthand type name for the element probability structure #vrna_elem_prob_s
 *  @ingroup  struct_utils_plist
 */
typedef struct vrna_elem_prob_s vrna_ep_t;
41 
42 
43 /**
44  *  @addtogroup struct_utils_dot_bracket
45  *  @{
46  *  @brief  The Dot-Bracket notation as introduced already in the early times of the ViennaRNA Package
47  *          denotes base pairs by matching pairs of parenthesis `()` and unpaired nucleotides by dots `.`.
48  *
49  *  As a simple example, consider a helix of size 4 enclosing a hairpin of size 4. In dot-bracket
50  *  notation, this is annotated as
51  *
52  *  `((((....))))`
53  *
54  *  <b>Extended Dot-Bracket Notation</b>
55  *
56  *  A more generalized version of the original Dot-Bracket notation may use additional pairs
57  *  of brackets, such as <tt><></tt>, <tt>{}</tt>, and <tt>[]</tt>, and matching pairs of
 *  uppercase/lowercase letters. This allows for annotating pseudo-knots, since different
59  *  pairs of brackets are not required to be nested.
60  *
 *  The following annotations of a simple structure with two crossing helices of size 4 are equivalent:
62  *
63  *  `<<<<[[[[....>>>>]]]]`<br>
64  *  `((((AAAA....))))aaaa`<br>
65  *  `AAAA{{{{....aaaa}}}}`
66  */
67 
/**
 *  @brief  Bitflag selecting pair annotations written as matching uppercase/lowercase
 *          letters from the latin alphabet
 *
 *  @see  vrna_ptable_from_string()
 */
#define VRNA_BRACKETS_ALPHA     0x04U


/**
 *  @brief  Bitflag selecting pair annotations written with round brackets (parenthesis), <tt>()</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_RND       0x08U


/**
 *  @brief  Bitflag selecting pair annotations written with curly brackets, <tt>{}</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_CLY       0x10U


/**
 *  @brief  Bitflag selecting pair annotations written with angular brackets, <tt><></tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_ANG       0x20U


/**
 *  @brief  Bitflag selecting pair annotations written with square brackets, <tt>[]</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_SQR       0x40U


/**
 *  @brief  Default bitmask that selects every kind of bracket pair (no letters)
 *
 *  In WUSS format, this set of matching brackets/parenthesis is always nested,
 *  i.e. pseudo-knot free. In general, however, different kinds of brackets are
 *  mostly used to annotate pseudo-knots. Special care must therefore be taken
 *  to remove pseudo-knots whenever this bitmask is handed to functions that
 *  return secondary structures without pseudo-knots!
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to(), vrna_db_pk_remove(),
 *        vrna_pt_pk_remove()
 */
#define VRNA_BRACKETS_DEFAULT   \
  (VRNA_BRACKETS_RND | VRNA_BRACKETS_CLY | VRNA_BRACKETS_ANG | VRNA_BRACKETS_SQR)


/**
 *  @brief  Bitmask that selects every kind of bracket pair as well as
 *          uppercase/lowercase alphabet letters
 *
 *  @see  vrna_ptable_from_string(), vrna_db_pk_remove(), vrna_db_flatten(),
 *        vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_ANY       \
  (VRNA_BRACKETS_DEFAULT | VRNA_BRACKETS_ALPHA)
138 
139 
140 #include <stdio.h>
141 
142 #include <ViennaRNA/datastructures/basic.h>
143 
/**
 *  @brief Pack a secondary structure, 5:1 compression using base 3 encoding
 *
 *  Encodes the secondary structure as a binary string using a 5:1
 *  compression scheme. The result is NULL terminated, so standard string
 *  functions such as strcmp() can be applied to it. This is useful for
 *  programs that need to keep many structures in memory.
 *
 *  @see  vrna_db_unpack()
 *  @param struc    The secondary structure in dot-bracket notation
 *  @return         The binary encoded structure
 */
char *vrna_db_pack(const char *struc);
158 
159 
/**
 *  @brief Unpack a secondary structure previously packed with vrna_db_pack()
 *
 *  Turns a compressed binary string, as produced by vrna_db_pack(), back
 *  into the familiar dot-bracket notation.
 *
 *  @see  vrna_db_pack()
 *  @param packed   The binary encoded packed secondary structure
 *  @return         The unpacked secondary structure in dot-bracket notation
 */
char *vrna_db_unpack(const char *packed);
172 
173 
/**
 *  @brief Substitute pairs of brackets in a string with parenthesis
 *
 *  Use this function to turn brackets of unusual types, such as angular
 *  brackets @p <> , into dot-bracket format. The @p options parameter
 *  specifies which types of brackets will be replaced by round
 *  parenthesis @p () .
 *
 *  @see vrna_db_flatten_to(),
 *       #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *       #VRNA_BRACKETS_DEFAULT
 *
 *  @param  structure   The structure string where brackets are flattened in-place
 *  @param  options     A bitmask to specify which types of brackets should be flattened out
 */
void vrna_db_flatten(char *structure, unsigned int options);
192 
193 
/**
 *  @brief Substitute pairs of brackets in a string with another type of pair characters
 *
 *  Use this function to replace brackets in a structure annotation string,
 *  such as square brackets @p [] , by another type of pair characters,
 *  e.g. angular brackets @p <> .
 *
 *  The @p target array must hold the character for the 'pair open' annotation
 *  at position 0 and the one for 'pair close' at position 1. The @p options
 *  parameter specifies which types of brackets will be replaced by the new pairs.
 *
 *  @see vrna_db_flatten(),
 *       #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *       #VRNA_BRACKETS_DEFAULT
 *
 *  @param  string      The structure string where brackets are flattened in-place
 *  @param  target      The new pair characters the string will be flattened to
 *  @param  options     A bitmask to specify which types of brackets should be flattened out
 */
void vrna_db_flatten_to(char *string, const char target[3], unsigned int options);
217 
218 
/**
 *  @brief Convert a pair table into dot-parenthesis notation
 *
 *  @param pt The pair table to be converted
 *  @return   A char pointer to the dot-bracket string
 */
char *vrna_db_from_ptable(short *pt);
227 
228 
229 /**
230  *  @brief  Convert a list of base pairs into dot-bracket notation
231  *
232  *  @see vrna_plist()
233  *  @param  pairs   A #vrna_ep_t containing the pairs to be included in
234  *                  the dot-bracket string
235  *  @param  n       The length of the structure (number of nucleotides)
236  *  @return         The dot-bracket string containing the provided base pairs
237  */
238 char *
239 vrna_db_from_plist(vrna_ep_t    *pairs,
240                    unsigned int n);
241 
242 
/**
 *  @brief  Convert a secondary structure in dot-bracket notation to a nucleotide annotation of loop contexts
 *
 *  @param  structure   The secondary structure in dot-bracket notation
 *  @return             A string annotating each nucleotide according to its structural context
 */
char *vrna_db_to_element_string(const char *structure);
251 
252 
/**
 *  @brief  Remove pseudo-knots from an input structure
 *
 *  Pseudo-knots are removed from the input structure by determining the
 *  minimum number of base pairs that have to be dropped to make the
 *  structure pseudo-knot free.
 *
 *  This is accomplished with a dynamic programming algorithm similar to
 *  the Nussinov maximum matching approach.
 *
 *  The input structure must be given in a dot-bracket string like form
 *  where crossing base pairs are denoted by additional types of matching
 *  brackets, e.g. @p <>, @p {}, @p [].
 *  Crossing pairs may furthermore be annotated by matching
 *  uppercase/lowercase letters from the alphabet @p A-Z. For the latter,
 *  the uppercase letter must be the 5' and the lowercase letter
 *  the 3' nucleotide of the base pair. The types of brackets this
 *  function actually recognizes must be specified through the
 *  @p options parameter.
 *
 *  @note Brackets in the input structure string that are not covered
 *        by the @p options bitmask will be silently ignored!
 *
 *  @see vrna_pt_pk_remove(), vrna_db_flatten(),
 *       #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *       #VRNA_BRACKETS_ALPHA, #VRNA_BRACKETS_DEFAULT, #VRNA_BRACKETS_ANY
 *
 *  @param  structure   Input structure in dot-bracket format that may include pseudo-knots
 *  @param  options     A bitmask to specify which types of brackets should be processed
 *  @return             The input structure devoid of pseudo-knots in dot-bracket notation
 */
char *vrna_db_pk_remove(const char *structure, unsigned int options);
287 
288 /* End dot-bracket interface */
289 /**@}*/
290 
291 /**
292  *  @addtogroup struct_utils_pair_table
293  *  @{
294  */
295 
/**
 *  @brief Create a pair table from a dot-bracket notation of a secondary structure
 *
 *  The returned table is newly allocated. table[i]=j if (i.j) is a pair,
 *  table[i]=0 if i is unpaired, and table[0] holds the length of the structure.
 *
 *  @see  vrna_ptable_from_string(), vrna_db_from_ptable()
 *
 *  @param  structure The secondary structure in dot-bracket notation
 *  @return           A pointer to the created pair_table
 */
short *vrna_ptable(const char *structure);
309 
310 
/**
 *  @brief  Create a pair table for a secondary structure string
 *
 *  Takes an input string of a secondary structure annotation in
 *  @ref dot-bracket-notation or @ref dot-bracket-ext-notation, and converts
 *  it into a pair table representation.
 *
 *  @note   Crossing base pairs, i.e. pseudo-knots, are extracted as well
 *          if the bitmask @p options allows more than a single matching
 *          bracket type.
 *
 *  @see vrna_ptable(), vrna_db_from_ptable(), vrna_db_flatten_to(), vrna_pt_pk_remove(),
 *       #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *       #VRNA_BRACKETS_ALPHA, #VRNA_BRACKETS_DEFAULT, #VRNA_BRACKETS_ANY
 *
 *  @param  string    Secondary structure in @ref dot-bracket-ext-notation
 *  @param  options   A bitmask to specify which brackets are recognized during conversion to pair table
 *  @return           A pointer to a new pair table of the provided secondary structure
 */
short *vrna_ptable_from_string(const char *string, unsigned int options);
333 
334 
/**
 *  @brief Create a pair table of a secondary structure (pseudo-knot version)
 *
 *  The returned table is newly allocated. table[i]=j if (i.j) is a pair,
 *  table[i]=0 if i is unpaired, and table[0] holds the length of the structure.
 *
 *  Unlike vrna_ptable(), this function also recognizes the base pairs
 *  denoted by '[' and ']' brackets. It therefore behaves like
 *  @code{.c}
 *  vrna_ptable_from_string(structure, VRNA_BRACKETS_RND | VRNA_BRACKETS_SQR)
 *  @endcode
 *
 *  @see    vrna_ptable_from_string()
 *
 *  @param  structure The secondary structure in (extended) dot-bracket notation
 *  @return           A pointer to the created pair_table
 */
short *vrna_pt_pk_get(const char *structure);
354 
355 
/**
 *  @brief Get an exact copy of a pair table
 *
 *  @param pt The pair table that should be duplicated
 *  @return   A pointer to the copy of 'pt'
 */
short *vrna_ptable_copy(const short *pt);
364 
365 
/**
 *  @brief Create a pair table of a secondary structure (snoop align version)
 *
 *  @param  structure NOTE(review): presumably the structure annotation string to
 *                    convert, as for vrna_pt_snoop_get() - confirm against the implementation
 *  @return           NOTE(review): presumably a pointer to the created pair table - confirm
 */
short *vrna_pt_ali_get(const char *structure);
372 
373 
/**
 *  @brief Create a pair table of a secondary structure (snoop version)
 *
 *  The returned table is newly allocated. table[i]=j if (i.j) is a pair,
 *  table[i]=0 if i is unpaired, and table[0] holds the length of the structure.
 *  The special pseudoknotted H/ACA-mRNA structure is taken into account.
 */
short *vrna_pt_snoop_get(const char *structure);
383 
384 
/**
 *  @brief  Remove pseudo-knots from a pair table
 *
 *  Pseudo-knots are removed from the input structure by determining the
 *  minimum number of base pairs that have to be dropped to make the
 *  structure pseudo-knot free.
 *
 *  This is accomplished with a dynamic programming algorithm similar to
 *  the Nussinov maximum matching approach.
 *
 *  @see    vrna_db_pk_remove()
 *
 *  @param  ptable  Input structure that may include pseudo-knots
 *  @param  options NOTE(review): not described in this header; its meaning
 *                  has to be confirmed against the implementation
 *  @return         The input structure devoid of pseudo-knots
 */
short *vrna_pt_pk_remove(const short *ptable, unsigned int options);
404 
405 
406 /* End pair table interface */
407 /**@}*/
408 
409 
410 /**
411  *  @addtogroup struct_utils_plist
412  *  @{
413  */
414 
/**
 *  @brief  Element type: a Base Pair
 */
#define VRNA_PLIST_TYPE_BASEPAIR  0


/**
 *  @brief  Element type: a G-Quadruplex
 */
#define VRNA_PLIST_TYPE_GQUAD     1


/**
 *  @brief  Element type: a Hairpin loop motif
 */
#define VRNA_PLIST_TYPE_H_MOTIF   2


/**
 *  @brief  Element type: an Internal loop motif
 */
#define VRNA_PLIST_TYPE_I_MOTIF   3


/**
 *  @brief  Element type: an Unstructured Domain motif
 */
#define VRNA_PLIST_TYPE_UD_MOTIF  4


/**
 *  @brief  Element type: a Base Pair stack
 */
#define VRNA_PLIST_TYPE_STACK     5
449 
450 
/**
 *  @brief  One entry of an element probability list
 *          (e.g. a list of pair probabilities)
 *
 *  @see vrna_plist(), vrna_plist_from_probs(), vrna_db_from_plist(),
 *  #VRNA_PLIST_TYPE_BASEPAIR, #VRNA_PLIST_TYPE_GQUAD, #VRNA_PLIST_TYPE_H_MOTIF, #VRNA_PLIST_TYPE_I_MOTIF,
 *  #VRNA_PLIST_TYPE_UD_MOTIF, #VRNA_PLIST_TYPE_STACK
 */
struct vrna_elem_prob_s {
  /** @brief  Start position (usually 5' nucleotide that starts the element, e.g. base pair) */
  int   i;
  /** @brief  End position (usually 3' nucleotide that ends the element, e.g. base pair) */
  int   j;
  /** @brief  Probability of the element */
  float p;
  /** @brief  Type of the element */
  int   type;
};
465 
466 /**
467  *  @brief Create a #vrna_ep_t from a dot-bracket string
468  *
469  *  The dot-bracket string is parsed and for each base pair an
470  *  entry in the plist is created. The probability of each pair in
471  *  the list is set by a function parameter.
472  *
473  *  The end of the plist is marked by sequence positions i as well as j
474  *  equal to 0. This condition should be used to stop looping over its
475  *  entries
476  *
477  *  @param struc  The secondary structure in dot-bracket notation
478  *  @param pr     The probability for each base pair used in the plist
479  *  @return       The plist array
480  */
481 vrna_ep_t *vrna_plist(const char  *struc,
482                       float       pr);
483 
484 
485 /**
486  *  @brief Create a #vrna_ep_t from base pair probability matrix
487  *
488  *  The probability matrix provided via the #vrna_fold_compound_t is parsed
489  *  and all pair probabilities above the given threshold are used to create
490  *  an entry in the plist
491  *
492  *  The end of the plist is marked by sequence positions i as well as j
493  *  equal to 0. This condition should be used to stop looping over its
494  *  entries
495  *
496  *  @ingroup              part_func_global
497  *  @param[in]  vc        The fold compound
498  *  @param[in]  cut_off   The cutoff value
499  *  @return               A pointer to the plist that is to be created
500  */
501 vrna_ep_t *vrna_plist_from_probs(vrna_fold_compound_t *vc,
502                                  double               cut_off);
503 
504 
505 /* End pair list interface */
506 /**@}*/
507 
508 
509 /**
510  *  @addtogroup struct_utils_wuss
511  *  @{
512  *  @brief  The WUSS notation, as frequently used for consensus secondary structures in @ref msa-formats-stockholm.
513  *
514  *  This notation allows for a fine-grained annotation of base pairs and unpaired nucleotides, including pseudo-knots.
515  *  Below, you'll find a list of secondary structure elements and their corresponding WUSS annotation
516  *  (See also the infernal user guide at http://eddylab.org/infernal/Userguide.pdf)
517  *  @parblock
518  *  - <b>Base pairs</b><br>
519  *    Nested base pairs are annotated by matching pairs of the symbols `<>`,
520  *    `()`, `{}`, and `[]`. Each of the matching pairs
521  *    of parenthesis have their special meaning, however, when used as input in our programs,
522  *    e.g. structure constraint, these details are usually ignored. Furthermore, base pairs
 *    that constitute a pseudo-knot are denoted by letters from the latin alphabet and are,
524  *    if not denoted otherwise, ignored entirely in our programs.
525  *
526  *  - <b>Hairpin loops</b><br>
527  *    Unpaired nucleotides that constitute the hairpin loop are indicated by underscores, `_`.
528  *
529  *    Example: `<<<<<_____>>>>>`
530  *
531  *  - <b>Bulges and interior loops</b><br>
532  *    Residues that constitute a bulge or interior loop are denoted by dashes, `-`.
533  *
534  *    Example: `(((--<<_____>>-)))`
535  *
536  *  - <b>Multibranch loops</b><br>
537  *    Unpaired nucleotides in multibranch loops are indicated by commas `,`.
538  *
539  *    Example: `(((,,<<_____>>,<<____>>)))`
540  *
541  *  - <b>External residues</b><br>
542  *    Single stranded nucleotides in the exterior loop, i.e. not enclosed by any other pair are
543  *    denoted by colons, `:`.
544  *
545  *    Example: `<<<____>>>:::`
546  *
547  *  - <b>Insertions</b><br>
548  *    In cases where an alignment represents the consensus with a known structure, insertions relative
549  *    to the known structure are denoted by periods, `.`. Regions where local structural
550  *    alignment was invoked, leaving regions of both target and query sequence unaligned, are indicated
551  *    by tildes, `~`.
552  *    @note These symbols only appear in alignments of a known (query) structure annotation to a target
553  *    sequence of unknown structure.
554  *
555  *  - <b>Pseudo-knots</b><br>
556  *    The WUSS notation allows for annotation of pseudo-knots using pairs of upper-case/lower-case letters.
557  *    @note Our programs and library functions usually ignore pseudo-knots entirely treating them as
558  *    unpaired nucleotides, if not stated otherwise.
559  *
560  *    Example:  `<<<_AAA___>>>aaa`
561  *  @endparblock
562  */
563 
/**
 *  @brief  Convert a WUSS annotation string to dot-bracket format
 *
 *  @note All brackets are flattened, and pseudo-knots annotated by matching
 *        pairs of upper/lowercase letters are treated as unpaired nucleotides
 *
 *  @param  wuss  The input string in WUSS notation
 *  @return       A dot-bracket notation of the input secondary structure
 */
char *vrna_db_from_WUSS(const char *wuss);
575 
576 
577 /* End WUSS notation interface */
578 /**@}*/
579 
580 
581 /**
582  *  @addtogroup struct_utils_abstract_shapes
583  *  @{
584  *  @brief  Abstract Shapes, introduced by Giegerich et al. in (2004) @cite giegerich:2004,
585  *          collapse the secondary structure while retaining the nestedness of helices and
586  *          hairpin loops.
587  *
588  *  The abstract shapes representation abstracts the structure from individual base pairs
589  *  and their corresponding location in the sequence, while retaining the inherent nestedness
590  *  of helices and hairpin loops.
591  *
592  *  Below is a description of what is included in the abstract shapes abstraction for each
593  *  respective level together with an example structure:
594  *
595  *      CGUCUUAAACUCAUCACCGUGUGGAGCUGCGACCCUUCCCUAGAUUCGAAGACGAG
596  *      ((((((...(((..(((...))))))...(((..((.....))..)))))))))..
597  *
598  *  ______
599  *
600  *  Shape Level | Description                     |   Result
601  *  ----------- | ------------------------------- | --------
602  *  1           | Most accurate - all loops and all unpaired | `[_[_[]]_[_[]_]]_`
603  *  2           | Nesting pattern for all loop types and unpaired regions in external loop and multiloop | `[[_[]][_[]_]]`
604  *  3           | Nesting pattern for all loop types but no unpaired regions | `[[[]][[]]]`
605  *  4           | Helix nesting pattern in external loop and multiloop | `[[][[]]]`
606  *  5           | Most abstract - helix nesting pattern and no unpaired regions | `[[][]]`
607  *
608  *  @note   Our implementations also provide the special Shape Level 0, which does not
609  *          collapse any structural features but simply convert base pairs and unpaired
610  *          nucleotides into their corresponding set of symbols for abstract shapes.
611  */
612 
/**
 *  @brief  Convert a secondary structure in dot-bracket notation to its abstract shapes representation
 *
 *  The secondary structure is converted into its abstract shapes representation
 *  as presented by Giegerich et al. 2004 @cite giegerich:2004.
 *
 *  @see vrna_abstract_shapes_pt()
 *
 *  @param  structure A secondary structure in dot-bracket notation
 *  @param  level     The abstraction level (integer in the range of 0 to 5)
 *  @return           The secondary structure in abstract shapes notation
 */
char *vrna_abstract_shapes(const char *structure, unsigned int level);
628 
629 
/**
 *  @brief  Convert a secondary structure to its abstract shapes representation
 *
 *  The secondary structure is converted into its abstract shapes representation
 *  as presented by Giegerich et al. 2004 @cite giegerich:2004. This function is
 *  equivalent to vrna_abstract_shapes(), but requires a pair table input instead
 *  of a dot-bracket structure.
 *
 *  @note   The length of the structure must be present at @p pt[0]!
 *
 *  @see vrna_abstract_shapes()
 *
 *  @param  pt      A secondary structure in pair table format
 *  @param  level   The abstraction level (integer in the range of 0 to 5)
 *  @return         The secondary structure in abstract shapes notation
 */
char *vrna_abstract_shapes_pt(const short *pt, unsigned int level);
648 
649 
650 /* End abstract shapes interface */
651 /**@}*/
652 
653 
654 /**
655  *  @addtogroup struct_utils_helix_list
656  *  @{
657  */
658 
/**
 *  @brief  One entry of a helix list
 *
 *  NOTE(review): the members carry no documentation in this header; the
 *  remarks below are inferred from the member names only and need to be
 *  confirmed against vrna_hx_from_ptable().
 */
struct vrna_hx_s {
  unsigned int  start;    /* presumably the first position of the helix - confirm */
  unsigned int  end;      /* presumably the last position of the helix - confirm */
  unsigned int  length;   /* presumably the helix length - confirm */
  unsigned int  up5;      /* presumably a count of unpaired positions on the 5' side - confirm */
  unsigned int  up3;      /* presumably a count of unpaired positions on the 3' side - confirm */
};
669 
670 
671 /**
672  *  @brief  Convert a pair table representation of a secondary structure into a helix list
673  *
674  *  @param  pt  The secondary structure in pair table representation
675  *  @return     The secondary structure represented as a helix list
676  */
677 vrna_hx_t *
678 vrna_hx_from_ptable(short *pt);
679 
680 
681 /**
682  *  @brief  Create a merged helix list from another helix list
683  */
684 vrna_hx_t *
685 vrna_hx_merge(const vrna_hx_t *list,
686               int             maxdist);
687 
688 
689 /* End helix list interface */
690 /**@}*/
691 
692 
/**
 *  @brief Get a loop index representation of a structure
 *
 *  @param  pt  The structure as a pair table (as the function name suggests)
 *  @return     NOTE(review): presumably the loop index array for @p pt - confirm
 */
int *vrna_loopidx_from_ptable(const short *pt);
698 
699 
/**
 *  @brief Compute the "base pair" distance between two secondary structures s1 and s2.
 *
 *  The sequences should have the same length.
 *  The distance is the number of base pairs present in one structure but
 *  not in the other, which is the same as the edit distance with
 *  open-pair/close-pair as move-set.
 *
 *  @param str1   First structure in dot-bracket notation
 *  @param str2   Second structure in dot-bracket notation
 *  @return       The base pair distance between str1 and str2
 */
int vrna_bp_distance(const char *str1, const char *str2);
714 
715 
/*
 * NOTE(review): this function is not documented in this header. Judging
 * by its name and signature alone, it presumably computes a "mountain"
 * distance between the two structure strings str1 and str2; the role of
 * p cannot be determined from here. Confirm against the implementation
 * before relying on either.
 */
double vrna_dist_mountain(const char *str1, const char *str2, unsigned int p);
720 
721 
/**
 *  @brief Make a reference base pair count matrix
 *
 *  Yields an upper triangular matrix that holds, for each interval [i,j]
 *  with i<j, the number of basepairs of a reference structure.
 *  Access it via iindx!!!
 */
unsigned int *vrna_refBPcnt_matrix(const short *reference_pt, unsigned int turn);
731 
732 
/**
 *  @brief Make a reference base pair distance matrix
 *
 *  Yields an upper triangular matrix that holds, for each interval [i,j]
 *  with i<j, the base pair distance of two reference structures.
 *  Access it via iindx!!!
 */
unsigned int *vrna_refBPdist_matrix(const short *pt1, const short *pt2, unsigned int turn);
744 
745 
746 /**
747  *  @brief Create a dot-bracket like structure string from base pair probability matrix
748  */
749 char *
750 vrna_db_from_probs(const FLT_OR_DBL *pr,
751                    unsigned int     length);
752 
753 
/**
 *  @brief Get a pseudo dot bracket notation for a given probability information
 *
 *  @param  x NOTE(review): the probability information; the expected
 *            layout behind this pointer is not visible here - confirm
 *  @return   A single character of the pseudo dot-bracket notation
 */
char vrna_bpp_symbol(const float *x);
759 
760 
761 /**
762  *  @brief Create a dot-backet/parenthesis structure from backtracking stack
763  *
764  *  This function is capable to create dot-bracket structures from suboptimal
765  *  structure prediction sensu M. Zuker
766  *
767  *  @param bp     Base pair stack containing the traced base pairs
768  *  @param length The length of the structure
769  *  @return       The secondary structure in dot-bracket notation as
770  *                provided in the input
771  */
772 char *
773 vrna_db_from_bp_stack(vrna_bp_stack_t *bp,
774                       unsigned int    length);
775 
776 
777 void
778 vrna_letter_structure(char            *structure,
779                       vrna_bp_stack_t *bp,
780                       unsigned int    length);
781 
782 
783 /**
784  *  @addtogroup struct_utils_tree
785  *  @{
786  *  @brief Secondary structures can be readily represented as trees, where internal
787  *  nodes represent base pairs, and leaves represent unpaired nucleotides.
788  *  The dot-bracket structure string already is a tree represented by a string
789  *  of parenthesis (base pairs) and dots for the leaf nodes (unpaired nucleotides).
790  *
791  *  Alternatively, one may find representations with two types of node labels,
792  *  `P` for paired and `U` for unpaired; a dot is then replaced by `(U)`, and
793  *  each closed bracket is assigned an additional identifier `P`.
794  *  We call this the expanded notation. In @cite fontana:1993b a condensed
795  *  representation of the secondary structure is proposed, the so-called
796  *  homeomorphically irreducible tree (HIT) representation. Here a stack is
797  *  represented as a single pair of matching brackets labeled `P` and
798  *  weighted by the number of base pairs.  Correspondingly, a contiguous
799  *  strain of unpaired bases is shown as one pair of matching brackets
800  *  labeled `U` and weighted by its length.  Generally any string consisting
801  *  of matching brackets and identifiers is equivalent to a plane tree with
802  *  as many different types of nodes as there are identifiers.
803  *
804  *  Bruce Shapiro proposed a coarse grained representation @cite shapiro:1988,
 *  which does not retain the full information of the secondary structure. He
806  *  represents the different structure elements by single matching brackets
807  *  and labels them as
808  *
809  *  - `H`  (hairpin loop),
810  *  - `I`  (interior loop),
811  *  - `B`  (bulge),
812  *  - `M`  (multi-loop), and
813  *  - `S`  (stack).
814  *
815  *  We extend his alphabet by an extra letter for external elements `E`.
816  *  Again these identifiers may be followed by a weight corresponding to the
817  *  number of unpaired bases or base pairs in the structure element.  All tree
818  *  representations (except for the dot-bracket form) can be encapsulated into
819  *  a virtual root (labeled `R`).
820  *
821  *  The following example illustrates the different linear tree representations
822  *  used by the package:
823  *
824  *  Consider the secondary structure represented by the dot-bracket string (full tree)
825  *  `.((..(((...)))..((..)))).` which is the most convenient
826  *  condensed notation used by our programs and library functions.
827  *
828  *  Then, the following tree representations are equivalent:
829  *
830  *  - Expanded tree:<br>
831  *    `((U)(((U)(U)((((U)(U)(U)P)P)P)(U)(U)(((U)(U)P)P)P)P)(U)R)`
832  *  - HIT representation (Fontana et al. 1993 @cite fontana:1993b):<br>
833  *    `((U1)((U2)((U3)P3)(U2)((U2)P2)P2)(U1)R)`
834  *  - Coarse Grained Tree Representation (Shapiro 1988 @cite shapiro:1988):
835  *    + Short (with root node `R`, without stem nodes `S`):<br>
836  *      `((H)((H)M)R)`
837  *    + Full (with root node `R`):<br>
838  *      `(((((H)S)((H)S)M)S)R)`
839  *    + Extended (with root node `R`, with external nodes `E`):<br>
840  *      `((((((H)S)((H)S)M)S)E)R)`
841  *    + Weighted (with root node `R`, with external nodes `E`):<br>
842  *      `((((((H3)S3)((H2)S2)M4)S2)E2)R)`
843  *
844  *  The Expanded tree is rather clumsy and mostly included for the sake of
845  *  completeness. The different versions of Coarse Grained Tree Representations
846  *  are variations of Shapiro's linear tree notation.
847  *
848  *  For the output of aligned structures from string editing, different
849  *  representations are needed, where we put the label on both sides.
850  *  The above examples for tree representations would then look like:
851  *
852  *  @verbatim
853  *  a) (UU)(P(P(P(P(UU)(UU)(P(P(P(UU)(UU)(UU)P)P)P)(UU)(UU)(P(P(UU)(U...
854  *  b) (UU)(P2(P2(U2U2)(P2(U3U3)P3)(U2U2)(P2(U2U2)P2)P2)(UU)P2)(UU)
855  *  c) (B(M(HH)(HH)M)B)
856  *     (S(B(S(M(S(HH)S)(S(HH)S)M)S)B)S)
857  *     (E(S(B(S(M(S(HH)S)(S(HH)S)M)S)B)S)E)
858  *  d) (R(E2(S2(B1(S2(M4(S3(H3)S3)((H2)S2)M4)S2)B1)S2)E2)R)
859  *  @endverbatim
860  *
861  *  Aligned structures additionally contain the gap character `_`.
862  */
863 
864 /**
865  *  @brief  Homeomorphically Irreducible Tree (HIT) representation of a secondary structure
866  *  @see    vrna_db_to_tree_string(), vrna_tree_string_unweight(), vrna_tree_string_to_db(), @ref sec_structure_representations_tree
867  */
868 #define   VRNA_STRUCTURE_TREE_HIT             1U
869 
870 
871 /**
872  *  @brief  (short) Coarse Grained representation of a secondary structure
873  *  @see    vrna_db_to_tree_string(), @ref sec_structure_representations_tree
874  */
875 #define   VRNA_STRUCTURE_TREE_SHAPIRO_SHORT   2U
876 
877 
878 /**
879  *  @brief  (full)  Coarse Grained representation of a secondary structure
880  *  @see    vrna_db_to_tree_string(), @ref sec_structure_representations_tree
881  */
882 #define   VRNA_STRUCTURE_TREE_SHAPIRO         3U
883 
884 
885 /**
886  *  @brief  (extended) Coarse Grained representation of a secondary structure
887  *  @see    vrna_db_to_tree_string(), @ref sec_structure_representations_tree
888  */
889 #define   VRNA_STRUCTURE_TREE_SHAPIRO_EXT     4U
890 
891 
892 /**
893  *  @brief  (weighted) Coarse Grained representation of a secondary structure
894  *  @see    vrna_db_to_tree_string(), vrna_tree_string_unweight(), @ref sec_structure_representations_tree
895  */
896 #define   VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT  5U
897 
898 /**
899  *  @brief  Expanded Tree representation of a secondary structure
900  *  @see    vrna_db_to_tree_string(), vrna_tree_string_to_db(), @ref sec_structure_representations_tree
901  */
902 #define   VRNA_STRUCTURE_TREE_EXPANDED        6U
903 
904 
905 /**
906  *  @brief  Convert a Dot-Bracket structure string into a tree string representation
907  *
908  *  This function allows one to convert a secondary structure in dot-bracket notation
909  *  into one of the various tree representations for secondary structures. The resulting
910  *  tree is then represented as a string of parentheses and node symbols, similar
911  *  to the Newick format.
912  *
913  *  Currently we support conversion into the following formats, denoted by the value
914  *  of parameter @p type:
915  *  * #VRNA_STRUCTURE_TREE_HIT            - @copybrief #VRNA_STRUCTURE_TREE_HIT
916  *                                          (See also Fontana et al. 1993 @cite fontana:1993b)
917  *  * #VRNA_STRUCTURE_TREE_SHAPIRO_SHORT  - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_SHORT
918  *                                          (same as Shapiro 1988 @cite shapiro:1988, but with root node @p R and without @p S nodes for the stems)
919  *  * #VRNA_STRUCTURE_TREE_SHAPIRO        - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO
920  *                                          (See also Shapiro 1988 @cite shapiro:1988)
921  *  * #VRNA_STRUCTURE_TREE_SHAPIRO_EXT    - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_EXT
922  *                                          (same as Shapiro 1988 @cite shapiro:1988, but external nodes denoted as @p E )
923  *  * #VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT
924  *                                          (same as #VRNA_STRUCTURE_TREE_SHAPIRO_EXT but with additional weights
925  *                                          for number of unpaired nucleotides in loop, and number of pairs in stems)
926  *  * #VRNA_STRUCTURE_TREE_EXPANDED       - @copybrief #VRNA_STRUCTURE_TREE_EXPANDED
927  *
928  *  @see  @ref sec_structure_representations_tree
929  *
930  *  @param  structure   The null-terminated dot-bracket structure string
931  *  @param  type        A switch to determine the type of tree string representation
932  *  @return             A tree representation of the input @p structure
933  */
934 char *
935 vrna_db_to_tree_string(const char   *structure,
936                        unsigned int type);
937 
938 
939 /**
940  *  @brief  Remove weights from a linear string tree representation of a secondary structure
941  *
942  *  This function strips the weights of a linear string tree representation such as @p HIT
943  *  or Coarse Grained Tree sensu Shapiro @cite shapiro:1988.
944  *
945  *  @see vrna_db_to_tree_string()
946  *
947  *  @param  structure   A linear string tree representation of a secondary structure with weights
948  *  @return             A linear string tree representation of a secondary structure without weights
949  */
950 char *
951 vrna_tree_string_unweight(const char *structure);
952 
953 
954 /**
955  *  @brief  Convert a linear tree string representation of a secondary structure back to Dot-Bracket notation
956  *
957  *  @warning  This function only accepts <em>Expanded</em> and <em>HIT</em> tree representations!
958  *
959  *  @see vrna_db_to_tree_string(), #VRNA_STRUCTURE_TREE_EXPANDED, #VRNA_STRUCTURE_TREE_HIT,
960  *       @ref sec_structure_representations_tree
961  *
962  *  @param  tree  A linear tree string representation (<em>Expanded</em> or <em>HIT</em>) of a secondary structure
963  *  @return       A dot-bracket notation of the secondary structure provided in @p tree
964  */
965 char *
966 vrna_tree_string_to_db(const char *tree);
967 
968 
969 /* End tree representations */
970 /**@}*/
971 
972 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
973 
974 /*###########################################*/
975 /*# deprecated functions below              #*/
976 /*###########################################*/
977 
978 /**
979  *  @brief Create a #vrna_ep_t from a dot-bracket string
980  *
981  *  The dot-bracket string is parsed and for each base pair an
982  *  entry in the plist is created. The probability of each pair in
983  *  the list is set by the function parameter @p pr.
984  *
985  *  The end of the plist is marked by sequence positions i as well as j
986  *  equal to 0. This condition should be used to stop looping over its
987  *  entries.
988  *
989  *  @deprecated   Use vrna_plist() instead
990  *
991  *  @ingroup part_func_global_deprecated
992  *
993  *  @param pl     A pointer to the #vrna_ep_t that is to be created
994  *  @param struc  The secondary structure in dot-bracket notation
995  *  @param pr     The probability for each base pair
996  */
997 DEPRECATED(void assign_plist_from_db(vrna_ep_t  **pl,
998                                      const char *struc,
999                                      float      pr),
1000            "Use vrna_plist() instead");
1001 
1002 /**
1003  *  @brief Pack secondary structure, 5:1 compression using base 3 encoding
1004  *
1005  *  Returns a binary string encoding of the secondary structure using
1006  *  a 5:1 compression scheme. The string is NULL terminated and can
1007  *  therefore be used with standard string functions such as strcmp().
1008  *  Useful for programs that need to keep many structures in memory.
1009  *
1010  *  @deprecated     Use vrna_db_pack() as a replacement
1011  *  @ingroup        struct_utils_deprecated
1012  *  @param struc    The secondary structure in dot-bracket notation
1013  *  @return         The binary encoded structure
1014  */
1015 DEPRECATED(char *pack_structure(const char *struc),
1016            "Use vrna_db_pack() instead");
1017 
1018 /**
1019  *  @brief Unpack secondary structure previously packed with pack_structure()
1020  *
1021  *  Translate a compressed binary string produced by pack_structure() back into
1022  *  the familiar dot-bracket notation.
1023  *
1024  *  @deprecated     Use vrna_db_unpack() as a replacement
1025  *  @ingroup        struct_utils_deprecated
1026  *  @param packed   The binary encoded packed secondary structure, as produced by pack_structure()
1027  *  @return         The unpacked secondary structure in dot-bracket notation
1028  */
1029 DEPRECATED(char *unpack_structure(const char *packed),
1030            "Use vrna_db_unpack() instead");
1031 
1032 /**
1033  *  @brief Create a pair table of a secondary structure
1034  *
1035  *  Returns a newly allocated table, such that table[i]=j if (i.j) pair
1036  *  or 0 if i is unpaired. table[0] contains the length of the structure.
1037  *
1038  *  @deprecated Use vrna_ptable() instead
1039  *  @ingroup        struct_utils_deprecated
1040  *
1041  *  @param  structure The secondary structure in dot-bracket notation
1042  *  @return           A pointer to the created pair table
1043  */
1044 DEPRECATED(short *make_pair_table(const char *structure),
1045            "Use vrna_ptable() instead");
1046 
/**
 *  @brief Legacy pair table constructor (presumably the pseudoknot-aware
 *         counterpart of make_pair_table(), as the `_pk` suffix suggests; TODO confirm)
 *
 *  @deprecated Use vrna_ptable_from_string() instead
 *  @ingroup        struct_utils_deprecated
 */
1047 DEPRECATED(short *make_pair_table_pk(const char *structure),
1048            "Use vrna_ptable_from_string() instead");
1049 
1050 /**
1051  *  @brief Get an exact copy of a pair table
1052  *
1053  *  @deprecated Use vrna_ptable_copy() instead
1054  *  @ingroup        struct_utils_deprecated
1055  *
1056  *  @param pt The pair table to be copied
1057  *  @return   A pointer to the copy of @p pt
1058  */
1059 DEPRECATED(short *copy_pair_table(const short *pt),
1060            "Use vrna_ptable_copy() instead");
1061 
1062 /**
1063  *  @brief Pair table for snoop align
1064  *
1065  *  @deprecated Use vrna_pt_ali_get() instead!
1066  *  @ingroup        struct_utils_deprecated
1067  */
1068 DEPRECATED(short *alimake_pair_table(const char *structure),
1069            "Use vrna_pt_ali_get() instead");
1070 
1071 /**
1072  *  Returns a newly allocated table, such that: table[i]=j if (i.j) pair or
1073  *  0 if i is unpaired. table[0] contains the length of the structure.
1074  *  The special pseudoknotted H/ACA-mRNA structure is taken into account.
1075  *  @deprecated Use vrna_pt_snoop_get() instead!
1076  *  @ingroup        struct_utils_deprecated
1077  */
1078 DEPRECATED(short *make_pair_table_snoop(const char *structure),
1079            "Use vrna_pt_snoop_get() instead");
1080 
/**
 *  @deprecated Use vrna_loopidx_from_ptable() instead
 *  @ingroup        struct_utils_deprecated
 *
 *  @param  pt  Presumably a pair table as created by make_pair_table() (TODO confirm)
 */
1081 DEPRECATED(int *make_loop_index_pt(short *pt),
1082            "Use vrna_loopidx_from_ptable() instead");
1083 
1084 /**
1085  *  @brief Compute the "base pair" distance between two secondary structures s1 and s2.
1086  *
1087  *  The sequences should have the same length.
1088  *  dist = number of base pairs in one structure but not in the other,
1089  *  same as edit distance with open-pair close-pair as move-set.
1090  *
1091  *  @deprecated   Use vrna_bp_distance() instead
1092  *  @ingroup        struct_utils_deprecated
1093  *  @param str1   First structure in dot-bracket notation
1094  *  @param str2   Second structure in dot-bracket notation
1095  *  @return       The base pair distance between @p str1 and @p str2
1096  */
1097 DEPRECATED(int bp_distance(const char *str1,
1098                            const char *str2),
1099            "Use vrna_bp_distance() instead");
1100 
1101 /**
1102  *  @brief Make a reference base pair count matrix
1103  *
1104  *  Get an upper triangular matrix containing the number of base pairs of a reference
1105  *  structure (@p reference_pt) for each interval [i,j] with i<j. Access it via iindx!!!
1106  *
1107  *  @deprecated Use vrna_refBPcnt_matrix() instead
1108  *  @ingroup        struct_utils_deprecated
1109  */
1110 DEPRECATED(unsigned int *make_referenceBP_array(short         *reference_pt,
1111                                                 unsigned int  turn),
1112            "Use vrna_refBPcnt_matrix() instead");
1113 
1114 /**
1115  *  @brief Make a reference base pair distance matrix
1116  *
1117  *  Get an upper triangular matrix containing the base pair distance of two
1118  *  reference structures (@p pt1, @p pt2) for each interval [i,j] with i<j. Access it via iindx!!!
1119  *
1120  *  @deprecated Use vrna_refBPdist_matrix() instead
1121  *  @ingroup        struct_utils_deprecated
1122  */
1123 DEPRECATED(unsigned int *compute_BPdifferences(short        *pt1,
1124                                                short        *pt2,
1125                                                unsigned int turn),
1126            "Use vrna_refBPdist_matrix() instead");
1127 
1128 /**
1129  *  @brief Create a #vrna_ep_t from a probability matrix
1130  *
1131  *  The probability matrix given is parsed and all pair probabilities above
1132  *  the given threshold are used to create an entry in the plist.
1133  *
1134  *  The end of the plist is marked by sequence positions i as well as j
1135  *  equal to 0. This condition should be used to stop looping over its
1136  *  entries.
1137  *
1138  *  @note This function is threadsafe
1139  *  @deprecated Use vrna_plist_from_probs() instead!
1140  *
1141  *  @ingroup part_func_global_deprecated
1142  *
1143  *  @param[out] pl      A pointer to the #vrna_ep_t that is to be created
1144  *  @param[in]  probs   The probability matrix used for creating the plist
1145  *  @param[in]  length  The length of the RNA sequence
1146  *  @param[in]  cutoff  The cutoff value (probability threshold)
1147  */
1148 DEPRECATED(void  assign_plist_from_pr(vrna_ep_t   **pl,
1149                                       FLT_OR_DBL  *probs,
1150                                       int         length,
1151                                       double      cutoff),
1152            "Use vrna_plist_from_probs() instead");
1153 
1154 /**
1155  *  @brief Create a dot-bracket/parenthesis structure from backtracking stack
1156  *
1157  *  @deprecated Use vrna_parenthesis_structure() instead
1158  *  @ingroup        struct_utils_deprecated
1159  *
1160  *  @note This function is threadsafe
1161  */
1162 DEPRECATED(void parenthesis_structure(char            *structure,
1163                                       vrna_bp_stack_t *bp,
1164                                       int             length),
1165            "Use vrna_parenthesis_structure() instead");
1166 
1167 /**
1168  *  @brief Create a dot-bracket/parenthesis structure from backtracking stack
1169  *  obtained by zuker suboptimal calculation in cofold.c
1170  *
1171  *  @deprecated Use vrna_parenthesis_zuker() instead
1172  *  @ingroup        struct_utils_deprecated
1173  *
1174  *  @note This function is threadsafe
1175  */
1176 DEPRECATED(void parenthesis_zuker(char            *structure,
1177                                   vrna_bp_stack_t *bp,
1178                                   int             length),
1179            "Use vrna_parenthesis_zuker() instead");
1180 
/**
 *  Takes the same argument list as parenthesis_structure().
 *
 *  @deprecated Use vrna_letter_structure() instead
 *  @ingroup        struct_utils_deprecated
 */
1181 DEPRECATED(void letter_structure(char             *structure,
1182                                  vrna_bp_stack_t  *bp,
1183                                  int              length),
1184            "Use vrna_letter_structure() instead");
1185 
1186 /**
1187  *  @brief Create a dot-bracket-like structure string from a base pair probability matrix
1188  *  @deprecated Use vrna_db_from_probs() instead!
1189  *  @ingroup        struct_utils_deprecated
1190  */
1191 DEPRECATED(void  bppm_to_structure(char         *structure,
1192                                    FLT_OR_DBL   *pr,
1193                                    unsigned int length),
1194            "Use vrna_db_from_probs() instead");
1195 
1196 /**
1197  *  @brief Get a pseudo dot-bracket notation symbol for the given probability information
1198  *  @deprecated Use vrna_bpp_symbol() instead!
1199  *  @ingroup        struct_utils_deprecated
1200  */
1201 DEPRECATED(char    bppm_symbol(const float *x),
1202            "Use vrna_bpp_symbol() instead");
1203 
1204 #endif
1205 
1206 /**
1207  * @}
1208  */
1209 
1210 #endif
1211