1 #ifndef VIENNA_RNA_PACKAGE_STRUCT_UTILS_H 2 #define VIENNA_RNA_PACKAGE_STRUCT_UTILS_H 3 4 #ifdef VRNA_WARN_DEPRECATED 5 # if defined(__clang__) 6 # define DEPRECATED(func, msg) func __attribute__ ((deprecated("", msg))) 7 # elif defined(__GNUC__) 8 # define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg))) 9 # else 10 # define DEPRECATED(func, msg) func 11 # endif 12 #else 13 # define DEPRECATED(func, msg) func 14 #endif 15 16 /** 17 * @file ViennaRNA/utils/structures.h 18 * @ingroup struct_utils 19 * @brief Various utility- and helper-functions for secondary structure parsing, converting, etc. 20 */ 21 22 /** 23 * @addtogroup struct_utils 24 * @{ 25 * @brief Functions to create, parse, convert, manipulate, and compare secondary structure representations 26 */ 27 28 29 /** 30 * @brief Convenience typedef for data structure #vrna_hx_s 31 * @ingroup struct_utils_helix_list 32 */ 33 typedef struct vrna_hx_s vrna_hx_t; 34 35 36 /** 37 * @brief Convenience typedef for data structure #vrna_elem_prob_s 38 * @ingroup struct_utils_plist 39 */ 40 typedef struct vrna_elem_prob_s vrna_ep_t; 41 42 43 /** 44 * @addtogroup struct_utils_dot_bracket 45 * @{ 46 * @brief The Dot-Bracket notation as introduced already in the early times of the ViennaRNA Package 47 * denotes base pairs by matching pairs of parenthesis `()` and unpaired nucleotides by dots `.`. 48 * 49 * As a simple example, consider a helix of size 4 enclosing a hairpin of size 4. In dot-bracket 50 * notation, this is annotated as 51 * 52 * `((((....))))` 53 * 54 * <b>Extended Dot-Bracket Notation</b> 55 * 56 * A more generalized version of the original Dot-Bracket notation may use additional pairs 57 * of brackets, such as <tt><></tt>, <tt>{}</tt>, and <tt>[]</tt>, and matching pairs of 58 * uppercase/lowercase letters. This allows for anotating pseudo-knots, since different 59 * pairs of brackets are not required to be nested. 
 *
 *  The following annotations of a simple structure with two crossing helices of size 4 are equivalent:
 *
 *  `<<<<[[[[....>>>>]]]]`<br>
 *  `((((AAAA....))))aaaa`<br>
 *  `AAAA{{{{....aaaa}}}}`
 */

/**
 *  @brief  Bitflag to indicate secondary structure notations using uppercase/lowercase letters from the latin alphabet
 *
 *  @see  vrna_ptable_from_string()
 */
#define VRNA_BRACKETS_ALPHA    4U


/**
 *  @brief  Bitflag to indicate secondary structure notations using round brackets (parentheses), <tt>()</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_RND      8U


/**
 *  @brief  Bitflag to indicate secondary structure notations using curly brackets, <tt>{}</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_CLY      16U


/**
 *  @brief  Bitflag to indicate secondary structure notations using angular brackets, <tt><></tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_ANG      32U


/**
 *  @brief  Bitflag to indicate secondary structure notations using square brackets, <tt>[]</tt>
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_SQR      64U


/**
 *  @brief  Default bitmask to indicate secondary structure notation using any pair of brackets
 *
 *  This set of matching brackets/parentheses is always nested, i.e. pseudo-knot free, in WUSS
 *  format. However, in general different kinds of brackets are mostly used for annotating
 *  pseudo-knots. Thus special care has to be taken to remove pseudo-knots if this bitmask
 *  is used in functions that return secondary structures without pseudo-knots!
 *
 *  @see  vrna_ptable_from_string(), vrna_db_flatten(), vrna_db_flatten_to(), vrna_db_pk_remove(),
 *        vrna_pt_pk_remove()
 */
#define VRNA_BRACKETS_DEFAULT  \
  (VRNA_BRACKETS_RND | \
   VRNA_BRACKETS_CLY | \
   VRNA_BRACKETS_ANG | \
   VRNA_BRACKETS_SQR)


/**
 *  @brief  Bitmask to indicate secondary structure notation using any pair of brackets or uppercase/lowercase alphabet letters
 *
 *  @see  vrna_ptable_from_string(), vrna_db_pk_remove(), vrna_db_flatten(),
 *        vrna_db_flatten_to()
 */
#define VRNA_BRACKETS_ANY \
  (VRNA_BRACKETS_RND | \
   VRNA_BRACKETS_CLY | \
   VRNA_BRACKETS_ANG | \
   VRNA_BRACKETS_SQR | \
   VRNA_BRACKETS_ALPHA)


#include <stdio.h>

#include <ViennaRNA/datastructures/basic.h>

/**
 *  @brief Pack secondary structure, 5:1 compression using base 3 encoding
 *
 *  Returns a binary string encoding of the secondary structure using
 *  a 5:1 compression scheme. The string is null-terminated and can
 *  therefore be used with standard string functions such as strcmp().
 *  Useful for programs that need to keep many structures in memory.
 *
 *  @see  vrna_db_unpack()
 *  @param struc    The secondary structure in dot-bracket notation
 *  @return         The binary encoded structure
 */
char *
vrna_db_pack(const char *struc);


/**
 *  @brief Unpack secondary structure previously packed with vrna_db_pack()
 *
 *  Translate a compressed binary string produced by vrna_db_pack() back into
 *  the familiar dot-bracket notation.
 *
 *  @see  vrna_db_pack()
 *  @param  packed  The binary encoded packed secondary structure
 *  @return         The unpacked secondary structure in dot-bracket notation
 */
char *
vrna_db_unpack(const char *packed);


/**
 *  @brief  Substitute pairs of brackets in a string with parentheses
 *
 *  This function can be used to replace brackets of unusual types,
 *  such as angular brackets @p <> , to dot-bracket format.
 *  The @p options parameter is used to specify which types of brackets
 *  will be replaced by round parentheses @p () .
 *
 *  @see  vrna_db_flatten_to(),
 *        #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *        #VRNA_BRACKETS_DEFAULT
 *
 *  @param  structure   The structure string where brackets are flattened in-place
 *  @param  options     A bitmask to specify which types of brackets should be flattened out
 */
void
vrna_db_flatten(char          *structure,
                unsigned int  options);


/**
 *  @brief  Substitute pairs of brackets in a string with another type of pair characters
 *
 *  This function can be used to replace brackets in a structure annotation string,
 *  such as square brackets @p [] , to another type of pair characters,
 *  e.g. angular brackets @p <> .
 *
 *  The @p target array must contain a character for the 'pair open' annotation at
 *  position 0, and one for 'pair close' at position 1. The @p options parameter is used
 *  to specify which types of brackets will be replaced by the new pairs.
 *
 *  @see  vrna_db_flatten(),
 *        #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *        #VRNA_BRACKETS_DEFAULT
 *
 *  @param  string      The structure string where brackets are flattened in-place
 *  @param  target      The new pair characters the string will be flattened to
 *  @param  options     A bitmask to specify which types of brackets should be flattened out
 */
void
vrna_db_flatten_to(char         *string,
                   const char   target[3],
                   unsigned int options);


/**
 *  @brief Convert a pair table into dot-parenthesis notation
 *
 *  @param pt The pair table to be converted
 *  @return   A char pointer to the dot-bracket string
 */
char *
vrna_db_from_ptable(short *pt);


/**
 *  @brief  Convert a list of base pairs into dot-bracket notation
 *
 *  @see vrna_plist()
 *  @param  pairs   A #vrna_ep_t containing the pairs to be included in
 *                  the dot-bracket string
 *  @param  n       The length of the structure (number of nucleotides)
 *  @return         The dot-bracket string containing the provided base pairs
 */
char *
vrna_db_from_plist(vrna_ep_t    *pairs,
                   unsigned int n);


/**
 *  @brief  Convert a secondary structure in dot-bracket notation to a nucleotide annotation of loop contexts
 *
 *  @param  structure   The secondary structure in dot-bracket notation
 *  @return             A string annotating each nucleotide according to its structural context
 */
char *
vrna_db_to_element_string(const char *structure);


/**
 *  @brief  Remove pseudo-knots from an input structure
 *
 *  This function removes pseudo-knots from an input structure
 *  by determining the minimum number of base pairs that need
 *  to be removed to make the structure pseudo-knot free.
 *
 *  To accomplish that, we use a dynamic programming algorithm
 *  similar to the Nussinov maximum matching approach.
 *
 *  The input structure must be in a dot-bracket string like form
 *  where crossing base pairs are denoted by the use of additional
 *  types of matching brackets, e.g. @p <>, @p {}, @p [].
 *  Furthermore, crossing pairs may be annotated by matching
 *  uppercase/lowercase letters from the alphabet @p A-Z. For the latter,
 *  the uppercase letter must be the 5' and the lowercase letter
 *  the 3' nucleotide of the base pair. The actual type of brackets
 *  to be recognized by this function must be specified through the
 *  @p options parameter.
 *
 *  @note Brackets in the input structure string that are not covered
 *        by the @p options bitmask will be silently ignored!
 *
 *  @see vrna_pt_pk_remove(), vrna_db_flatten(),
 *       #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *       #VRNA_BRACKETS_ALPHA, #VRNA_BRACKETS_DEFAULT, #VRNA_BRACKETS_ANY
 *
 *  @param  structure   Input structure in dot-bracket format that may include pseudo-knots
 *  @param  options     A bitmask to specify which types of brackets should be processed
 *  @return             The input structure devoid of pseudo-knots in dot-bracket notation
 */
char *
vrna_db_pk_remove(const char    *structure,
                  unsigned int  options);

/* End dot-bracket interface */
/**@}*/

/**
 *  @addtogroup struct_utils_pair_table
 *  @{
 */

/**
 *  @brief Create a pair table from a dot-bracket notation of a secondary structure
 *
 *  Returns a newly allocated table, such that table[i]=j if (i.j) pair
 *  or 0 if i is unpaired, table[0] contains the length of the structure.
 *
 *  @see  vrna_ptable_from_string(), vrna_db_from_ptable()
 *
 *  @param  structure The secondary structure in dot-bracket notation
 *  @return           A pointer to the created pair_table
 */
short *
vrna_ptable(const char *structure);


/**
 *  @brief  Create a pair table for a secondary structure string
 *
 *  This function takes an input string of a secondary structure annotation
 *  in @ref dot-bracket-notation or @ref dot-bracket-ext-notation, and converts
 *  it into a pair table representation.
 *
 *  @note   This function also extracts crossing base pairs, i.e. pseudo-knots
 *          if more than a single matching bracket type is allowed through the
 *          bitmask @p options.
 *
 *  @see    vrna_ptable(), vrna_db_from_ptable(), vrna_db_flatten_to(), vrna_pt_pk_remove(),
 *          #VRNA_BRACKETS_RND, #VRNA_BRACKETS_ANG, #VRNA_BRACKETS_CLY, #VRNA_BRACKETS_SQR,
 *          #VRNA_BRACKETS_ALPHA, #VRNA_BRACKETS_DEFAULT, #VRNA_BRACKETS_ANY
 *
 *  @param  string    Secondary structure in @ref dot-bracket-ext-notation
 *  @param  options   A bitmask to specify which brackets are recognized during conversion to pair table
 *  @return           A pointer to a new pair table of the provided secondary structure
 */
short *
vrna_ptable_from_string(const char    *string,
                        unsigned int  options);


/**
 *  @brief Create a pair table of a secondary structure (pseudo-knot version)
 *
 *  Returns a newly allocated table, such that table[i]=j if (i.j) pair
 *  or 0 if i is unpaired, table[0] contains the length of the structure.
 *
 *  In contrast to vrna_ptable() this function also recognizes the base pairs
 *  denoted by '[' and ']' brackets.
 *  Thus, this function behaves like
 *  @code{.c}
 *  vrna_ptable_from_string(structure, VRNA_BRACKETS_RND | VRNA_BRACKETS_SQR)
 *  @endcode
 *
 *  @see    vrna_ptable_from_string()
 *
 *  @param  structure The secondary structure in (extended) dot-bracket notation
 *  @return           A pointer to the created pair_table
 */
short *
vrna_pt_pk_get(const char *structure);


/**
 *  @brief Get an exact copy of a pair table
 *
 *  @param pt The pair table to be copied
 *  @return   A pointer to the copy of 'pt'
 */
short *
vrna_ptable_copy(const short *pt);


/**
 *  @brief Create a pair table of a secondary structure (snoop align version)
 *
 *  @param  structure The secondary structure string to convert
 *  @return           The pair table (NOTE(review): the accepted input format and the
 *                    table layout are not documented in this header - confirm against
 *                    the implementation)
 */
short *
vrna_pt_ali_get(const char *structure);


/**
 *  @brief Create a pair table of a secondary structure (snoop version)
 *
 *  Returns a newly allocated table, such that: table[i]=j if (i.j) pair or
 *  0 if i is unpaired, table[0] contains the length of the structure.
 *  The special pseudoknotted H/ACA-mRNA structure is taken into account.
 */
short *
vrna_pt_snoop_get(const char *structure);


/**
 *  @brief  Remove pseudo-knots from a pair table
 *
 *  This function removes pseudo-knots from an input structure
 *  by determining the minimum number of base pairs that need
 *  to be removed to make the structure pseudo-knot free.
 *
 *  To accomplish that, we use a dynamic programming algorithm
 *  similar to the Nussinov maximum matching approach.
 *
 *  @see    vrna_db_pk_remove()
 *
 *  @param  ptable  Input structure that may include pseudo-knots
 *  @param  options Options for the removal (NOTE(review): not described in this
 *                  header; presumably a bitmask as for vrna_db_pk_remove() - confirm
 *                  against the implementation)
 *  @return         The input structure devoid of pseudo-knots
 */
short *
vrna_pt_pk_remove(const short   *ptable,
                  unsigned int  options);


/* End pair table interface */
/**@}*/


/**
 *  @addtogroup struct_utils_plist
 *  @{
 */

/**
 *  @brief  A Base Pair element
 */
#define VRNA_PLIST_TYPE_BASEPAIR      0


/**
 *  @brief  A G-Quadruplex element
 */
#define VRNA_PLIST_TYPE_GQUAD         1


/**
 *  @brief  A Hairpin loop motif element
 */
#define VRNA_PLIST_TYPE_H_MOTIF       2


/**
 *  @brief  An Internal loop motif element
 */
#define VRNA_PLIST_TYPE_I_MOTIF       3


/**
 *  @brief  An Unstructured Domain motif element
 */
#define VRNA_PLIST_TYPE_UD_MOTIF      4


/**
 *  @brief  A Base Pair stack element
 */
#define VRNA_PLIST_TYPE_STACK         5


/**
 *  @brief  Data structure representing a single entry of an element probability list
 *          (e.g. list of pair probabilities)
 *
 *  @see    vrna_plist(), vrna_plist_from_probs(), vrna_db_from_plist(),
 *          #VRNA_PLIST_TYPE_BASEPAIR, #VRNA_PLIST_TYPE_GQUAD, #VRNA_PLIST_TYPE_H_MOTIF, #VRNA_PLIST_TYPE_I_MOTIF,
 *          #VRNA_PLIST_TYPE_UD_MOTIF, #VRNA_PLIST_TYPE_STACK
 */
struct vrna_elem_prob_s {
  int   i;    /**<  @brief  Start position (usually 5' nucleotide that starts the element, e.g. base pair) */
  int   j;    /**<  @brief  End position (usually 3' nucleotide that ends the element, e.g. base pair) */
  float p;    /**<  @brief  Probability of the element */
  int   type; /**<  @brief  Type of the element */
};

/**
 *  @brief Create a #vrna_ep_t from a dot-bracket string
 *
 *  The dot-bracket string is parsed and for each base pair an
 *  entry in the plist is created.
 *  The probability of each pair in
 *  the list is set by a function parameter.
 *
 *  The end of the plist is marked by sequence positions i as well as j
 *  equal to 0. This condition should be used to stop looping over its
 *  entries.
 *
 *  @param struc  The secondary structure in dot-bracket notation
 *  @param pr     The probability for each base pair used in the plist
 *  @return       The plist array
 */
vrna_ep_t *vrna_plist(const char  *struc,
                      float       pr);


/**
 *  @brief Create a #vrna_ep_t from base pair probability matrix
 *
 *  The probability matrix provided via the #vrna_fold_compound_t is parsed
 *  and all pair probabilities above the given threshold are used to create
 *  an entry in the plist.
 *
 *  The end of the plist is marked by sequence positions i as well as j
 *  equal to 0. This condition should be used to stop looping over its
 *  entries.
 *
 *  @ingroup            part_func_global
 *  @param[in]  vc      The fold compound
 *  @param[in]  cut_off The cutoff value
 *  @return             A pointer to the plist that is to be created
 */
vrna_ep_t *vrna_plist_from_probs(vrna_fold_compound_t *vc,
                                 double               cut_off);


/* End pair list interface */
/**@}*/


/**
 *  @addtogroup struct_utils_wuss
 *  @{
 *  @brief  The WUSS notation, as frequently used for consensus secondary structures in @ref msa-formats-stockholm.
 *
 *  This notation allows for a fine-grained annotation of base pairs and unpaired nucleotides, including pseudo-knots.
 *  Below, you'll find a list of secondary structure elements and their corresponding WUSS annotation
 *  (See also the infernal user guide at http://eddylab.org/infernal/Userguide.pdf)
 *  @parblock
 *  - <b>Base pairs</b><br>
 *    Nested base pairs are annotated by matching pairs of the symbols `<>`,
 *    `()`, `{}`, and `[]`.
 *    Each of the matching pairs
 *    of parentheses has its special meaning, however, when used as input in our programs,
 *    e.g. structure constraint, these details are usually ignored. Furthermore, base pairs
 *    that constitute a pseudo-knot are denoted by letters from the latin alphabet and are,
 *    if not denoted otherwise, ignored entirely in our programs.
 *
 *  - <b>Hairpin loops</b><br>
 *    Unpaired nucleotides that constitute the hairpin loop are indicated by underscores, `_`.
 *
 *    Example: `<<<<<_____>>>>>`
 *
 *  - <b>Bulges and interior loops</b><br>
 *    Residues that constitute a bulge or interior loop are denoted by dashes, `-`.
 *
 *    Example: `(((--<<_____>>-)))`
 *
 *  - <b>Multibranch loops</b><br>
 *    Unpaired nucleotides in multibranch loops are indicated by commas `,`.
 *
 *    Example: `(((,,<<_____>>,<<____>>)))`
 *
 *  - <b>External residues</b><br>
 *    Single stranded nucleotides in the exterior loop, i.e. not enclosed by any other pair, are
 *    denoted by colons, `:`.
 *
 *    Example: `<<<____>>>:::`
 *
 *  - <b>Insertions</b><br>
 *    In cases where an alignment represents the consensus with a known structure, insertions relative
 *    to the known structure are denoted by periods, `.`. Regions where local structural
 *    alignment was invoked, leaving regions of both target and query sequence unaligned, are indicated
 *    by tildes, `~`.
 *    @note These symbols only appear in alignments of a known (query) structure annotation to a target
 *          sequence of unknown structure.
 *
 *  - <b>Pseudo-knots</b><br>
 *    The WUSS notation allows for annotation of pseudo-knots using pairs of upper-case/lower-case letters.
 *    @note Our programs and library functions usually ignore pseudo-knots entirely, treating them as
 *          unpaired nucleotides, if not stated otherwise.
 *
 *    Example: `<<<_AAA___>>>aaa`
 *  @endparblock
 */

/**
 *  @brief  Convert a WUSS annotation string to dot-bracket format
 *
 *  @note   This function flattens all brackets, and treats pseudo-knots annotated
 *          by matching pairs of upper/lowercase letters as unpaired nucleotides
 *
 *  @param  wuss  The input string in WUSS notation
 *  @return       A dot-bracket notation of the input secondary structure
 */
char *
vrna_db_from_WUSS(const char *wuss);


/* End WUSS notation interface */
/**@}*/


/**
 *  @addtogroup struct_utils_abstract_shapes
 *  @{
 *  @brief  Abstract Shapes, introduced by Giegerich et al. in (2004) @cite giegerich:2004,
 *          collapse the secondary structure while retaining the nestedness of helices and
 *          hairpin loops.
 *
 *  The abstract shapes representation abstracts the structure from individual base pairs
 *  and their corresponding location in the sequence, while retaining the inherent nestedness
 *  of helices and hairpin loops.
 *
 *  Below is a description of what is included in the abstract shapes abstraction for each
 *  respective level together with an example structure:
 *
 *      CGUCUUAAACUCAUCACCGUGUGGAGCUGCGACCCUUCCCUAGAUUCGAAGACGAG
 *      ((((((...(((..(((...))))))...(((..((.....))..))))))))..
 *
 *  ______
 *
 *  Shape Level | Description | Result
 *  ----------- | ------------------------------- | --------
 *  1           | Most accurate - all loops and all unpaired | `[_[_[]]_[_[]_]]_`
 *  2           | Nesting pattern for all loop types and unpaired regions in external loop and multiloop | `[[_[]][_[]_]]`
 *  3           | Nesting pattern for all loop types but no unpaired regions | `[[[]][[]]]`
 *  4           | Helix nesting pattern in external loop and multiloop | `[[][[]]]`
 *  5           | Most abstract - helix nesting pattern and no unpaired regions | `[[][]]`
 *
 *  @note   Our implementations also provide the special Shape Level 0, which does not
 *          collapse any structural features but simply converts base pairs and unpaired
 *          nucleotides into their corresponding set of symbols for abstract shapes.
 */

/**
 *  @brief  Convert a secondary structure in dot-bracket notation to its abstract shapes representation
 *
 *  This function converts a secondary structure into its abstract shapes representation as
 *  presented by Giegerich et al. 2004 @cite giegerich:2004.
 *
 *  @see vrna_abstract_shapes_pt()
 *
 *  @param  structure A secondary structure in dot-bracket notation
 *  @param  level     The abstraction level (integer in the range of 0 to 5)
 *  @return           The secondary structure in abstract shapes notation
 */
char *
vrna_abstract_shapes(const char   *structure,
                     unsigned int level);


/**
 *  @brief  Convert a secondary structure to its abstract shapes representation
 *
 *  This function converts a secondary structure into its abstract shapes representation as
 *  presented by Giegerich et al. 2004 @cite giegerich:2004. This function is equivalent to
 *  vrna_abstract_shapes(), but requires a pair table input instead of a dot-bracket structure.
 *
 *  @note   The length of the structure must be present at @p pt[0]!
 *
 *  @see vrna_abstract_shapes()
 *
 *  @param  pt      A secondary structure in pair table format
 *  @param  level   The abstraction level (integer in the range of 0 to 5)
 *  @return         The secondary structure in abstract shapes notation
 */
char *
vrna_abstract_shapes_pt(const short   *pt,
                        unsigned int  level);


/* End abstract shapes interface */
/**@}*/


/**
 *  @addtogroup struct_utils_helix_list
 *  @{
 */

/**
 *  @brief  Data structure representing an entry of a helix list
 *
 *  @note   NOTE(review): the members are not documented in this header. Presumably
 *          @p start / @p end are the helix positions, @p length its number of base
 *          pairs, and @p up5 / @p up3 counts of unpaired nucleotides on the 5' and
 *          3' side - confirm against the implementation.
 */
struct vrna_hx_s {
  unsigned int  start;
  unsigned int  end;
  unsigned int  length;
  unsigned int  up5;
  unsigned int  up3;
};


/**
 *  @brief  Convert a pair table representation of a secondary structure into a helix list
 *
 *  @param  pt  The secondary structure in pair table representation
 *  @return     The secondary structure represented as a helix list
 */
vrna_hx_t *
vrna_hx_from_ptable(short *pt);


/**
 *  @brief  Create a merged helix list from another helix list
 */
vrna_hx_t *
vrna_hx_merge(const vrna_hx_t *list,
              int             maxdist);


/* End helix list interface */
/**@}*/


/**
 *  @brief Get a loop index representation of a structure
 */
int *
vrna_loopidx_from_ptable(const short *pt);


/**
 *  @brief Compute the "base pair" distance between two secondary structures @p str1 and @p str2.
 *
 *  The sequences should have the same length.
 *  dist = number of base pairs in one structure but not in the other
 *  same as edit distance with open-pair close-pair as move-set
 *
 *  @param str1   First structure in dot-bracket notation
 *  @param str2   Second structure in dot-bracket notation
 *  @return       The base pair distance between str1 and str2
 */
int
vrna_bp_distance(const char *str1,
                 const char *str2);


/*
 *  NOTE(review): vrna_dist_mountain() carries no documentation in this header.
 *  Judging from its name and signature it presumably computes a distance between
 *  the mountain representations of the structures str1 and str2, with p selecting
 *  the norm - confirm against the implementation before relying on this.
 */
double
vrna_dist_mountain(const char   *str1,
                   const char   *str2,
                   unsigned int p);


/**
 *  @brief Make a reference base pair count matrix
 *
 *  Get an upper triangular matrix containing the number of basepairs of a reference
 *  structure for each interval [i,j] with i<j. Access it via iindx!!!
 */
unsigned int *
vrna_refBPcnt_matrix(const short  *reference_pt,
                     unsigned int turn);


/**
 *  @brief Make a reference base pair distance matrix
 *
 *  Get an upper triangular matrix containing the base pair distance of two
 *  reference structures for each interval [i,j] with i<j. Access it via iindx!!!
 *
 */
unsigned int *
vrna_refBPdist_matrix(const short   *pt1,
                      const short   *pt2,
                      unsigned int  turn);


/**
 *  @brief Create a dot-bracket like structure string from base pair probability matrix
 */
char *
vrna_db_from_probs(const FLT_OR_DBL *pr,
                   unsigned int     length);


/**
 *  @brief Get a pseudo dot bracket notation for a given probability information
 */
char
vrna_bpp_symbol(const float *x);


/**
 *  @brief Create a dot-bracket/parenthesis structure from backtracking stack
 *
 *  This function is capable of creating dot-bracket structures from suboptimal
 *  structure prediction sensu M.
 *  Zuker
 *
 *  @param bp     Base pair stack containing the traced base pairs
 *  @param length The length of the structure
 *  @return       The secondary structure in dot-bracket notation as
 *                provided in the input
 */
char *
vrna_db_from_bp_stack(vrna_bp_stack_t *bp,
                      unsigned int    length);


/*
 *  NOTE(review): vrna_letter_structure() carries no documentation in this header.
 *  From the signature it presumably writes an annotation derived from the base
 *  pair stack bp for length nucleotides into the caller-provided buffer
 *  structure - confirm buffer size requirements against the implementation.
 */
void
vrna_letter_structure(char            *structure,
                      vrna_bp_stack_t *bp,
                      unsigned int    length);


/**
 *  @addtogroup struct_utils_tree
 *  @{
 *  @brief  Secondary structures can be readily represented as trees, where internal
 *  nodes represent base pairs, and leaves represent unpaired nucleotides.
 *  The dot-bracket structure string already is a tree represented by a string
 *  of parentheses (base pairs) and dots for the leaf nodes (unpaired nucleotides).
 *
 *  Alternatively, one may find representations with two types of node labels,
 *  `P` for paired and `U` for unpaired; a dot is then replaced by `(U)`, and
 *  each closed bracket is assigned an additional identifier `P`.
 *  We call this the expanded notation. In @cite fontana:1993b a condensed
 *  representation of the secondary structure is proposed, the so-called
 *  homeomorphically irreducible tree (HIT) representation. Here a stack is
 *  represented as a single pair of matching brackets labeled `P` and
 *  weighted by the number of base pairs. Correspondingly, a contiguous
 *  strain of unpaired bases is shown as one pair of matching brackets
 *  labeled `U` and weighted by its length. Generally any string consisting
 *  of matching brackets and identifiers is equivalent to a plane tree with
 *  as many different types of nodes as there are identifiers.
 *
 *  Bruce Shapiro proposed a coarse grained representation @cite shapiro:1988,
 *  which does not retain the full information of the secondary structure.
 *  He
 *  represents the different structure elements by single matching brackets
 *  and labels them as
 *
 *  - `H`  (hairpin loop),
 *  - `I`  (interior loop),
 *  - `B`  (bulge),
 *  - `M`  (multi-loop), and
 *  - `S`  (stack).
 *
 *  We extend his alphabet by an extra letter for external elements `E`.
 *  Again these identifiers may be followed by a weight corresponding to the
 *  number of unpaired bases or base pairs in the structure element. All tree
 *  representations (except for the dot-bracket form) can be encapsulated into
 *  a virtual root (labeled `R`).
 *
 *  The following example illustrates the different linear tree representations
 *  used by the package:
 *
 *  Consider the secondary structure represented by the dot-bracket string (full tree)
 *  `.((..(((...)))..((..)))).` which is the most convenient
 *  condensed notation used by our programs and library functions.
 *
 *  Then, the following tree representations are equivalent:
 *
 *  - Expanded tree:<br>
 *    `((U)(((U)(U)((((U)(U)(U)P)P)P)(U)(U)(((U)(U)P)P)P)P)(U)R)`
 *  - HIT representation (Fontana et al. 1993 @cite fontana:1993b):<br>
 *    `((U1)((U2)((U3)P3)(U2)((U2)P2)P2)(U1)R)`
 *  - Coarse Grained Tree Representation (Shapiro 1988 @cite shapiro:1988):
 *    + Short (with root node `R`, without stem nodes `S`):<br>
 *      `((H)((H)M)R)`
 *    + Full (with root node `R`):<br>
 *      `(((((H)S)((H)S)M)S)R)`
 *    + Extended (with root node `R`, with external nodes `E`):<br>
 *      `((((((H)S)((H)S)M)S)E)R)`
 *    + Weighted (with root node `R`, with external nodes `E`):<br>
 *      `((((((H3)S3)((H2)S2)M4)S2)E2)R)`
 *
 *  The Expanded tree is rather clumsy and mostly included for the sake of
 *  completeness. The different versions of Coarse Grained Tree Representations
 *  are variations of Shapiro's linear tree notation.
 *
 *  For the output of aligned structures from string editing, different
 *  representations are needed, where we put the label on both sides.
 *  The above examples for tree representations would then look like:
 *
 *  @verbatim
 *  a) (UU)(P(P(P(P(UU)(UU)(P(P(P(UU)(UU)(UU)P)P)P)(UU)(UU)(P(P(UU)(U...
 *  b) (UU)(P2(P2(U2U2)(P2(U3U3)P3)(U2U2)(P2(U2U2)P2)P2)(UU)P2)(UU)
 *  c) (B(M(HH)(HH)M)B)
 *     (S(B(S(M(S(HH)S)(S(HH)S)M)S)B)S)
 *     (E(S(B(S(M(S(HH)S)(S(HH)S)M)S)B)S)E)
 *  d) (R(E2(S2(B1(S2(M4(S3(H3)S3)((H2)S2)M4)S2)B1)S2)E2)R)
 *  @endverbatim
 *
 *  Aligned structures additionally contain the gap character `_`.
 */

/**
 *  @brief  Homeomorphically Irreducible Tree (HIT) representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_HIT             1U


/**
 *  @brief  (short) Coarse Grained representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_SHAPIRO_SHORT   2U


/**
 *  @brief  (full)  Coarse Grained representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_SHAPIRO         3U


/**
 *  @brief  (extended) Coarse Grained representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_SHAPIRO_EXT     4U


/**
 *  @brief  (weighted) Coarse Grained representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT  5U

/**
 *  @brief  Expanded Tree representation of a secondary structure
 *  @see    vrna_db_to_tree_string()
 */
#define   VRNA_STRUCTURE_TREE_EXPANDED        6U


/**
 *  @brief  Convert a Dot-Bracket structure string into tree string representation
 *
 *  This function allows one to convert a secondary structure in dot-bracket notation
 *  into one of the various tree representations for
 *  secondary structures. The resulting
 *  tree is then represented as a string of parentheses and node symbols, similar
 *  to the Newick format.
 *
 *  Currently we support conversion into the following formats, denoted by the value
 *  of parameter @p type:
 *  * #VRNA_STRUCTURE_TREE_HIT            - @copybrief #VRNA_STRUCTURE_TREE_HIT
 *                                          (See also Fontana et al. 1993 @cite fontana:1993b)
 *  * #VRNA_STRUCTURE_TREE_SHAPIRO_SHORT  - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_SHORT
 *                                          (same as Shapiro 1988 @cite shapiro:1988, but with root node @p R and without @p S nodes for the stems)
 *  * #VRNA_STRUCTURE_TREE_SHAPIRO        - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO
 *                                          (See also Shapiro 1988 @cite shapiro:1988)
 *  * #VRNA_STRUCTURE_TREE_SHAPIRO_EXT    - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_EXT
 *                                          (same as Shapiro 1988 @cite shapiro:1988, but external nodes denoted as @p E )
 *  * #VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT - @copybrief #VRNA_STRUCTURE_TREE_SHAPIRO_WEIGHT
 *                                          (same as #VRNA_STRUCTURE_TREE_SHAPIRO_EXT but with additional weights
 *                                          for number of unpaired nucleotides in loop, and number of pairs in stems)
 *  * #VRNA_STRUCTURE_TREE_EXPANDED       - @copybrief #VRNA_STRUCTURE_TREE_EXPANDED
 *
 *  @see  @ref sec_structure_representations_tree
 *
 *  @param  structure   The null-terminated dot-bracket structure string
 *  @param  type        A switch to determine the type of tree string representation
 *  @return             A tree representation of the input @p structure
 */
char *
vrna_db_to_tree_string(const char   *structure,
                       unsigned int type);


/**
 *  @brief  Remove weights from a linear string tree representation of a secondary structure
 *
 *  This function strips the weights of a linear string tree representation such as @p HIT,
 *  or Coarse Grained Tree sensu Shapiro @cite shapiro:1988
 *
 *  @see  vrna_db_to_tree_string()
 *
 *  @param  structure   A linear string tree representation of a secondary
structure with weights 948 * @return A linear string tree representation of a secondary structure without weights 949 */ 950 char * 951 vrna_tree_string_unweight(const char *structure); 952 953 954 /** 955 * @brief Convert a linear tree string representation of a secondary structure back to Dot-Bracket notation 956 * 957 * @warning This function only accepts <em>Expanded</em> and <em>HIT</em> tree representations! 958 * 959 * @see vrna_db_to_tree_string(), #VRNA_STRUCTURE_TREE_EXPANDED, #VRNA_STRUCTURE_TREE_HIT, 960 * @ref sec_structure_representations_tree 961 * 962 * @param tree A linear tree string representation of a secondary structure 963 * @return A dot-bracket notation of the secondary structure provided in @p tree 964 */ 965 char * 966 vrna_tree_string_to_db(const char *tree); 967 968 969 /* End tree representations */ 970 /**@}*/ 971 972 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY 973 974 /*###########################################*/ 975 /*# deprecated functions below #*/ 976 /*###########################################*/ 977 978 /** 979 * @brief Create a #vrna_ep_t from a dot-bracket string 980 * 981 * The dot-bracket string is parsed and for each base pair an 982 * entry in the plist is created. The probability of each pair in 983 * the list is set by a function parameter. 984 * 985 * The end of the plist is marked by sequence positions i as well as j 986 * equal to 0. 
This condition should be used to stop looping over its 987 * entries 988 * 989 * @deprecated Use vrna_plist() instead 990 * 991 * @ingroup part_func_global_deprecated 992 * 993 * @param pl A pointer to the #vrna_ep_t that is to be created 994 * @param struc The secondary structure in dot-bracket notation 995 * @param pr The probability for each base pair 996 */ 997 DEPRECATED(void assign_plist_from_db(vrna_ep_t **pl, 998 const char *struc, 999 float pr), 1000 "Use vrna_plist() instead"); 1001 1002 /** 1003 * @brief Pack secondary secondary structure, 5:1 compression using base 3 encoding 1004 * 1005 * Returns a binary string encoding of the secondary structure using 1006 * a 5:1 compression scheme. The string is NULL terminated and can 1007 * therefore be used with standard string functions such as strcmp(). 1008 * Useful for programs that need to keep many structures in memory. 1009 * 1010 * @deprecated Use vrna_db_pack() as a replacement 1011 * @ingroup struct_utils_deprecated 1012 * @param struc The secondary structure in dot-bracket notation 1013 * @return The binary encoded structure 1014 */ 1015 DEPRECATED(char *pack_structure(const char *struc), 1016 "Use vrna_db_pack() instead"); 1017 1018 /** 1019 * @brief Unpack secondary structure previously packed with pack_structure() 1020 * 1021 * Translate a compressed binary string produced by pack_structure() back into 1022 * the familiar dot-bracket notation. 
1023 * 1024 * @deprecated Use vrna_db_unpack() as a replacement 1025 * @ingroup struct_utils_deprecated 1026 * @param packed The binary encoded packed secondary structure 1027 * @return The unpacked secondary structure in dot-bracket notation 1028 */ 1029 DEPRECATED(char *unpack_structure(const char *packed), 1030 "Use vrna_db_unpack() instead"); 1031 1032 /** 1033 * @brief Create a pair table of a secondary structure 1034 * 1035 * Returns a newly allocated table, such that table[i]=j if (i.j) pair 1036 * or 0 if i is unpaired, table[0] contains the length of the structure. 1037 * 1038 * @deprecated Use vrna_ptable() instead 1039 * @ingroup struct_utils_deprecated 1040 * 1041 * @param structure The secondary structure in dot-bracket notation 1042 * @return A pointer to the created pair_table 1043 */ 1044 DEPRECATED(short *make_pair_table(const char *structure), 1045 "Use vrna_ptable() instead"); 1046 1047 DEPRECATED(short *make_pair_table_pk(const char *structure), 1048 "Use vrna_ptable_from_string() instead"); 1049 1050 /** 1051 * @brief Get an exact copy of a pair table 1052 * 1053 * @deprecated Use vrna_ptable_copy() instead 1054 * @ingroup struct_utils_deprecated 1055 * 1056 * @param pt The pair table to be copied 1057 * @return A pointer to the copy of 'pt' 1058 */ 1059 DEPRECATED(short *copy_pair_table(const short *pt), 1060 "Use vrna_ptable_copy() instead"); 1061 1062 /** 1063 * Pair table for snoop align 1064 * 1065 * @deprecated Use vrna_pt_ali_get() instead! 1066 * @ingroup struct_utils_deprecated 1067 */ 1068 DEPRECATED(short *alimake_pair_table(const char *structure), 1069 "Use vrna_pt_ali_get() instead"); 1070 1071 /** 1072 * returns a newly allocated table, such that: table[i]=j if (i.j) pair or 1073 * 0 if i is unpaired, table[0] contains the length of the structure. 1074 * The special pseudoknotted H/ACA-mRNA structure is taken into account. 1075 * @deprecated Use vrna_pt_snoop_get() instead! 
1076 * @ingroup struct_utils_deprecated 1077 */ 1078 DEPRECATED(short *make_pair_table_snoop(const char *structure), 1079 "Use vrna_pt_snoop_get() instead"); 1080 1081 DEPRECATED(int *make_loop_index_pt(short *pt), 1082 "Use vrna_loopidx_from_ptable() instead"); 1083 1084 /** 1085 * @brief Compute the "base pair" distance between two secondary structures s1 and s2. 1086 * 1087 * The sequences should have the same length. 1088 * dist = number of base pairs in one structure but not in the other 1089 * same as edit distance with open-pair close-pair as move-set 1090 * 1091 * @deprecated Use vrna_bp_distance instead 1092 * @ingroup struct_utils_deprecated 1093 * @param str1 First structure in dot-bracket notation 1094 * @param str2 Second structure in dot-bracket notation 1095 * @return The base pair distance between str1 and str2 1096 */ 1097 DEPRECATED(int bp_distance(const char *str1, 1098 const char *str2), 1099 "Use vrna_bp_distance() instead"); 1100 1101 /** 1102 * @brief Make a reference base pair count matrix 1103 * 1104 * Get an upper triangular matrix containing the number of basepairs of a reference 1105 * structure for each interval [i,j] with i<j. Access it via iindx!!! 1106 * 1107 * @deprecated Use vrna_refBPcnt_matrix() instead 1108 * @ingroup struct_utils_deprecated 1109 */ 1110 DEPRECATED(unsigned int *make_referenceBP_array(short *reference_pt, 1111 unsigned int turn), 1112 "Use vrna_refBPcnt_matrix() instead"); 1113 1114 /** 1115 * @brief Make a reference base pair distance matrix 1116 * 1117 * Get an upper triangular matrix containing the base pair distance of two 1118 * reference structures for each interval [i,j] with i<j. Access it via iindx!!! 
1119 * 1120 * @deprecated Use vrna_refBPdist_matrix() instead 1121 * @ingroup struct_utils_deprecated 1122 */ 1123 DEPRECATED(unsigned int *compute_BPdifferences(short *pt1, 1124 short *pt2, 1125 unsigned int turn), 1126 "Use vrna_refBPdist_matrix() instead"); 1127 1128 /** 1129 * @brief Create a vrna_ep_t from a probability matrix 1130 * 1131 * The probability matrix given is parsed and all pair probabilities above 1132 * the given threshold are used to create an entry in the plist 1133 * 1134 * The end of the plist is marked by sequence positions i as well as j 1135 * equal to 0. This condition should be used to stop looping over its 1136 * entries 1137 * 1138 * @note This function is threadsafe 1139 * @deprecated Use vrna_plist_from_probs() instead! 1140 * 1141 * @ingroup part_func_global_deprecated 1142 * 1143 * @param[out] pl A pointer to the vrna_ep_t that is to be created 1144 * @param[in] probs The probability matrix used for creating the plist 1145 * @param[in] length The length of the RNA sequence 1146 * @param[in] cutoff The cutoff value 1147 */ 1148 DEPRECATED(void assign_plist_from_pr(vrna_ep_t **pl, 1149 FLT_OR_DBL *probs, 1150 int length, 1151 double cutoff), 1152 "Use vrna_plist_from_probs() instead"); 1153 1154 /** 1155 * @brief Create a dot-backet/parenthesis structure from backtracking stack 1156 * 1157 * @deprecated use vrna_parenthesis_structure() instead 1158 * @ingroup struct_utils_deprecated 1159 * 1160 * @note This function is threadsafe 1161 */ 1162 DEPRECATED(void parenthesis_structure(char *structure, 1163 vrna_bp_stack_t *bp, 1164 int length), 1165 "Use vrna_parenthesis_structure() instead"); 1166 1167 /** 1168 * @brief Create a dot-backet/parenthesis structure from backtracking stack 1169 * obtained by zuker suboptimal calculation in cofold.c 1170 * 1171 * @deprecated use vrna_parenthesis_zuker instead 1172 * @ingroup struct_utils_deprecated 1173 * 1174 * @note This function is threadsafe 1175 */ 1176 DEPRECATED(void 
parenthesis_zuker(char *structure, 1177 vrna_bp_stack_t *bp, 1178 int length), 1179 "Use vrna_parenthesis_zuker() instead"); 1180 1181 DEPRECATED(void letter_structure(char *structure, 1182 vrna_bp_stack_t *bp, 1183 int length), 1184 "Use vrna_letter_structure() instead"); 1185 1186 /** 1187 * @brief Create a dot-bracket like structure string from base pair probability matrix 1188 * @deprecated Use vrna_db_from_probs() instead! 1189 * @ingroup struct_utils_deprecated 1190 */ 1191 DEPRECATED(void bppm_to_structure(char *structure, 1192 FLT_OR_DBL *pr, 1193 unsigned int length), 1194 "Use vrna_db_from_probs() instead"); 1195 1196 /** 1197 * @brief Get a pseudo dot bracket notation for a given probability information 1198 * @deprecated Use vrna_bpp_symbol() instead! 1199 * @ingroup struct_utils_deprecated 1200 */ 1201 DEPRECATED(char bppm_symbol(const float *x), 1202 "Use vrna_bpp_symbol() instead"); 1203 1204 #endif 1205 1206 /** 1207 * @} 1208 */ 1209 1210 #endif 1211