1 #ifndef VIENNA_RNA_PACKAGE_ALN_UTIL_H 2 #define VIENNA_RNA_PACKAGE_ALN_UTIL_H 3 4 #ifdef VRNA_WARN_DEPRECATED 5 # if defined(__clang__) 6 # define DEPRECATED(func, msg) func __attribute__ ((deprecated("", msg))) 7 # elif defined(__GNUC__) 8 # define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg))) 9 # else 10 # define DEPRECATED(func, msg) func 11 # endif 12 #else 13 # define DEPRECATED(func, msg) func 14 #endif 15 16 /** 17 * @file ViennaRNA/utils/alignments.h 18 * @ingroup utils, aln_utils 19 * @brief Various utility- and helper-functions for sequence alignments and comparative structure prediction 20 */ 21 22 /** 23 * @addtogroup aln_utils 24 * @{ 25 * @brief Functions to extract features from and to manipulate multiple sequence alignments 26 */ 27 28 /** @brief Typename for the base pair info repesenting data structure #vrna_pinfo_s */ 29 typedef struct vrna_pinfo_s vrna_pinfo_t; 30 31 32 /** 33 * @brief Use default alignment settings 34 */ 35 #define VRNA_ALN_DEFAULT 0U 36 37 38 /** 39 * @brief Convert to RNA alphabet 40 */ 41 #define VRNA_ALN_RNA 1U 42 43 44 /** 45 * @brief Convert to DNA alphabet 46 */ 47 #define VRNA_ALN_DNA 2U 48 49 50 /** 51 * @brief Convert to uppercase nucleotide letters 52 */ 53 #define VRNA_ALN_UPPERCASE 4U 54 55 56 /** 57 * @brief Convert to lowercase nucleotide letters 58 */ 59 #define VRNA_ALN_LOWERCASE 8U 60 61 /** 62 * @brief Flag indicating Shannon Entropy measure 63 * 64 * Shannon Entropy is defined as @f$ H = - \sum_c p_c \cdot \log_2 p_c @f$ 65 */ 66 #define VRNA_MEASURE_SHANNON_ENTROPY 1U 67 68 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY 69 70 /* the following typedefs are for backward compatibility only */ 71 72 /** 73 * @brief Old typename of #vrna_pinfo_s 74 * @deprecated Use #vrna_pinfo_t instead! 75 * @ingroup aln_utils_deprecated 76 */ 77 typedef struct vrna_pinfo_s pair_info; 78 79 #endif 80 81 #include <ViennaRNA/fold_compound.h> 82 #include <ViennaRNA/model.h> 83 84 /** 85 * @brief A base pair info structure 86 * 87 * For each base pair (i,j) with i,j in [0, n-1] the structure lists: 88 * - its probability 'p' 89 * - an entropy-like measure for its well-definedness 'ent' 90 * - the frequency of each type of pair in 'bp[]' 91 * + 'bp[0]' contains the number of non-compatible sequences 92 * + 'bp[1]' the number of CG pairs, etc. 93 */ 94 struct vrna_pinfo_s { 95 unsigned i; /**< @brief nucleotide position i */ 96 unsigned j; /**< @brief nucleotide position j */ 97 float p; /**< @brief Probability */ 98 float ent; /**< @brief Pseudo entropy for @f$ p(i,j) = S_i + S_j - p_ij*ln(p_ij) @f$ */ 99 short bp[8]; /**< @brief Frequencies of pair_types */ 100 char comp; /**< @brief 1 iff pair is in mfe structure */ 101 }; 102 103 104 /** 105 * @brief Get the mean pairwise identity in steps from ?to?(ident) 106 * 107 * @param alignment Aligned sequences 108 * @return The mean pairwise identity 109 */ 110 int 111 vrna_aln_mpi(const char **alignment); 112 113 114 /** 115 * \brief Retrieve an array of #vrna_pinfo_t structures from precomputed pair probabilities 116 * 117 * This array of structures contains information about positionwise pair probabilies, 118 * base pair entropy and more 119 * 120 * \see #vrna_pinfo_t, and vrna_pf() 121 * 122 * \param vc The #vrna_fold_compound_t of type #VRNA_FC_TYPE_COMPARATIVE with precomputed partition function matrices 123 * \param structure An optional structure in dot-bracket notation (Maybe NULL) 124 * \param threshold Do not include results with pair probabilities below threshold 125 * \return The #vrna_pinfo_t array 126 */ 127 vrna_pinfo_t * 128 vrna_aln_pinfo(vrna_fold_compound_t *vc, 129 const char *structure, 130 double threshold); 131 132 133 int * 134 vrna_aln_pscore(const char **alignment, 135 vrna_md_t *md); 136 137 138 /** 139 * @brief Slice out a subalignment from a larger alignment 140 * 141 * @note The user is responsible to free the memory occupied by 142 * the returned subalignment 143 * 144 * @see vrna_aln_free() 145 * 146 * @param alignment The input alignment 147 * @param i The first column of the subalignment (1-based) 148 * @param j The last column of the subalignment (1-based) 149 * @return The subalignment between column @f$i@f$ and @f$j@f$ 150 */ 151 char ** 152 vrna_aln_slice(const char **alignment, 153 unsigned int i, 154 unsigned int j); 155 156 157 /** 158 * @brief Free memory occupied by a set of aligned sequences 159 * 160 * @param alignment The input alignment 161 */ 162 void 163 vrna_aln_free(char **alignment); 164 165 166 /** 167 * @brief Create a copy of an alignment with only uppercase letters in the sequences 168 * 169 * @see vrna_aln_copy 170 * 171 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 172 * @return A copy of the input alignment where lowercase sequence letters are replaced by uppercase letters 173 */ 174 char ** 175 vrna_aln_uppercase(const char **alignment); 176 177 178 /** 179 * @brief Create a copy of an alignment where DNA alphabet is replaced by RNA alphabet 180 * 181 * @see vrna_aln_copy 182 * 183 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 184 * @return A copy of the input alignment where DNA alphabet is replaced by RNA alphabet (T -> U) 185 */ 186 char ** 187 vrna_aln_toRNA(const char **alignment); 188 189 190 /** 191 * @brief Make a copy of a multiple sequence alignment 192 * 193 * This function allows one to create a copy of a multiple sequence alignment. The @p options parameter 194 * additionally allows for sequence manipulation, such as converting DNA to RNA alphabet, and conversion 195 * to uppercase letters. 196 * 197 * @see vrna_aln_copy(), #VRNA_ALN_RNA, #VRNA_ALN_UPPERCASE, #VRNA_ALN_DEFAULT 198 * 199 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 200 * @param options Option flags indicating whether the aligned sequences should be converted 201 * @return A (manipulated) copy of the input alignment 202 */ 203 char ** 204 vrna_aln_copy(const char **alignment, 205 unsigned int options); 206 207 208 /** 209 * @brief Compute base pair conservation of a consensus structure 210 * 211 * This function computes the base pair conservation (fraction of canonical base pairs) 212 * of a consensus structure given a multiple sequence alignment. The base pair types 213 * that are considered canonical may be specified using the #vrna_md_t.pair array. 214 * Passing @em NULL as parameter @p md results in default pairing rules, i.e. canonical 215 * Watson-Crick and GU Wobble pairs. 216 * 217 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 218 * @param structure The consensus structure in dot-bracket notation 219 * @param md Model details that specify compatible base pairs (Maybe @em NULL) 220 * @return A 1-based vector of base pair conservations 221 */ 222 float * 223 vrna_aln_conservation_struct(const char **alignment, 224 const char *structure, 225 const vrna_md_t *md); 226 227 228 /** 229 * @brief Compute nucleotide conservation in an alignment 230 * 231 * This function computes the conservation of nucleotides in alignment columns. 232 * The simples measure is Shannon Entropy and can be selected by passing the 233 * #VRNA_MEASURE_SHANNON_ENTROPY flag in the @p options parameter. 234 * 235 * @note Currently, only #VRNA_MEASURE_SHANNON_ENTROPY is supported as 236 * conservation measure. 237 * 238 * @see #VRNA_MEASURE_SHANNON_ENTROPY 239 * 240 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 241 * @param md Model details that specify known nucleotides (Maybe @em NULL) 242 * @param options A flag indicating which measure of conservation should be applied 243 * @return A 1-based vector of column conservations 244 */ 245 float * 246 vrna_aln_conservation_col(const char **alignment, 247 const vrna_md_t *md_p, 248 unsigned int options); 249 250 251 /** 252 * @brief Compute the consensus sequence for a given multiple sequence alignment 253 * 254 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 255 * @param md_p Model details that specify known nucleotides (Maybe @em NULL) 256 * @return The consensus sequence of the alignment, i.e. the most frequent nucleotide for each alignment column 257 */ 258 char * 259 vrna_aln_consensus_sequence(const char **alignment, 260 const vrna_md_t *md_p); 261 262 /** 263 * @brief Compute the Most Informative Sequence (MIS) for a given multiple sequence alignment 264 * 265 * The most informative sequence (MIS) @cite freyhult:2005 displays for each alignment column 266 * the nucleotides with frequency greater than the background frequency, projected into IUPAC 267 * notation. Columns where gaps are over-represented are in lower case. 268 * 269 * @param alignment The input sequence alignment (last entry must be @em NULL terminated) 270 * @param md_p Model details that specify known nucleotides (Maybe @em NULL) 271 * @return The most informative sequence for the alignment 272 */ 273 char * 274 vrna_aln_consensus_mis(const char **alignment, 275 const vrna_md_t *md_p); 276 277 278 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY 279 280 #include <stdio.h> 281 /** 282 * @ingroup aln_utils_deprecated 283 */ 284 DEPRECATED(int read_clustal(FILE *clust, 285 char *AlignedSeqs[], 286 char *names[]), 287 "Use vrna_file_msa_read() and vrna_file_msa_read_record() instead"); 288 289 290 /** 291 * @ingroup aln_utils_deprecated 292 */ 293 DEPRECATED(char *consensus(const char *AS[]), 294 "Use vrna_aln_consensus_sequence() instead!"); 295 296 297 /** 298 * @ingroup aln_utils_deprecated 299 */ 300 DEPRECATED(char *consens_mis(const char *AS[]), 301 "Use vrna_aln_consensus_mis() instead!"); 302 303 304 /** 305 * @ingroup aln_utils_deprecated 306 */ 307 DEPRECATED(char *get_ungapped_sequence(const char *seq), 308 "Use vrna_seq_ungapped() instead!"); 309 310 311 /** 312 * @brief Get the mean pairwise identity in steps from ?to?(ident) 313 * 314 * @deprecated Use vrna_aln_mpi() as a replacement 315 * @ingroup aln_utils_deprecated 316 * @param Alseq 317 * @param n_seq The number of sequences in the alignment 318 * @param length The length of the alignment 319 * @param mini 320 * @return The mean pairwise identity 321 */ 322 DEPRECATED(int get_mpi(char *Alseq[], 323 int n_seq, 324 int length, 325 int *mini), 326 "Use vrna_aln_mpi() instead"); 327 328 /* 329 ############################################################# 330 # some helper functions that might be useful in the library # 331 ############################################################# 332 */ 333 334 /** 335 * @brief Get arrays with encoded sequence of the alignment 336 * 337 * this function assumes that in S, S5, s3, ss and as enough 338 * space is already allocated (size must be at least sequence length+2) 339 * 340 * @ingroup aln_utils_deprecated 341 * @param sequence The gapped sequence from the alignment 342 * @param S pointer to an array that holds encoded sequence 343 * @param s5 pointer to an array that holds the next base 5' of alignment position i 344 * @param s3 pointer to an array that holds the next base 3' of alignment position i 345 * @param ss 346 * @param as 347 * @param circ assume the molecules to be circular instead of linear (circ=0) 348 */ 349 DEPRECATED(void encode_ali_sequence(const char *sequence, 350 short *S, 351 short *s5, 352 short *s3, 353 char *ss, 354 unsigned short *as, 355 int circ), 356 "This function is obsolete"); 357 358 359 /** 360 * @brief Allocate memory for sequence array used to deal with aligned sequences 361 * 362 * Note that these arrays will also be initialized according to the sequence alignment given 363 * 364 * @see free_sequence_arrays() 365 * 366 * @ingroup aln_utils_deprecated 367 * @param sequences The aligned sequences 368 * @param S A pointer to the array of encoded sequences 369 * @param S5 A pointer to the array that contains the next 5' nucleotide of a sequence position 370 * @param S3 A pointer to the array that contains the next 3' nucleotide of a sequence position 371 * @param a2s A pointer to the array that contains the alignment to sequence position mapping 372 * @param Ss A pointer to the array that contains the ungapped sequence 373 * @param circ assume the molecules to be circular instead of linear (circ=0) 374 */ 375 DEPRECATED(void alloc_sequence_arrays(const char **sequences, 376 short ***S, 377 short ***S5, 378 short ***S3, 379 unsigned short ***a2s, 380 char ***Ss, 381 int circ), 382 "This function is obsolete"); 383 384 385 /** 386 * @brief Free the memory of the sequence arrays used to deal with aligned sequences 387 * 388 * This function frees the memory previously allocated with alloc_sequence_arrays() 389 * 390 * @see alloc_sequence_arrays() 391 * 392 * @ingroup aln_utils_deprecated 393 * @param n_seq The number of aligned sequences 394 * @param S A pointer to the array of encoded sequences 395 * @param S5 A pointer to the array that contains the next 5' nucleotide of a sequence position 396 * @param S3 A pointer to the array that contains the next 3' nucleotide of a sequence position 397 * @param a2s A pointer to the array that contains the alignment to sequence position mapping 398 * @param Ss A pointer to the array that contains the ungapped sequence 399 */ 400 DEPRECATED(void free_sequence_arrays(unsigned int n_seq, 401 short ***S, 402 short ***S5, 403 short ***S3, 404 unsigned short ***a2s, 405 char ***Ss), 406 "This fucntion is obsolete"); 407 408 #endif 409 410 /** 411 * @} 412 */ 413 414 415 #endif 416