1 #ifndef VIENNA_RNA_PACKAGE_ALN_UTIL_H
2 #define VIENNA_RNA_PACKAGE_ALN_UTIL_H
3 
4 #ifdef VRNA_WARN_DEPRECATED
5 # if defined(__clang__)
6 #  define DEPRECATED(func, msg) func __attribute__ ((deprecated("", msg)))
7 # elif defined(__GNUC__)
8 #  define DEPRECATED(func, msg) func __attribute__ ((deprecated(msg)))
9 # else
10 #  define DEPRECATED(func, msg) func
11 # endif
12 #else
13 # define DEPRECATED(func, msg) func
14 #endif
15 
16 /**
17  *  @file ViennaRNA/utils/alignments.h
18  *  @ingroup utils, aln_utils
19  *  @brief Various utility- and helper-functions for sequence alignments and comparative structure prediction
20  */
21 
22 /**
23  *  @addtogroup aln_utils
24  *  @{
25  *  @brief  Functions to extract features from and to manipulate multiple sequence alignments
26  */
27 
28 /** @brief Typename for the base pair info repesenting data structure #vrna_pinfo_s */
29 typedef struct vrna_pinfo_s vrna_pinfo_t;
30 
31 
32 /**
33  *  @brief  Use default alignment settings
34  */
35 #define VRNA_ALN_DEFAULT      0U
36 
37 
38 /**
39  *  @brief  Convert to RNA alphabet
40  */
41 #define VRNA_ALN_RNA          1U
42 
43 
44 /**
45  *  @brief  Convert to DNA alphabet
46  */
47 #define VRNA_ALN_DNA          2U
48 
49 
50 /**
51  *  @brief  Convert to uppercase nucleotide letters
52  */
53 #define VRNA_ALN_UPPERCASE    4U
54 
55 
56 /**
57  *  @brief  Convert to lowercase nucleotide letters
58  */
59 #define VRNA_ALN_LOWERCASE    8U
60 
61 /**
62  *  @brief  Flag indicating Shannon Entropy measure
63  *
64  *  Shannon Entropy is defined as @f$ H = - \sum_c p_c \cdot \log_2 p_c @f$
65  */
66 #define VRNA_MEASURE_SHANNON_ENTROPY  1U
67 
68 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
69 
70 /* the following typedefs are for backward compatibility only */
71 
72 /**
73  *  @brief Old typename of #vrna_pinfo_s
74  *  @deprecated Use #vrna_pinfo_t instead!
75  *  @ingroup  aln_utils_deprecated
76  */
77 typedef struct vrna_pinfo_s pair_info;
78 
79 #endif
80 
81 #include <ViennaRNA/fold_compound.h>
82 #include <ViennaRNA/model.h>
83 
84 /**
85  *  @brief A base pair info structure
86  *
87  *  For each base pair (i,j) with i,j in [0, n-1] the structure lists:
88  *  - its probability 'p'
89  *  - an entropy-like measure for its well-definedness 'ent'
90  *  - the frequency of each type of pair in 'bp[]'
91  *    + 'bp[0]' contains the number of non-compatible sequences
92  *    + 'bp[1]' the number of CG pairs, etc.
93  */
94 struct vrna_pinfo_s {
95   unsigned  i;      /**<  @brief  nucleotide position i */
96   unsigned  j;      /**<  @brief  nucleotide position j */
97   float     p;      /**< @brief  Probability */
98   float     ent;    /**< @brief  Pseudo entropy for @f$ p(i,j) = S_i + S_j - p_ij*ln(p_ij) @f$ */
99   short     bp[8];  /**< @brief  Frequencies of pair_types */
100   char      comp;   /**< @brief  1 iff pair is in mfe structure */
101 };
102 
103 
104 /**
105  *  @brief Get the mean pairwise identity in steps from ?to?(ident)
106  *
107  *  @param alignment  Aligned sequences
108  *  @return       The mean pairwise identity
109  */
110 int
111 vrna_aln_mpi(const char **alignment);
112 
113 
114 /**
115  *  \brief Retrieve an array of #vrna_pinfo_t structures from precomputed pair probabilities
116  *
117  *  This array of structures contains information about positionwise pair probabilies,
118  *  base pair entropy and more
119  *
120  *  \see #vrna_pinfo_t, and vrna_pf()
121  *
122  *  \param  vc          The #vrna_fold_compound_t of type #VRNA_FC_TYPE_COMPARATIVE with precomputed partition function matrices
123  *  \param  structure   An optional structure in dot-bracket notation (Maybe NULL)
124  *  \param  threshold   Do not include results with pair probabilities below threshold
125  *  \return             The #vrna_pinfo_t array
126  */
127 vrna_pinfo_t *
128 vrna_aln_pinfo(vrna_fold_compound_t *vc,
129                const char           *structure,
130                double               threshold);
131 
132 
133 int *
134 vrna_aln_pscore(const char  **alignment,
135                 vrna_md_t   *md);
136 
137 
138 /**
139  *  @brief  Slice out a subalignment from a larger alignment
140  *
141  *  @note   The user is responsible to free the memory occupied by
142  *          the returned subalignment
143  *
144  *  @see    vrna_aln_free()
145  *
146  *  @param  alignment   The input alignment
147  *  @param  i           The first column of the subalignment (1-based)
148  *  @param  j           The last column of the subalignment (1-based)
149  *  @return             The subalignment between column @f$i@f$ and @f$j@f$
150  */
151 char **
152 vrna_aln_slice(const char   **alignment,
153                unsigned int i,
154                unsigned int j);
155 
156 
157 /**
158  *  @brief  Free memory occupied by a set of aligned sequences
159  *
160  *  @param  alignment   The input alignment
161  */
162 void
163 vrna_aln_free(char **alignment);
164 
165 
166 /**
167  *  @brief  Create a copy of an alignment with only uppercase letters in the sequences
168  *
169  *  @see  vrna_aln_copy
170  *
171  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
172  *  @return             A copy of the input alignment where lowercase sequence letters are replaced by uppercase letters
173  */
174 char **
175 vrna_aln_uppercase(const char **alignment);
176 
177 
178 /**
179  *  @brief  Create a copy of an alignment where DNA alphabet is replaced by RNA alphabet
180  *
181  *  @see  vrna_aln_copy
182  *
183  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
184  *  @return             A copy of the input alignment where DNA alphabet is replaced by RNA alphabet (T -> U)
185  */
186 char **
187 vrna_aln_toRNA(const char **alignment);
188 
189 
190 /**
191  *  @brief  Make a copy of a multiple sequence alignment
192  *
193  *  This function allows one to create a copy of a multiple sequence alignment. The @p options parameter
194  *  additionally allows for sequence manipulation, such as converting DNA to RNA alphabet, and conversion
195  *  to uppercase letters.
196  *
197  *  @see  vrna_aln_copy(), #VRNA_ALN_RNA, #VRNA_ALN_UPPERCASE, #VRNA_ALN_DEFAULT
198  *
199  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
200  *  @param  options     Option flags indicating whether the aligned sequences should be converted
201  *  @return             A (manipulated) copy of the input alignment
202  */
203 char **
204 vrna_aln_copy(const char    **alignment,
205               unsigned int  options);
206 
207 
208 /**
209  *  @brief Compute base pair conservation of a consensus structure
210  *
211  *  This function computes the base pair conservation (fraction of canonical base pairs)
212  *  of a consensus structure given a multiple sequence alignment. The base pair types
213  *  that are considered canonical may be specified using the #vrna_md_t.pair array.
214  *  Passing @em NULL as parameter @p md results in default pairing rules, i.e. canonical
215  *  Watson-Crick and GU Wobble pairs.
216  *
217  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
218  *  @param  structure   The consensus structure in dot-bracket notation
219  *  @param  md          Model details that specify compatible base pairs (Maybe @em NULL)
220  *  @return             A 1-based vector of base pair conservations
221  */
222 float *
223 vrna_aln_conservation_struct(const char       **alignment,
224                              const char       *structure,
225                              const vrna_md_t  *md);
226 
227 
228 /**
229  *  @brief Compute nucleotide conservation in an alignment
230  *
231  *  This function computes the conservation of nucleotides in alignment columns.
232  *  The simples measure is Shannon Entropy and can be selected by passing the
233  *  #VRNA_MEASURE_SHANNON_ENTROPY flag in the @p options parameter.
234  *
235  *  @note Currently, only #VRNA_MEASURE_SHANNON_ENTROPY is supported as
236  *        conservation measure.
237  *
238  *  @see #VRNA_MEASURE_SHANNON_ENTROPY
239  *
240  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
241  *  @param  md          Model details that specify known nucleotides (Maybe @em NULL)
242  *  @param  options     A flag indicating which measure of conservation should be applied
243  *  @return             A 1-based vector of column conservations
244  */
245 float *
246 vrna_aln_conservation_col(const char      **alignment,
247                           const vrna_md_t *md_p,
248                           unsigned int    options);
249 
250 
251 /**
252  *  @brief  Compute the consensus sequence for a given multiple sequence alignment
253  *
254  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
255  *  @param  md_p        Model details that specify known nucleotides (Maybe @em NULL)
256  *  @return             The consensus sequence of the alignment, i.e. the most frequent nucleotide for each alignment column
257  */
258 char *
259 vrna_aln_consensus_sequence(const char      **alignment,
260                             const vrna_md_t *md_p);
261 
262 /**
263  *  @brief  Compute the Most Informative Sequence (MIS) for a given multiple sequence alignment
264  *
265  *  The most informative sequence (MIS) @cite freyhult:2005 displays for each alignment column
266  *  the nucleotides with frequency greater than the background frequency, projected into IUPAC
267  *  notation. Columns where gaps are over-represented are in lower case.
268  *
269  *  @param  alignment   The input sequence alignment (last entry must be @em NULL terminated)
270  *  @param  md_p        Model details that specify known nucleotides (Maybe @em NULL)
271  *  @return             The most informative sequence for the alignment
272  */
273 char *
274 vrna_aln_consensus_mis(const char       **alignment,
275                        const vrna_md_t  *md_p);
276 
277 
278 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
279 
280 #include <stdio.h>
281 /**
282  *  @ingroup  aln_utils_deprecated
283  */
284 DEPRECATED(int read_clustal(FILE  *clust,
285                             char  *AlignedSeqs[],
286                             char  *names[]),
287           "Use vrna_file_msa_read() and vrna_file_msa_read_record() instead");
288 
289 
290 /**
291  *  @ingroup  aln_utils_deprecated
292  */
293 DEPRECATED(char *consensus(const char *AS[]),
294           "Use vrna_aln_consensus_sequence() instead!");
295 
296 
297 /**
298  *  @ingroup  aln_utils_deprecated
299  */
300 DEPRECATED(char *consens_mis(const char *AS[]),
301           "Use vrna_aln_consensus_mis() instead!");
302 
303 
304 /**
305  *  @ingroup  aln_utils_deprecated
306  */
307 DEPRECATED(char *get_ungapped_sequence(const char *seq),
308           "Use vrna_seq_ungapped() instead!");
309 
310 
311 /**
312  *  @brief Get the mean pairwise identity in steps from ?to?(ident)
313  *
314  *  @deprecated Use vrna_aln_mpi() as a replacement
315  *  @ingroup  aln_utils_deprecated
316  *  @param Alseq
317  *  @param n_seq  The number of sequences in the alignment
318  *  @param length The length of the alignment
319  *  @param mini
320  *  @return       The mean pairwise identity
321  */
322 DEPRECATED(int get_mpi(char *Alseq[],
323                        int  n_seq,
324                        int  length,
325                        int  *mini),
326           "Use vrna_aln_mpi() instead");
327 
328 /*
329  #############################################################
330  # some helper functions that might be useful in the library #
331  #############################################################
332  */
333 
334 /**
335  *  @brief Get arrays with encoded sequence of the alignment
336  *
337  *  this function assumes that in S, S5, s3, ss and as enough
338  *  space is already allocated (size must be at least sequence length+2)
339  *
340  *  @ingroup  aln_utils_deprecated
341  *  @param sequence The gapped sequence from the alignment
342  *  @param S        pointer to an array that holds encoded sequence
343  *  @param s5      pointer to an array that holds the next base 5' of alignment position i
344  *  @param s3      pointer to an array that holds the next base 3' of alignment position i
345  *  @param ss
346  *  @param as
347  *  @param circ    assume the molecules to be circular instead of linear (circ=0)
348  */
349 DEPRECATED(void encode_ali_sequence(const char      *sequence,
350                                     short           *S,
351                                     short           *s5,
352                                     short           *s3,
353                                     char            *ss,
354                                     unsigned short  *as,
355                                     int             circ),
356           "This function is obsolete");
357 
358 
359 /**
360  *  @brief Allocate memory for sequence array used to deal with aligned sequences
361  *
362  *  Note that these arrays will also be initialized according to the sequence alignment given
363  *
364  *  @see free_sequence_arrays()
365  *
366  *  @ingroup  aln_utils_deprecated
367  *  @param sequences  The aligned sequences
368  *  @param S          A pointer to the array of encoded sequences
369  *  @param S5         A pointer to the array that contains the next 5' nucleotide of a sequence position
370  *  @param S3         A pointer to the array that contains the next 3' nucleotide of a sequence position
371  *  @param a2s        A pointer to the array that contains the alignment to sequence position mapping
372  *  @param Ss         A pointer to the array that contains the ungapped sequence
373  *  @param circ       assume the molecules to be circular instead of linear (circ=0)
374  */
375 DEPRECATED(void  alloc_sequence_arrays(const char     **sequences,
376                                        short          ***S,
377                                        short          ***S5,
378                                        short          ***S3,
379                                        unsigned short ***a2s,
380                                        char           ***Ss,
381                                        int            circ),
382           "This function is obsolete");
383 
384 
385 /**
386  *  @brief Free the memory of the sequence arrays used to deal with aligned sequences
387  *
388  *  This function frees the memory previously allocated with alloc_sequence_arrays()
389  *
390  *  @see alloc_sequence_arrays()
391  *
392  *  @ingroup  aln_utils_deprecated
393  *  @param n_seq      The number of aligned sequences
394  *  @param S          A pointer to the array of encoded sequences
395  *  @param S5         A pointer to the array that contains the next 5' nucleotide of a sequence position
396  *  @param S3         A pointer to the array that contains the next 3' nucleotide of a sequence position
397  *  @param a2s        A pointer to the array that contains the alignment to sequence position mapping
398  *  @param Ss         A pointer to the array that contains the ungapped sequence
399  */
400 DEPRECATED(void  free_sequence_arrays(unsigned int    n_seq,
401                                       short           ***S,
402                                       short           ***S5,
403                                       short           ***S3,
404                                       unsigned short  ***a2s,
405                                       char            ***Ss),
406           "This fucntion is obsolete");
407 
408 #endif
409 
410 /**
411  * @}
412  */
413 
414 
415 #endif
416