1 #ifndef VIENNA_RNA_PACKAGE_FILE_FORMATS_MSA_H
2 #define VIENNA_RNA_PACKAGE_FILE_FORMATS_MSA_H
3 
4 /**
5  *  @file ViennaRNA/io/file_formats_msa.h
6  *  @ingroup  utils, file_utils, file_formats_msa
7  *  @brief Functions dealing with file formats for Multiple Sequence Alignments (MSA)
8  */
9 
10 #include <stdio.h>
11 
12 /**
13  *  @addtogroup   file_formats_msa
14  *  @{
15  *  @brief  Functions to read/write multiple sequence alignments (MSA) in various file formats
16  */
17 
18 /**
19  *  @brief  Option flag (bit value 1) indicating ClustalW formatted files; may be combined with other format flags via bitwise OR
20  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format(), #VRNA_FILE_FORMAT_MSA_DEFAULT
21  */
22 #define VRNA_FILE_FORMAT_MSA_CLUSTAL      1U
23 
24 /**
25  *  @brief Option flag (bit value 2) indicating Stockholm 1.0 formatted files; may be combined with other format flags via bitwise OR
26  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format(), vrna_file_msa_write(), #VRNA_FILE_FORMAT_MSA_DEFAULT
27  */
28 #define VRNA_FILE_FORMAT_MSA_STOCKHOLM    2U
29 
30 /**
31  *  @brief Option flag (bit value 4) indicating FASTA (Pearson) formatted files; may be combined with other format flags via bitwise OR
32  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format(), #VRNA_FILE_FORMAT_MSA_DEFAULT
33  */
34 #define VRNA_FILE_FORMAT_MSA_FASTA        4U
35 
36 /**
37  *  @brief Option flag (bit value 8) indicating MAF formatted files; may be combined with other format flags via bitwise OR
38  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format(), #VRNA_FILE_FORMAT_MSA_DEFAULT
39  */
40 #define VRNA_FILE_FORMAT_MSA_MAF          8U
41 
42 /**
43  *  @brief Option flag indicating most informative sequence (MIS) output
44  *
45  *  The default reference sequence output for an alignment is simply a consensus sequence.
46  *  This flag requests that the most informative sequence (MIS) be written instead.
47  *
48  *  @see vrna_file_msa_write()
49  */
50 #define VRNA_FILE_FORMAT_MSA_MIS          16U
51 
52 /**
53  *  @brief Option flag indicating the set of default file formats (ClustalW, Stockholm 1.0, FASTA, and MAF combined); vrna_file_msa_read() falls back to this set when 0 is passed as option
54  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
55  */
56 #define VRNA_FILE_FORMAT_MSA_DEFAULT      ( \
57     VRNA_FILE_FORMAT_MSA_CLUSTAL \
58     | VRNA_FILE_FORMAT_MSA_STOCKHOLM \
59     | VRNA_FILE_FORMAT_MSA_FASTA \
60     | VRNA_FILE_FORMAT_MSA_MAF \
61     )
62 
63 /**
64  *  @brief Option flag to disable validation of the alignment (uniqueness of sequence identifiers and equal sequence lengths)
65  *  @see  vrna_file_msa_read(), vrna_file_msa_read_record()
66  */
67 #define VRNA_FILE_FORMAT_MSA_NOCHECK      4096U
68 
69 /**
70  *  @brief Return flag of vrna_file_msa_detect_format() to indicate an unknown or malformed alignment, or a file that doesn't contain sequences
71  *  @see vrna_file_msa_detect_format()
72  */
73 #define VRNA_FILE_FORMAT_MSA_UNKNOWN      8192U
74 
75 /**
76  *  @brief Option flag requesting that data be appended to a multiple sequence alignment file rather than overwriting it
77  *  @see vrna_file_msa_write()
78  */
79 #define VRNA_FILE_FORMAT_MSA_APPEND       16384U
80 
81 /**
82  *  @brief Option flag to suppress non-essential messages on <tt>stderr</tt>
83  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), #VRNA_FILE_FORMAT_MSA_SILENT
84  */
85 #define VRNA_FILE_FORMAT_MSA_QUIET        32768U
86 
87 /**
88  *  @brief Option flag to completely silence any warnings on <tt>stderr</tt>
89  *  @see vrna_file_msa_read(), vrna_file_msa_read_record(), #VRNA_FILE_FORMAT_MSA_QUIET
90  */
91 #define VRNA_FILE_FORMAT_MSA_SILENT       65536U
92 
93 /**
94  *  @brief Read a multiple sequence alignment from file
95  *
96  *  This function reads the (first) multiple sequence alignment from
97  *  an input file. The read alignment is split into the sequence id/name
98  *  part and the actual sequence information and stored in memory as
99  *  arrays of ids/names and sequences. If the alignment file format
100  *  allows for additional information, such as an ID of the entire alignment
101  *  or consensus structure information, this data is retrieved as well
102  *  and made available. The @p options parameter lets the caller specify the
103  *  set of alignment file formats that should be used to retrieve the data.
104  *  If 0 is passed as option, the list of alignment file formats defaults to
105  *  #VRNA_FILE_FORMAT_MSA_DEFAULT.
106  *
107  *  Currently, the list of parsable multiple sequence alignment file formats
108  *  consists of:
109  *  - @ref msa-formats-clustal
110  *  - @ref msa-formats-stockholm
111  *  - @ref msa-formats-fasta
112  *  - @ref msa-formats-maf
113  *  .
114  *
115  *  @note After successfully reading an alignment, this function performs
116  *        a validation of the data that includes uniqueness of the sequence
117  *        identifiers, and equal sequence lengths. This check can be
118  *        deactivated by passing #VRNA_FILE_FORMAT_MSA_NOCHECK in the
119  *        @p options parameter.
120  *
121  *  @note It is the user's responsibility to free any memory occupied by
122  *        the output arguments @p names, @p aln, @p id, and @p structure
123  *        after calling this function. The function automatically sets the
124  *        latter two arguments to <tt>NULL</tt> in case no corresponding
125  *        data could be retrieved from the input alignment.
126  *
127  *  @see  vrna_file_msa_read_record(), #VRNA_FILE_FORMAT_MSA_CLUSTAL,
128  *        #VRNA_FILE_FORMAT_MSA_STOCKHOLM, #VRNA_FILE_FORMAT_MSA_FASTA,
129  *        #VRNA_FILE_FORMAT_MSA_MAF, #VRNA_FILE_FORMAT_MSA_DEFAULT,
130  *        #VRNA_FILE_FORMAT_MSA_NOCHECK, #VRNA_FILE_FORMAT_MSA_QUIET, #VRNA_FILE_FORMAT_MSA_SILENT
131  *
132  *  @param  filename    The name of the input file that contains the alignment
133  *  @param  names       An address to the pointer where sequence identifiers
134  *                      should be written to
135  *  @param  aln         An address to the pointer where aligned sequences should
136  *                      be written to
137  *  @param  id          An address to the pointer where the alignment ID should
138  *                      be written to (may be NULL)
139  *  @param  structure   An address to the pointer where consensus structure
140  *                      information should be written to (may be NULL)
141  *  @param  options     Options to manipulate the behavior of this function
142  *  @return             The number of sequences in the alignment, or -1 if
143  *                      no alignment record could be found
144  */
145 int
146 vrna_file_msa_read(const char   *filename,
147                    char         ***names,
148                    char         ***aln,
149                    char         **id,
150                    char         **structure,
151                    unsigned int options);
152 
153 
154 /**
155  *  @brief Read a multiple sequence alignment from file handle
156  *
157  *  Similar to vrna_file_msa_read(), this function reads a multiple
158  *  sequence alignment from an input file handle. Because it operates on a file
159  *  handle, this function is not limited to the first alignment record,
160  *  but allows for looping over all alignments within the input.
161  *
162  *  The read alignment is split into the sequence id/name
163  *  part and the actual sequence information and stored in memory as
164  *  arrays of ids/names and sequences. If the alignment file format
165  *  allows for additional information, such as an ID of the entire alignment
166  *  or consensus structure information, this data is retrieved as well
167  *  and made available. The @p options parameter lets the caller specify the
168  *  alignment file format used to retrieve the data. A single format
169  *  must be specified here; see vrna_file_msa_detect_format() for help
170  *  in determining the correct MSA file format.
171  *
172  *  Currently, the list of parsable multiple sequence alignment file formats
173  *  consists of:
174  *  - @ref msa-formats-clustal
175  *  - @ref msa-formats-stockholm
176  *  - @ref msa-formats-fasta
177  *  - @ref msa-formats-maf
178  *  .
179  *
180  *  @note After successfully reading an alignment, this function performs
181  *        a validation of the data that includes uniqueness of the sequence
182  *        identifiers, and equal sequence lengths. This check can be
183  *        deactivated by passing #VRNA_FILE_FORMAT_MSA_NOCHECK in the
184  *        @p options parameter.
185  *
186  *  @note It is the user's responsibility to free any memory occupied by
187  *        the output arguments @p names, @p aln, @p id, and @p structure
188  *        after calling this function. The function automatically sets the
189  *        latter two arguments to <tt>NULL</tt> in case no corresponding
190  *        data could be retrieved from the input alignment.
191  *
192  *  @see  vrna_file_msa_read(), vrna_file_msa_detect_format(),
193  *        #VRNA_FILE_FORMAT_MSA_CLUSTAL, #VRNA_FILE_FORMAT_MSA_STOCKHOLM,
194  *        #VRNA_FILE_FORMAT_MSA_FASTA, #VRNA_FILE_FORMAT_MSA_MAF,
195  *        #VRNA_FILE_FORMAT_MSA_DEFAULT, #VRNA_FILE_FORMAT_MSA_NOCHECK, #VRNA_FILE_FORMAT_MSA_QUIET, #VRNA_FILE_FORMAT_MSA_SILENT
196  *
197  *  @param  fp          The file pointer the data will be retrieved from
198  *  @param  names       An address to the pointer where sequence identifiers
199  *                      should be written to
200  *  @param  aln         An address to the pointer where aligned sequences should
201  *                      be written to
202  *  @param  id          An address to the pointer where the alignment ID should
203  *                      be written to (may be NULL)
204  *  @param  structure   An address to the pointer where consensus structure
205  *                      information should be written to (may be NULL)
206  *  @param  options     Options to manipulate the behavior of this function
207  *  @return             The number of sequences in the alignment, or -1 if
208  *                      no alignment record could be found
209  */
210 int
211 vrna_file_msa_read_record(FILE          *fp,
212                           char          ***names,
213                           char          ***aln,
214                           char          **id,
215                           char          **structure,
216                           unsigned int  options);
217 
218 
219 /**
220  *  @brief Detect the format of a multiple sequence alignment file
221  *
222  *  This function attempts to determine the format of a file that
223  *  supposedly contains a multiple sequence alignment (MSA). This is
224  *  useful in cases where an MSA file contains more than a single record
225  *  and therefore vrna_file_msa_read() cannot be applied, since
226  *  it only retrieves the first.
227  *  Here, one can try to guess the correct file format using this
228  *  function and then loop over the file, record by record using one
229  *  of the low-level record retrieval functions for the corresponding
230  *  MSA file format.
231  *
232  *  @note This function parses the entire first record within the
233  *        specified file. As a result, it returns #VRNA_FILE_FORMAT_MSA_UNKNOWN
234  *        not only if it can't detect the file's format, but also
235  *        in cases where the file doesn't contain sequences!
236  *
237  *  @see  vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_stockholm_read_record(),
238  *        vrna_file_clustal_read_record(), vrna_file_fasta_read_record(), #VRNA_FILE_FORMAT_MSA_UNKNOWN
239  *
240  *  @param  filename  The name of the input file that contains the alignment
241  *  @param  options   Options to manipulate the behavior of this function
242  *  @return           The MSA file format, or #VRNA_FILE_FORMAT_MSA_UNKNOWN
243  */
244 unsigned int
245 vrna_file_msa_detect_format(const char    *filename,
246                             unsigned int  options);
247 
248 
249 /**
250  *  @brief Write multiple sequence alignment file
251  *
252  *  @note Currently, we only support @ref msa-formats-stockholm output
253  *
254  *  @see  #VRNA_FILE_FORMAT_MSA_STOCKHOLM, #VRNA_FILE_FORMAT_MSA_APPEND,
255  *        #VRNA_FILE_FORMAT_MSA_MIS
256  *
257  *  @param  filename  The output filename
258  *  @param  names     The array of sequence names / identifiers
259  *  @param  aln       The array of aligned sequences
260  *  @param  id        An optional ID for the alignment
261  *  @param  structure An optional consensus structure
262  *  @param  source    A string describing the source of the alignment
263  *  @param  options   Options to manipulate the behavior of this function
264  *  @return           Non-zero upon successfully writing the alignment to file
265  */
266 int
267 vrna_file_msa_write(const char    *filename,
268                     const char    **names,
269                     const char    **aln,
270                     const char    *id,
271                     const char    *structure,
272                     const char    *source,
273                     unsigned int  options);
274 
275 
276 /**
277  * @}
278  */
279 
280 #endif
281