#ifndef VIENNA_RNA_PACKAGE_FILE_FORMATS_MSA_H
#define VIENNA_RNA_PACKAGE_FILE_FORMATS_MSA_H

/**
 *  @file     ViennaRNA/io/file_formats_msa.h
 *  @ingroup  utils, file_utils, file_formats_msa
 *  @brief    Functions dealing with file formats for Multiple Sequence Alignments (MSA)
 */

#include <stdio.h>

/**
 *  @addtogroup file_formats_msa
 *  @{
 *  @brief  Functions to read/write multiple sequence alignments (MSA) in various file formats
 */

/**
 *  @brief  Option flag indicating ClustalW formatted files
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_CLUSTAL      1U

/**
 *  @brief  Option flag indicating Stockholm 1.0 formatted files
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_STOCKHOLM    2U

/**
 *  @brief  Option flag indicating FASTA (Pearson) formatted files
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_FASTA        4U

/**
 *  @brief  Option flag indicating MAF formatted files
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_MAF          8U

/**
 *  @brief  Option flag indicating most informative sequence (MIS) output
 *
 *  The default reference sequence output for an alignment is simply a consensus sequence.
 *  This flag allows writing the most informative sequence (MIS) instead.
 *
 *  @see    vrna_file_msa_write()
 */
#define VRNA_FILE_FORMAT_MSA_MIS          16U

/**
 *  @brief  Option flag indicating the set of default file formats
 *
 *  This is the bitwise OR of the ClustalW, Stockholm, FASTA, and MAF format flags.
 *
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record(), vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_DEFAULT      ( \
    VRNA_FILE_FORMAT_MSA_CLUSTAL \
    | VRNA_FILE_FORMAT_MSA_STOCKHOLM \
    | VRNA_FILE_FORMAT_MSA_FASTA \
    | VRNA_FILE_FORMAT_MSA_MAF \
    )

/**
 *  @brief  Option flag to disable validation of the alignment
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record()
 */
#define VRNA_FILE_FORMAT_MSA_NOCHECK      4096U

/**
 *  @brief  Return flag of vrna_file_msa_detect_format() to indicate unknown or malformatted alignment
 *  @see    vrna_file_msa_detect_format()
 */
#define VRNA_FILE_FORMAT_MSA_UNKNOWN      8192U

/**
 *  @brief  Option flag indicating to append data to a multiple sequence alignment file rather than overwriting it
 *  @see    vrna_file_msa_write()
 */
#define VRNA_FILE_FORMAT_MSA_APPEND       16384U

/**
 *  @brief  Option flag to suppress unnecessary spam messages on <tt>stderr</tt>
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record()
 */
#define VRNA_FILE_FORMAT_MSA_QUIET        32768U

/**
 *  @brief  Option flag to completely silence any warnings on <tt>stderr</tt>
 *  @see    vrna_file_msa_read(), vrna_file_msa_read_record()
 */
#define VRNA_FILE_FORMAT_MSA_SILENT       65536U

/**
 *  @brief Read a multiple sequence alignment from file
 *
 *  This function reads the (first) multiple sequence alignment from
 *  an input file. The read alignment is split into the sequence id/name
 *  part and the actual sequence information and stored in memory as
 *  arrays of ids/names and sequences. If the alignment file format
 *  allows for additional information, such as an ID of the entire alignment
 *  or consensus structure information, this data is retrieved as well
 *  and made available. The @p options parameter allows specifying the
 *  set of alignment file formats that should be used to retrieve the data.
 *  If 0 is passed as option, the list of alignment file formats defaults to
 *  #VRNA_FILE_FORMAT_MSA_DEFAULT.
 *
 *  Currently, the list of parsable multiple sequence alignment file formats
 *  consists of:
 *  - @ref msa-formats-clustal
 *  - @ref msa-formats-stockholm
 *  - @ref msa-formats-fasta
 *  - @ref msa-formats-maf
 *  .
 *
 *  @note After successfully reading an alignment, this function performs
 *        a validation of the data that includes uniqueness of the sequence
 *        identifiers, and equal sequence lengths. This check can be
 *        deactivated by passing #VRNA_FILE_FORMAT_MSA_NOCHECK in the
 *        @p options parameter.
 *
 *  @note It is the user's responsibility to free any memory occupied by
 *        the output arguments @p names, @p aln, @p id, and @p structure
 *        after calling this function. The function automatically sets the
 *        latter two arguments to <tt>NULL</tt> in case no corresponding
 *        data could be retrieved from the input alignment.
 *
 *  @see  vrna_file_msa_read_record(), #VRNA_FILE_FORMAT_MSA_CLUSTAL,
 *        #VRNA_FILE_FORMAT_MSA_STOCKHOLM, #VRNA_FILE_FORMAT_MSA_FASTA,
 *        #VRNA_FILE_FORMAT_MSA_MAF, #VRNA_FILE_FORMAT_MSA_DEFAULT,
 *        #VRNA_FILE_FORMAT_MSA_NOCHECK
 *
 *  @param  filename    The name of input file that contains the alignment
 *  @param  names       An address to the pointer where sequence identifiers
 *                      should be written to
 *  @param  aln         An address to the pointer where aligned sequences should
 *                      be written to
 *  @param  id          An address to the pointer where the alignment ID should
 *                      be written to (may be NULL)
 *  @param  structure   An address to the pointer where consensus structure
 *                      information should be written to (may be NULL)
 *  @param  options     Options to manipulate the behavior of this function
 *  @return             The number of sequences in the alignment, or -1 if
 *                      no alignment record could be found
 */
int
vrna_file_msa_read(const char   *filename,
                   char         ***names,
                   char         ***aln,
                   char         **id,
                   char         **structure,
                   unsigned int options);


/**
 *  @brief Read a multiple sequence alignment from file handle
 *
 *  Similar to vrna_file_msa_read(), this function reads a multiple
 *  sequence alignment from an input file handle. Since using a file
 *  handle, this function is not limited to the first alignment record,
 *  but allows for looping over all alignments within the input.
 *
 *  The read alignment is split into the sequence id/name
 *  part and the actual sequence information and stored in memory as
 *  arrays of ids/names and sequences. If the alignment file format
 *  allows for additional information, such as an ID of the entire alignment
 *  or consensus structure information, this data is retrieved as well
 *  and made available. The @p options parameter allows specifying the
 *  alignment file format used to retrieve the data. A single format
 *  must be specified here, see vrna_file_msa_detect_format() for helping
 *  to determine the correct MSA file format.
 *
 *  Currently, the list of parsable multiple sequence alignment file formats
 *  consists of:
 *  - @ref msa-formats-clustal
 *  - @ref msa-formats-stockholm
 *  - @ref msa-formats-fasta
 *  - @ref msa-formats-maf
 *  .
 *
 *  @note After successfully reading an alignment, this function performs
 *        a validation of the data that includes uniqueness of the sequence
 *        identifiers, and equal sequence lengths. This check can be
 *        deactivated by passing #VRNA_FILE_FORMAT_MSA_NOCHECK in the
 *        @p options parameter.
 *
 *  @note It is the user's responsibility to free any memory occupied by
 *        the output arguments @p names, @p aln, @p id, and @p structure
 *        after calling this function. The function automatically sets the
 *        latter two arguments to <tt>NULL</tt> in case no corresponding
 *        data could be retrieved from the input alignment.
 *
 *  @see  vrna_file_msa_read(), vrna_file_msa_detect_format(),
 *        #VRNA_FILE_FORMAT_MSA_CLUSTAL, #VRNA_FILE_FORMAT_MSA_STOCKHOLM,
 *        #VRNA_FILE_FORMAT_MSA_FASTA, #VRNA_FILE_FORMAT_MSA_MAF,
 *        #VRNA_FILE_FORMAT_MSA_DEFAULT, #VRNA_FILE_FORMAT_MSA_NOCHECK
 *
 *  @param  fp          The file pointer the data will be retrieved from
 *  @param  names       An address to the pointer where sequence identifiers
 *                      should be written to
 *  @param  aln         An address to the pointer where aligned sequences should
 *                      be written to
 *  @param  id          An address to the pointer where the alignment ID should
 *                      be written to (may be NULL)
 *  @param  structure   An address to the pointer where consensus structure
 *                      information should be written to (may be NULL)
 *  @param  options     Options to manipulate the behavior of this function
 *  @return             The number of sequences in the alignment, or -1 if
 *                      no alignment record could be found
 */
int
vrna_file_msa_read_record(FILE          *fp,
                          char          ***names,
                          char          ***aln,
                          char          **id,
                          char          **structure,
                          unsigned int  options);


/**
 *  @brief Detect the format of a multiple sequence alignment file
 *
 *  This function attempts to determine the format of a file that
 *  supposedly contains a multiple sequence alignment (MSA). This is
 *  useful in cases where an MSA file contains more than a single record
 *  and therefore vrna_file_msa_read() cannot be applied, since
 *  it only retrieves the first.
 *  Here, one can try to guess the correct file format using this
 *  function and then loop over the file, record by record using one
 *  of the low-level record retrieval functions for the corresponding
 *  MSA file format.
 *
 *  @note This function parses the entire first record within the
 *        specified file. As a result, it returns #VRNA_FILE_FORMAT_MSA_UNKNOWN
 *        not only if it can't detect the file's format, but also
 *        in cases where the file doesn't contain sequences!
 *
 *  @see  vrna_file_msa_read(), vrna_file_stockholm_read_record(),
 *        vrna_file_clustal_read_record(), vrna_file_fasta_read_record()
 *
 *  @param  filename    The name of input file that contains the alignment
 *  @param  options     Options to manipulate the behavior of this function
 *  @return             The MSA file format, or #VRNA_FILE_FORMAT_MSA_UNKNOWN
 */
unsigned int
vrna_file_msa_detect_format(const char    *filename,
                            unsigned int  options);


/**
 *  @brief Write multiple sequence alignment file
 *
 *  @note Currently, we only support @ref msa-formats-stockholm output
 *
 *  @see  #VRNA_FILE_FORMAT_MSA_STOCKHOLM, #VRNA_FILE_FORMAT_MSA_APPEND,
 *        #VRNA_FILE_FORMAT_MSA_MIS
 *
 *  @param  filename    The output filename
 *  @param  names       The array of sequence names / identifiers
 *  @param  aln         The array of aligned sequences
 *  @param  id          An optional ID for the alignment
 *  @param  structure   An optional consensus structure
 *  @param  source      A string describing the source of the alignment
 *  @param  options     Options to manipulate the behavior of this function
 *  @return             Non-zero upon successfully writing the alignment to file
 */
int
vrna_file_msa_write(const char    *filename,
                    const char    **names,
                    const char    **aln,
                    const char    *id,
                    const char    *structure,
                    const char    *source,
                    unsigned int  options);


/**
 * @}
 */

#endif