1 /*
2  *  file_formats_msa.c
3  *
4  *  Various functions dealing with file formats for Multiple Sequence Alignments (MSA)
5  *
6  *  (c) 2016 Ronny Lorenz
7  *
8  *  ViennaRNA package
9  */
10 
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <math.h>
19 #include <ctype.h>
20 
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/utils/alignments.h"
24 #include "ViennaRNA/io/utils.h"
25 #include "ViennaRNA/io/file_formats.h"
26 #include "ViennaRNA/io/file_formats_msa.h"
27 
28 /*
29  #################################
30  # STATIC DECLARATIONS           #
31  #################################
32  */
33 
34 typedef int (aln_parser_function)(FILE  *fp,
35                                   char  ***names,
36                                   char  ***aln,
37                                   char  **id,
38                                   char  **structure,
39                                   int   verbosity);
40 typedef int (aln_writer_function)(FILE          *fp,
41                                   const char    **names,
42                                   const char    **aln,
43                                   const char    *id,
44                                   const char    *structure,
45                                   const char    *source,
46                                   unsigned int  options,
47                                   int           verbosity);
48 
49 typedef struct {
50   unsigned int        code;
51   aln_parser_function *parser;
52   const char          *name;
53 } parsable;
54 
55 typedef struct {
56   unsigned int        code;
57   aln_writer_function *writer;
58   const char          *name;
59 } writable;
60 
61 PRIVATE aln_parser_function parse_aln_stockholm;
62 
63 PRIVATE aln_parser_function parse_aln_clustal;
64 
65 PRIVATE aln_parser_function parse_aln_fasta;
66 
67 PRIVATE aln_parser_function parse_aln_maf;
68 
69 PRIVATE aln_writer_function write_aln_stockholm;
70 
71 PRIVATE int
72 parse_fasta_alignment(FILE  *fp,
73                       char  ***names,
74                       char  ***aln,
75                       int   verbosity);
76 
77 
78 PRIVATE int
79 parse_clustal_alignment(FILE  *clust,
80                         char  ***names,
81                         char  ***aln,
82                         int   verbosity);
83 
84 
85 PRIVATE int
86 parse_stockholm_alignment(FILE  *fp,
87                           char  ***aln,
88                           char  ***names,
89                           char  **id,
90                           char  **structure,
91                           int   verbosity);
92 
93 
94 PRIVATE int
95 parse_maf_alignment(FILE  *fp,
96                     char  ***aln,
97                     char  ***names,
98                     int   verbosity);
99 
100 
101 PRIVATE int
102 write_stockholm_alignment(FILE          *fp,
103                           const char    **names,
104                           const char    **aln,
105                           const char    *id,
106                           const char    *structure,
107                           const char    *source,
108                           unsigned int  options,
109                           int           verbosity);
110 
111 
112 PRIVATE int
113 check_alignment(const char  **names,
114                 const char  **aln,
115                 int         seq_num,
116                 int         verbosity);
117 
118 
119 PRIVATE void
120 free_msa_record(char  ***names,
121                 char  ***aln,
122                 char  **id,
123                 char  **structure);
124 
125 
126 PRIVATE void
127 add_sequence(const char *id,
128              const char *seq,
129              char       ***names,
130              char       ***aln,
131              int        seq_num);
132 
133 
134 PRIVATE void
135 endmarker_msa_record(char ***names,
136                      char ***aln,
137                      int  seq_num);
138 
139 
140 /*
141  #################################
142  # STATIC VARIABLES              #
143  #################################
144  */
145 
146 /* number of known alignment parsers */
147 #define NUM_PARSERS 4
148 #define NUM_WRITERS 1
149 
150 static parsable known_parsers[NUM_PARSERS] = {
151   /* option, parser, name */
152   { VRNA_FILE_FORMAT_MSA_STOCKHOLM, parse_aln_stockholm, "Stockholm 1.0 format" },
153   { VRNA_FILE_FORMAT_MSA_CLUSTAL,   parse_aln_clustal,   "ClustalW format"      },
154   { VRNA_FILE_FORMAT_MSA_FASTA,     parse_aln_fasta,     "FASTA format"         },
155   { VRNA_FILE_FORMAT_MSA_MAF,       parse_aln_maf,       "MAF format"           }
156 };
157 
158 static writable known_writers[NUM_WRITERS] = {
159   /* option, parser, name */
160   { VRNA_FILE_FORMAT_MSA_STOCKHOLM, write_aln_stockholm, "Stockholm 1.0 format" }
161 };
162 
163 /*
164  #################################
165  # BEGIN OF FUNCTION DEFINITIONS #
166  #################################
167  */
168 PUBLIC unsigned int
vrna_file_msa_detect_format(const char * filename,unsigned int options)169 vrna_file_msa_detect_format(const char    *filename,
170                             unsigned int  options)
171 {
172   FILE          *fp;
173   char          **names, **aln;
174   unsigned int  format;
175   int           i, r;
176   long int      fp_position;
177 
178   names   = NULL;
179   aln     = NULL;
180   format  = VRNA_FILE_FORMAT_MSA_UNKNOWN;
181 
182   /* if no alignment file format(s) were specified we probe for all of them */
183   if (options == 0)
184     options = VRNA_FILE_FORMAT_MSA_DEFAULT;
185 
186   if (!(fp = fopen(filename, "r"))) {
187     if (!(options & VRNA_FILE_FORMAT_MSA_SILENT))
188       vrna_message_warning("vrna_file_msa_detect_format: "
189                            "Can't open alignment file \"%s\"!",
190                            filename);
191 
192     return format;
193   }
194 
195   r           = -1;
196   fp_position = ftell(fp);
197 
198   for (i = 0; i < NUM_PARSERS; i++) {
199     if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
200       /* go back to beginning of file */
201       if (!fseek(fp, fp_position, SEEK_SET)) {
202         r = known_parsers[i].parser(fp, &names, &aln, NULL, NULL, -1);
203         free_msa_record(&names, &aln, NULL, NULL);
204         if (r > 0) {
205           format = known_parsers[i].code;
206           break;
207         }
208       } else {
209         vrna_message_warning("vrna_file_msa_detect_format: "
210                              "Something unexpected happened while parsing the alignment file");
211         goto msa_detect_format_exit;
212       }
213     }
214   }
215 
216 msa_detect_format_exit:
217 
218   fclose(fp);
219 
220   return format;
221 }
222 
223 
224 PUBLIC int
vrna_file_msa_read(const char * filename,char *** names,char *** aln,char ** id,char ** structure,unsigned int options)225 vrna_file_msa_read(const char   *filename,
226                    char         ***names,
227                    char         ***aln,
228                    char         **id,
229                    char         **structure,
230                    unsigned int options)
231 {
232   FILE      *fp;
233   int       i, seq_num, r, verb_level;
234   long int  fp_position;
235 
236   verb_level  = 1; /* we default to be very verbose */
237   seq_num     = 0;
238 
239   if (options & VRNA_FILE_FORMAT_MSA_QUIET)
240     verb_level = 0;
241 
242   if (options & VRNA_FILE_FORMAT_MSA_SILENT)
243     verb_level = -1;
244 
245   if (!(fp = fopen(filename, "r"))) {
246     if (verb_level >= 0)
247       vrna_message_warning("vrna_file_msa_read: "
248                            "Can't open alignment file \"%s\"!",
249                            filename);
250 
251     return seq_num;
252   }
253 
254   if (names && aln) {
255     *names  = NULL;
256     *aln    = NULL;
257   } else {
258     return seq_num;
259   }
260 
261   if (id)
262     *id = NULL;
263 
264   if (structure)
265     *structure = NULL;
266 
267   /* if no alignment file format was specified, lets try to guess it */
268   if (options == 0)
269     options = VRNA_FILE_FORMAT_MSA_DEFAULT;
270 
271   r           = -1;
272   fp_position = ftell(fp);
273 
274   for (i = 0; i < NUM_PARSERS; i++) {
275     if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
276       /* go back to beginning of file */
277       if (!fseek(fp, fp_position, SEEK_SET)) {
278         r = known_parsers[i].parser(fp, names, aln, id, structure, verb_level);
279         if (r > 0)
280           break;
281       } else {
282         vrna_message_warning("vrna_file_msa_read: "
283                              "Something unexpected happened while parsing the alignment file");
284         goto msa_read_exit;
285       }
286     }
287   }
288 
289   if (r == -1) {
290     if (verb_level >= 0)
291       vrna_message_warning("vrna_file_msa_read: "
292                            "Alignment file parser is unknown (or not specified?)");
293   } else {
294     seq_num = r;
295 
296     if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
297       if (!check_alignment((const char **)(*names), (const char **)(*aln), seq_num, verb_level)) {
298         if (verb_level >= 0)
299           vrna_message_warning("vrna_file_msa_read: "
300                                "Alignment did not pass sanity checks!");
301 
302         /* discard the data we've read! */
303         free_msa_record(names, aln, id, structure);
304 
305         seq_num = 0;
306       }
307     }
308   }
309 
310 msa_read_exit:
311 
312   fclose(fp);
313 
314   return seq_num;
315 }
316 
317 
318 PUBLIC int
vrna_file_msa_read_record(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,unsigned int options)319 vrna_file_msa_read_record(FILE          *fp,
320                           char          ***names,
321                           char          ***aln,
322                           char          **id,
323                           char          **structure,
324                           unsigned int  options)
325 {
326   const char          *parser_name;
327   int                 i, r, seq_num, verb_level;
328   aln_parser_function *parser;
329 
330   verb_level  = 1; /* we default to be very verbose */
331   seq_num     = 0;
332   parser_name = NULL;
333   parser      = NULL;
334 
335   if (options & VRNA_FILE_FORMAT_MSA_QUIET)
336     verb_level = 0;
337 
338   if (options & VRNA_FILE_FORMAT_MSA_SILENT)
339     verb_level = -1;
340 
341   if (!fp) {
342     if (verb_level >= 0)
343       vrna_message_warning("Can't read alignment from file pointer!");
344 
345     return seq_num;
346   }
347 
348   if (names && aln) {
349     *names  = NULL;
350     *aln    = NULL;
351   } else {
352     return seq_num;
353   }
354 
355   if (id)
356     *id = NULL;
357 
358   if (structure)
359     *structure = NULL;
360 
361   for (r = i = 0; i < NUM_PARSERS; i++) {
362     if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
363       if (!parser) {
364         parser      = known_parsers[i].parser;
365         parser_name = known_parsers[i].name;
366       }
367 
368       r++;
369     }
370   }
371 
372   if (r == 0) {
373     if (verb_level >= 0)
374       vrna_message_warning("Did not find parser for specified MSA format!");
375   } else {
376     if ((r > 1) && (verb_level > 0))
377       vrna_message_warning("More than one MSA format parser specified!\n"
378                            "Using parser for %s", parser_name);
379 
380     seq_num = parser(fp, names, aln, id, structure, verb_level);
381 
382     if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
383       if (!check_alignment((const char **)(*names), (const char **)(*aln), seq_num, verb_level)) {
384         if (verb_level >= 0)
385           vrna_message_warning("Alignment did not pass sanity checks!");
386 
387         /* discard the data we've read! */
388         free_msa_record(names, aln, id, structure);
389 
390         seq_num = -1;
391       }
392     }
393   }
394 
395   return seq_num;
396 }
397 
398 
399 PUBLIC int
vrna_file_msa_write(const char * filename,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options)400 vrna_file_msa_write(const char    *filename,
401                     const char    **names,
402                     const char    **aln,
403                     const char    *id,
404                     const char    *structure,
405                     const char    *source,
406                     unsigned int  options)
407 {
408   int ret, verb_level;
409 
410   verb_level  = 1;  /* we default to be very verbose */
411   ret         = 0;  /* failure */
412 
413   if (options & VRNA_FILE_FORMAT_MSA_QUIET)
414     verb_level = 0;
415 
416   if (options & VRNA_FILE_FORMAT_MSA_SILENT)
417     verb_level = -1;
418 
419   if (filename && names && aln) {
420     /* we require at least a filename, the sequence identifiers (names), and the alignment */
421     FILE                *fp;
422     int                 i, r, seq_num;
423     const char          *writer_name;
424     aln_writer_function *writer;
425 
426     r           = 0;
427     seq_num     = 0;
428     writer_name = NULL;
429     writer      = NULL;
430 
431     /* find out the number of sequences in the alignment */
432     for (seq_num = 0; aln[seq_num]; seq_num++);
433 
434     if (seq_num == 0) {
435       if (verb_level >= 0)
436         vrna_message_warning("Alignment did not pass sanity checks!");
437 
438       return ret;
439     }
440 
441     /* check the alignment itself for consistency */
442     if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
443       if (!check_alignment(names, aln, seq_num, verb_level)) {
444         if (verb_level >= 0)
445           vrna_message_warning("Alignment did not pass sanity checks!");
446 
447         return ret;
448       }
449     }
450 
451     /* detect output format */
452     for (i = 0; i < NUM_WRITERS; i++) {
453       if ((options & known_writers[i].code) && (known_writers[i].writer)) {
454         if (!writer) {
455           writer      = known_writers[i].writer;
456           writer_name = known_writers[i].name;
457         }
458 
459         r++;
460       }
461     }
462 
463     if (r == 0) {
464       if (verb_level >= 0)
465         vrna_message_warning("Did not find writer for specified MSA format!");
466     } else {
467       if ((r > 1) && (verb_level > 0))
468         vrna_message_warning("More than one MSA format writer specified!\n"
469                              "Using writer for %s",
470                              writer_name);
471 
472       /* try opening the output stream */
473       if (options & VRNA_FILE_FORMAT_MSA_APPEND)
474         fp = fopen(filename, "a");
475       else
476         fp = fopen(filename, "w");
477 
478       if (!fp) {
479         if (verb_level >= 0)
480           vrna_message_warning("Alignment file could not be opened for writing!");
481 
482         return ret;
483       }
484 
485       /* write output alignment to file */
486       ret = writer(fp, names, aln, id, structure, source, options, verb_level);
487 
488       /* close output stream */
489       fclose(fp);
490     }
491   } else if (verb_level >= 0) {
492     vrna_message_warning("vrna_file_msa_write: insufficient input for writing anything!");
493   }
494 
495   return ret;
496 }
497 
498 
499 PRIVATE int
parse_stockholm_alignment(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)500 parse_stockholm_alignment(FILE  *fp,
501                           char  ***names,
502                           char  ***aln,
503                           char  **id,
504                           char  **structure,
505                           int   verbosity)
506 {
507   char  *line = NULL;
508   int   i, n, seq_num, seq_current;
509 
510   seq_num     = 0;
511   seq_current = 0;
512 
513   if (!fp) {
514     if (verbosity >= 0)
515       vrna_message_warning(
516         "Can't read from filepointer while parsing Stockholm formatted sequence alignment!");
517 
518     return -1;
519   }
520 
521   if (names && aln) {
522     *names  = NULL;
523     *aln    = NULL;
524   } else {
525     return -1;
526   }
527 
528   if (id)
529     *id = NULL;
530 
531   if (structure)
532     *structure = NULL;
533 
534   int inrecord = 0;
535   while ((line = vrna_read_line(fp))) {
536     if (strstr(line, "STOCKHOLM 1.0")) {
537       inrecord    = 1;
538       free(line);
539       break;
540     }
541 
542     free(line);
543   }
544 
545   if (inrecord) {
546     while ((line = vrna_read_line(fp))) {
547       if (strncmp(line, "//", 2) == 0) {
548         /* end of alignment */
549         free(line);
550         line = NULL;
551         break;
552       }
553 
554       n = (int)strlen(line);
555 
556       switch (*line) {
557         /* we skip lines that start with whitespace */
558         case ' ':
559         case '\0':
560           seq_current = 0; /* reset number of current sequence */
561           goto stockholm_next_line;
562 
563         /* Stockholm markup, or comment */
564         case '#':
565           if (strstr(line, "STOCKHOLM 1.0")) {
566             if (verbosity >= 0)
567               vrna_message_warning("Malformatted Stockholm record, missing // ?");
568 
569             /* drop everything we've read so far and start new, blank record */
570             free_msa_record(names, aln, id, structure);
571 
572             seq_num = 0;
573           } else if (strncmp(line, "#=GF", 4) == 0) {
574             /* found feature markup */
575             if ((id != NULL) && (strncmp(line, "#=GF ID", 7) == 0)) {
576               *id = (char *)vrna_alloc(sizeof(char) * n);
577               if (sscanf(line, "#=GF ID %s", *id) == 1) {
578                 *id = (char *)vrna_realloc(*id, sizeof(char) * (strlen(*id) + 1));
579               } else {
580                 free(*id);
581                 *id = NULL;
582               }
583             }
584           } else if (strncmp(line, "#=GC", 4) == 0) {
585             /* found per-column annotation */
586             if ((structure != NULL) && (strncmp(line, "#=GC SS_cons ", 13) == 0)) {
587               char *ss = (char *)vrna_alloc(sizeof(char) * n);
588               if (sscanf(line, "#=GC SS_cons %s", ss) == 1) {
589                 /* always append consensus structure */
590                 unsigned int prev_len = (*structure) ? strlen(*structure) : 0;
591                 unsigned int ss_len   = strlen(ss);
592                 *structure = (char *)vrna_realloc(*structure,
593                                                   sizeof(char) *
594                                                   (prev_len + ss_len + 1));
595                 memcpy(*structure + prev_len,
596                        ss,
597                        sizeof(char) * ss_len);
598                 (*structure)[prev_len + ss_len] = '\0';
599               }
600 
601               free(ss);
602             }
603           } else if (strncmp(line, "#=GS", 4) == 0) {
604             /* found generic per-sequence annotation */
605           } else if (strncmp(line, "#=GR", 4) == 0) {
606             /* found generic per-Residue annotation */
607           } else {
608             /* may be comment? */
609           }
610 
611           break;
612 
613         /* should be sequence */
614         default:
615         {
616           char  *tmp_name = (char *)vrna_alloc(sizeof(char) * (n + 1));
617           char  *tmp_seq  = (char *)vrna_alloc(sizeof(char) * (n + 1));
618           if (sscanf(line, "%s %s", tmp_name, tmp_seq) == 2) {
619             for (i = 0; i < strlen(tmp_seq); i++)
620               if (tmp_seq[i] == '.') /* replace '.' gaps with '-' */
621                 tmp_seq[i] = '-';
622 
623             if (seq_current == seq_num) {
624               /* first time */
625               add_sequence(tmp_name, tmp_seq,
626                            names, aln,
627                            seq_current + 1);
628             } else {
629               if (strcmp(tmp_name, (*names)[seq_current]) != 0) {
630                 /* name doesn't match */
631                 if (verbosity >= 0)
632                   vrna_message_warning(
633                     "Sorry, your file is messed up! Inconsistent (order of) sequence identifiers.");
634 
635                 free(line);
636                 free(tmp_name);
637                 free(tmp_seq);
638                 return 0;
639               }
640               unsigned int tmp_seq_len = strlen(tmp_seq);
641               unsigned int current_len = strlen((*aln)[seq_current]);
642 
643               (*aln)[seq_current] = (char *)vrna_realloc((*aln)[seq_current],
644                                                          current_len +
645                                                          tmp_seq_len +
646                                                          1);
647               memcpy((*aln)[seq_current] + current_len,
648                      tmp_seq,
649                      sizeof(char) * tmp_seq_len);
650 
651               (*aln)[seq_current][current_len + tmp_seq_len] = '\0';
652             }
653           }
654 
655           seq_current++;
656           if (seq_current > seq_num)
657             seq_num = seq_current;
658 
659           free(tmp_name);
660           free(tmp_seq);
661         }
662         break;
663       }
664 
665 stockholm_next_line:
666 
667       free(line);
668     }
669   } else {
670     /*
671      *  if (verbosity >= 0)
672      *    vrna_message_warning("Did not find any Stockholm 1.0 formatted record!");
673      */
674     return -1;
675   }
676 
677   free(line);
678 
679   endmarker_msa_record(names, aln, seq_num);
680 
681   if ((seq_num > 0) && (verbosity > 0))
682     vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
683                       (int)strlen((*aln)[0]));
684 
685   return seq_num;
686 }
687 
688 
689 PRIVATE int
write_stockholm_alignment(FILE * fp,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options,int verbosity)690 write_stockholm_alignment(FILE          *fp,
691                           const char    **names,
692                           const char    **aln,
693                           const char    *id,
694                           const char    *structure,
695                           const char    *source,
696                           unsigned int  options,
697                           int           verbosity)
698 {
699   int ret;
700 
701   ret = 1; /* success */
702 
703   if (fp) {
704     int s, seq_num, l, longest_name;
705 
706     /* detect sequence number and longest sequence name in alignment */
707     for (longest_name = seq_num = 0; names[seq_num]; seq_num++) {
708       l = (int)strlen(names[seq_num]);
709       if (l > longest_name)
710         longest_name = l;
711     }
712 
713     if (seq_num > 0) {
714       /* print header */
715       fprintf(fp, "# STOCKHOLM 1.0\n");
716 
717       if (id)   /* print the ID if available */
718         fprintf(fp, "#=GF ID %s\n", id);
719 
720       if (structure) {
721         /* print structure source if structure is present */
722         if (!source)
723           source = "ViennaRNA Package prediction";
724 
725         fprintf(fp, "#=GF SS %s\n", source);
726         /*
727          * in case we print a consensus structure, reset the longest_name
728          * variable if longest name is shorter than '#=GC SS_cons' which
729          * has 12 characters
730          */
731         if (longest_name < 12)
732           longest_name = 12;
733       }
734 
735       /* print actual alignment */
736       for (s = 0; s < seq_num; s++)
737         fprintf(fp, "%-*s  %s\n", longest_name, names[s], aln[s]);
738 
739       /* output reference sequence */
740       char *cons = (options & VRNA_FILE_FORMAT_MSA_MIS) ?
741                     vrna_aln_consensus_mis(aln, NULL) :
742                     vrna_aln_consensus_sequence(aln, NULL);
743       fprintf(fp, "%-*s  %s\n", longest_name, "#=GC RF", cons);
744       free(cons);
745 
746       /* print consensus structure */
747       if (structure)
748         fprintf(fp, "%-*s  %s\n", longest_name, "#=GC SS_cons", structure);
749 
750       /* print record-end marker */
751       fprintf(fp, "//\n");
752     }
753   }
754 
755   return ret;
756 }
757 
758 
759 PRIVATE int
parse_fasta_alignment(FILE * fp,char *** names,char *** aln,int verbosity)760 parse_fasta_alignment(FILE  *fp,
761                       char  ***names,
762                       char  ***aln,
763                       int   verbosity)
764 {
765   unsigned int  read_opt, rec_type;
766   int           seq_num;
767   char          *rec_id, *rec_sequence, **rec_rest;
768 
769   rec_id        = NULL;
770   rec_sequence  = NULL;
771   rec_rest      = NULL;
772   seq_num       = 0;
773   read_opt      = VRNA_INPUT_NO_REST; /* read sequence and header information only */
774 
775   /* read until EOF or user abort */
776   while (
777     !((rec_type = vrna_file_fasta_read_record(&rec_id, &rec_sequence, &rec_rest, fp, read_opt))
778       & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))) {
779     if (rec_id) {
780       /* valid FASTA entry */
781       seq_num++;
782 
783       char *id = (char *)vrna_alloc(sizeof(char) * strlen(rec_id));
784       (void)sscanf(rec_id, ">%s", id);
785 
786       add_sequence(id, rec_sequence,
787                    names, aln,
788                    seq_num);
789 
790       free(id);
791     }
792 
793     free(rec_id);
794     free(rec_sequence);
795     free(rec_rest);
796   }
797 
798   free(rec_id);
799   free(rec_sequence);
800   free(rec_rest);
801 
802   endmarker_msa_record(names, aln, seq_num);
803 
804   if (seq_num > 0) {
805     if (verbosity > 0)
806       vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
807                         (int)strlen((*aln)[0]));
808   } else {
809     /*
810      *  if (verbosity >= 0)
811      *    vrna_message_warning("Did not find any FASTA formatted record!");
812      */
813     return -1;
814   }
815 
816   return seq_num;
817 }
818 
819 
820 PRIVATE int
parse_clustal_alignment(FILE * clust,char *** names,char *** aln,int verbosity)821 parse_clustal_alignment(FILE  *clust,
822                         char  ***names,
823                         char  ***aln,
824                         int   verbosity)
825 {
826   char  *line, *name, *seq;
827   int   n, nn = 0, seq_num = 0, i;
828 
829   if ((line = vrna_read_line(clust)) == NULL)
830     return -1;
831 
832   if (strncmp(line, "CLUSTAL", 7) != 0) {
833     if (verbosity >= 0)
834       vrna_message_warning("This doesn't look like a CLUSTALW file, sorry");
835 
836     free(line);
837     return -1;
838   }
839 
840   free(line);
841   line = vrna_read_line(clust);
842 
843   while (line != NULL) {
844     n = strlen(line);
845 
846     if ((n < 4) || isspace((int)line[0])) {
847       /* skip non-sequence line */
848       free(line);
849       line  = vrna_read_line(clust);
850       nn    = 0;  /* reset sequence number */
851       continue;
852     }
853 
854     /* skip comments */
855     if (line[0] == '#') {
856       free(line);
857       line = vrna_read_line(clust);
858       continue;
859     }
860 
861     seq   = (char *)vrna_alloc(sizeof(char) * (n + 1));
862     name  = (char *)vrna_alloc(sizeof(char) * (n + 1));
863     if (sscanf(line, "%s %s", name, seq) == 2) {
864       /* realloc to actual sizes */
865       seq   = (char *)vrna_realloc(seq, sizeof(char) * (strlen(seq) + 1));
866       name  = (char *)vrna_realloc(name, sizeof(char) * (strlen(name) + 1));
867       for (i = 0; i < strlen(seq); i++)
868         if (seq[i] == '.') /* replace '.' gaps with '-' */
869           seq[i] = '-';
870 
871       if (nn == seq_num) {
872         /* first time */
873         add_sequence(name, seq,
874                      names, aln,
875                      nn + 1);
876       } else {
877         if (strcmp(name, (*names)[nn]) != 0) {
878           /* name doesn't match */
879           if (verbosity >= 0)
880             vrna_message_warning(
881               "Sorry, your file is messed up! Inconsistent (order of) sequence identifiers.");
882 
883           free(line);
884           free(seq);
885           return 0;
886         }
887 
888         unsigned int current_len = strlen((*aln)[nn]);
889         unsigned int seq_len     = strlen(seq);
890 
891         (*aln)[nn] = (char *)vrna_realloc((*aln)[nn],
892                                           current_len +
893                                           seq_len +
894                                           1);
895         memcpy((*aln)[nn] + current_len,
896                seq,
897                sizeof(char) * seq_len);
898 
899         (*aln)[nn][current_len + seq_len] = '\0';
900       }
901 
902       nn++;
903       if (nn > seq_num)
904         seq_num = nn;
905 
906       free(seq);
907       free(name);
908     }
909 
910     free(line);
911 
912     line = vrna_read_line(clust);
913   }
914 
915   endmarker_msa_record(names, aln, seq_num);
916 
917   if ((seq_num > 0) && (verbosity > 0))
918     vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
919                       (int)strlen((*aln)[0]));
920 
921   return seq_num;
922 }
923 
924 
925 PRIVATE int
parse_maf_alignment(FILE * fp,char *** names,char *** aln,int verbosity)926 parse_maf_alignment(FILE  *fp,
927                     char  ***names,
928                     char  ***aln,
929                     int   verbosity)
930 {
931   char  *line = NULL, *tmp_name, *tmp_sequence, strand;
932   int   n, seq_num, start, length, src_length;
933 
934   seq_num     = 0;
935 
936   if (!fp) {
937     if (verbosity >= 0)
938       vrna_message_warning(
939         "Can't read from filepointer while parsing MAF formatted sequence alignment!");
940 
941     return -1;
942   }
943 
944   if (names && aln) {
945     *names  = NULL;
946     *aln    = NULL;
947   } else {
948     return -1;
949   }
950 
951   int inrecord = 0;
952   while ((line = vrna_read_line(fp))) {
953     if (*line == 'a') {
954       if ((line[1] == '\0') || isspace(line[1])) {
955         inrecord = 1;
956         free(line);
957         break;
958       }
959     }
960 
961     free(line);
962   }
963 
964   if (inrecord) {
965     while ((line = vrna_read_line(fp))) {
966       n = (int)strlen(line);
967 
968       switch (*line) {
969         case '#': /* comment */
970           break;
971 
972         case 'e': /* ignore and fall through */
973         case 'i': /* ignore and fall through */
974         case 'q': /* ignore */
975           break;
976 
977         case 's': /* a sequence within the alignment block */
978           tmp_name      = (char *)vrna_alloc(sizeof(char) * n);
979           tmp_sequence  = (char *)vrna_alloc(sizeof(char) * n);
980           if (sscanf(line, "s %s %d %d %c %d %s",
981                      tmp_name,
982                      &start,
983                      &length,
984                      &strand,
985                      &src_length,
986                      tmp_sequence) == 6) {
987             seq_num++;
988             tmp_name      = (char *)vrna_realloc(tmp_name, sizeof(char) * (strlen(tmp_name) + 1));
989             tmp_sequence  =
990               (char *)vrna_realloc(tmp_sequence, sizeof(char) * (strlen(tmp_sequence) + 1));
991 
992             add_sequence(tmp_name, tmp_sequence,
993                          names, aln,
994                          seq_num);
995 
996             free(tmp_name);
997             free(tmp_sequence);
998             break;
999           }
1000 
1001           free(tmp_name);
1002           free(tmp_sequence);
1003         /* all through */
1004 
1005         default: /* something else that ends the block */
1006           free(line);
1007           goto maf_exit;
1008       }
1009 
1010       free(line);
1011     }
1012   } else {
1013     /*
1014      *  if (verbosity >= 0)
1015      *    vrna_message_warning("Did not find any MAF formatted record!");
1016      */
1017     return -1;
1018   }
1019 
1020 maf_exit:
1021 
1022   endmarker_msa_record(names, aln, seq_num);
1023 
1024   if ((seq_num > 0) && (verbosity > 0))
1025     vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
1026                       (int)strlen((*aln)[0]));
1027 
1028   return seq_num;
1029 }
1030 
1031 
1032 PRIVATE void
free_msa_record(char *** names,char *** aln,char ** id,char ** structure)1033 free_msa_record(char  ***names,
1034                 char  ***aln,
1035                 char  **id,
1036                 char  **structure)
1037 {
1038   int s, i;
1039 
1040   s = 0;
1041   if (aln && (*aln))
1042     for (; (*aln)[s]; s++);
1043 
1044   if (id != NULL) {
1045     free(*id);
1046     *id = NULL;
1047   }
1048 
1049   if (structure != NULL) {
1050     free(*structure);
1051     *structure = NULL;
1052   }
1053 
1054   for (i = 0; i < s; i++) {
1055     free((*names)[i]);
1056     free((*aln)[i]);
1057   }
1058 
1059   if (names && (*names)) {
1060     free(*names);
1061     *names = NULL;
1062   }
1063 
1064   if (aln && (*aln)) {
1065     free(*aln);
1066     *aln = NULL;
1067   }
1068 }
1069 
1070 
1071 PRIVATE int
parse_aln_stockholm(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1072 parse_aln_stockholm(FILE  *fp,
1073                     char  ***names,
1074                     char  ***aln,
1075                     char  **id,
1076                     char  **structure,
1077                     int   verbosity)
1078 {
1079   return parse_stockholm_alignment(fp, names, aln, id, structure, verbosity);
1080 }
1081 
1082 
1083 PRIVATE int
parse_aln_clustal(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1084 parse_aln_clustal(FILE  *fp,
1085                   char  ***names,
1086                   char  ***aln,
1087                   char  **id,
1088                   char  **structure,
1089                   int   verbosity)
1090 {
1091   /* clustal format doesn't contain id's or structure information */
1092   if (id)
1093     *id = NULL;
1094 
1095   if (structure)
1096     *structure = NULL;
1097 
1098   return parse_clustal_alignment(fp, names, aln, verbosity);
1099 }
1100 
1101 
1102 PRIVATE int
parse_aln_fasta(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1103 parse_aln_fasta(FILE  *fp,
1104                 char  ***names,
1105                 char  ***aln,
1106                 char  **id,
1107                 char  **structure,
1108                 int   verbosity)
1109 {
1110   /* fasta alignments do not contain an id, or structure information */
1111   if (id)
1112     *id = NULL;
1113 
1114   if (structure)
1115     *structure = NULL;
1116 
1117   return parse_fasta_alignment(fp, names, aln, verbosity);
1118 }
1119 
1120 
1121 PRIVATE int
parse_aln_maf(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1122 parse_aln_maf(FILE  *fp,
1123               char  ***names,
1124               char  ***aln,
1125               char  **id,
1126               char  **structure,
1127               int   verbosity)
1128 {
1129   /* MAF alignments do not contain an id, or structure information */
1130   if (id)
1131     *id = NULL;
1132 
1133   if (structure)
1134     *structure = NULL;
1135 
1136   return parse_maf_alignment(fp, names, aln, verbosity);
1137 }
1138 
1139 
1140 PRIVATE int
write_aln_stockholm(FILE * fp,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options,int verbosity)1141 write_aln_stockholm(FILE          *fp,
1142                     const char    **names,
1143                     const char    **aln,
1144                     const char    *id,
1145                     const char    *structure,
1146                     const char    *source,
1147                     unsigned int  options,
1148                     int           verbosity)
1149 {
1150   return write_stockholm_alignment(fp, names, aln, id, structure, source, options, verbosity);
1151 }
1152 
1153 
1154 PRIVATE void
add_sequence(const char * id,const char * seq,char *** names,char *** aln,int seq_num)1155 add_sequence(const char *id,
1156              const char *seq,
1157              char       ***names,
1158              char       ***aln,
1159              int        seq_num)
1160 {
1161   (*names)              = (char **)vrna_realloc(*names, sizeof(char *) * (seq_num));
1162   (*names)[seq_num - 1] = strdup(id);
1163   (*aln)                = (char **)vrna_realloc(*aln, sizeof(char *) * (seq_num));
1164   (*aln)[seq_num - 1]   = strdup(seq);
1165 }
1166 
1167 
1168 PRIVATE void
endmarker_msa_record(char *** names,char *** aln,int seq_num)1169 endmarker_msa_record(char ***names,
1170                      char ***aln,
1171                      int  seq_num)
1172 {
1173   /*
1174    * append additional entry in 'aln' and 'names' pointing to NULL (this may be
1175    * used as an indication for the end of the sequence list)
1176    */
1177   if (seq_num > 0) {
1178     (*aln)            = (char **)vrna_realloc(*aln, sizeof(char *) * (seq_num + 1));
1179     (*names)          = (char **)vrna_realloc(*names, sizeof(char *) * (seq_num + 1));
1180     (*aln)[seq_num]   = NULL;
1181     (*names)[seq_num] = NULL;
1182   }
1183 }
1184 
1185 
1186 PRIVATE int
check_alignment(const char ** names,const char ** aln,int seq_num,int verbosity)1187 check_alignment(const char  **names,
1188                 const char  **aln,
1189                 int         seq_num,
1190                 int         verbosity)
1191 {
1192   int i, j, l, pass = 1;
1193 
1194   /* check for unique names */
1195   for (i = 0; i < seq_num; i++) {
1196     for (j = i + 1; j < seq_num; j++) {
1197       if (!strcmp(names[i], names[j])) {
1198         if (verbosity >= 0)
1199           vrna_message_warning("Sequence IDs in input alignment are not unique!");
1200 
1201         pass = 0;
1202       }
1203     }
1204   }
1205 
1206   /* check for equal lengths of sequences */
1207   l = (int)strlen(aln[0]);
1208   for (i = 1; i < seq_num; i++)
1209     if ((int)strlen(aln[i]) != l) {
1210       if (verbosity >= 0)
1211         vrna_message_warning("Sequence lengths in input alignment do not match!");
1212 
1213       pass = 0;
1214     }
1215 
1216   return pass;
1217 }
1218