1 /*
2 * file_formats_msa.c
3 *
4 * Various functions dealing with file formats for Multiple Sequence Alignments (MSA)
5 *
6 * (c) 2016 Ronny Lorenz
7 *
8 * ViennaRNA package
9 */
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <math.h>
19 #include <ctype.h>
20
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/utils/alignments.h"
24 #include "ViennaRNA/io/utils.h"
25 #include "ViennaRNA/io/file_formats.h"
26 #include "ViennaRNA/io/file_formats_msa.h"
27
28 /*
29 #################################
30 # STATIC DECLARATIONS #
31 #################################
32 */
33
34 typedef int (aln_parser_function)(FILE *fp,
35 char ***names,
36 char ***aln,
37 char **id,
38 char **structure,
39 int verbosity);
40 typedef int (aln_writer_function)(FILE *fp,
41 const char **names,
42 const char **aln,
43 const char *id,
44 const char *structure,
45 const char *source,
46 unsigned int options,
47 int verbosity);
48
49 typedef struct {
50 unsigned int code;
51 aln_parser_function *parser;
52 const char *name;
53 } parsable;
54
55 typedef struct {
56 unsigned int code;
57 aln_writer_function *writer;
58 const char *name;
59 } writable;
60
61 PRIVATE aln_parser_function parse_aln_stockholm;
62
63 PRIVATE aln_parser_function parse_aln_clustal;
64
65 PRIVATE aln_parser_function parse_aln_fasta;
66
67 PRIVATE aln_parser_function parse_aln_maf;
68
69 PRIVATE aln_writer_function write_aln_stockholm;
70
71 PRIVATE int
72 parse_fasta_alignment(FILE *fp,
73 char ***names,
74 char ***aln,
75 int verbosity);
76
77
78 PRIVATE int
79 parse_clustal_alignment(FILE *clust,
80 char ***names,
81 char ***aln,
82 int verbosity);
83
84
85 PRIVATE int
86 parse_stockholm_alignment(FILE *fp,
87 char ***aln,
88 char ***names,
89 char **id,
90 char **structure,
91 int verbosity);
92
93
94 PRIVATE int
95 parse_maf_alignment(FILE *fp,
96 char ***aln,
97 char ***names,
98 int verbosity);
99
100
101 PRIVATE int
102 write_stockholm_alignment(FILE *fp,
103 const char **names,
104 const char **aln,
105 const char *id,
106 const char *structure,
107 const char *source,
108 unsigned int options,
109 int verbosity);
110
111
112 PRIVATE int
113 check_alignment(const char **names,
114 const char **aln,
115 int seq_num,
116 int verbosity);
117
118
119 PRIVATE void
120 free_msa_record(char ***names,
121 char ***aln,
122 char **id,
123 char **structure);
124
125
126 PRIVATE void
127 add_sequence(const char *id,
128 const char *seq,
129 char ***names,
130 char ***aln,
131 int seq_num);
132
133
134 PRIVATE void
135 endmarker_msa_record(char ***names,
136 char ***aln,
137 int seq_num);
138
139
140 /*
141 #################################
142 # STATIC VARIABLES #
143 #################################
144 */
145
146 /* number of known alignment parsers */
147 #define NUM_PARSERS 4
148 #define NUM_WRITERS 1
149
150 static parsable known_parsers[NUM_PARSERS] = {
151 /* option, parser, name */
152 { VRNA_FILE_FORMAT_MSA_STOCKHOLM, parse_aln_stockholm, "Stockholm 1.0 format" },
153 { VRNA_FILE_FORMAT_MSA_CLUSTAL, parse_aln_clustal, "ClustalW format" },
154 { VRNA_FILE_FORMAT_MSA_FASTA, parse_aln_fasta, "FASTA format" },
155 { VRNA_FILE_FORMAT_MSA_MAF, parse_aln_maf, "MAF format" }
156 };
157
158 static writable known_writers[NUM_WRITERS] = {
159 /* option, parser, name */
160 { VRNA_FILE_FORMAT_MSA_STOCKHOLM, write_aln_stockholm, "Stockholm 1.0 format" }
161 };
162
163 /*
164 #################################
165 # BEGIN OF FUNCTION DEFINITIONS #
166 #################################
167 */
168 PUBLIC unsigned int
vrna_file_msa_detect_format(const char * filename,unsigned int options)169 vrna_file_msa_detect_format(const char *filename,
170 unsigned int options)
171 {
172 FILE *fp;
173 char **names, **aln;
174 unsigned int format;
175 int i, r;
176 long int fp_position;
177
178 names = NULL;
179 aln = NULL;
180 format = VRNA_FILE_FORMAT_MSA_UNKNOWN;
181
182 /* if no alignment file format(s) were specified we probe for all of them */
183 if (options == 0)
184 options = VRNA_FILE_FORMAT_MSA_DEFAULT;
185
186 if (!(fp = fopen(filename, "r"))) {
187 if (!(options & VRNA_FILE_FORMAT_MSA_SILENT))
188 vrna_message_warning("vrna_file_msa_detect_format: "
189 "Can't open alignment file \"%s\"!",
190 filename);
191
192 return format;
193 }
194
195 r = -1;
196 fp_position = ftell(fp);
197
198 for (i = 0; i < NUM_PARSERS; i++) {
199 if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
200 /* go back to beginning of file */
201 if (!fseek(fp, fp_position, SEEK_SET)) {
202 r = known_parsers[i].parser(fp, &names, &aln, NULL, NULL, -1);
203 free_msa_record(&names, &aln, NULL, NULL);
204 if (r > 0) {
205 format = known_parsers[i].code;
206 break;
207 }
208 } else {
209 vrna_message_warning("vrna_file_msa_detect_format: "
210 "Something unexpected happened while parsing the alignment file");
211 goto msa_detect_format_exit;
212 }
213 }
214 }
215
216 msa_detect_format_exit:
217
218 fclose(fp);
219
220 return format;
221 }
222
223
224 PUBLIC int
vrna_file_msa_read(const char * filename,char *** names,char *** aln,char ** id,char ** structure,unsigned int options)225 vrna_file_msa_read(const char *filename,
226 char ***names,
227 char ***aln,
228 char **id,
229 char **structure,
230 unsigned int options)
231 {
232 FILE *fp;
233 int i, seq_num, r, verb_level;
234 long int fp_position;
235
236 verb_level = 1; /* we default to be very verbose */
237 seq_num = 0;
238
239 if (options & VRNA_FILE_FORMAT_MSA_QUIET)
240 verb_level = 0;
241
242 if (options & VRNA_FILE_FORMAT_MSA_SILENT)
243 verb_level = -1;
244
245 if (!(fp = fopen(filename, "r"))) {
246 if (verb_level >= 0)
247 vrna_message_warning("vrna_file_msa_read: "
248 "Can't open alignment file \"%s\"!",
249 filename);
250
251 return seq_num;
252 }
253
254 if (names && aln) {
255 *names = NULL;
256 *aln = NULL;
257 } else {
258 return seq_num;
259 }
260
261 if (id)
262 *id = NULL;
263
264 if (structure)
265 *structure = NULL;
266
267 /* if no alignment file format was specified, lets try to guess it */
268 if (options == 0)
269 options = VRNA_FILE_FORMAT_MSA_DEFAULT;
270
271 r = -1;
272 fp_position = ftell(fp);
273
274 for (i = 0; i < NUM_PARSERS; i++) {
275 if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
276 /* go back to beginning of file */
277 if (!fseek(fp, fp_position, SEEK_SET)) {
278 r = known_parsers[i].parser(fp, names, aln, id, structure, verb_level);
279 if (r > 0)
280 break;
281 } else {
282 vrna_message_warning("vrna_file_msa_read: "
283 "Something unexpected happened while parsing the alignment file");
284 goto msa_read_exit;
285 }
286 }
287 }
288
289 if (r == -1) {
290 if (verb_level >= 0)
291 vrna_message_warning("vrna_file_msa_read: "
292 "Alignment file parser is unknown (or not specified?)");
293 } else {
294 seq_num = r;
295
296 if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
297 if (!check_alignment((const char **)(*names), (const char **)(*aln), seq_num, verb_level)) {
298 if (verb_level >= 0)
299 vrna_message_warning("vrna_file_msa_read: "
300 "Alignment did not pass sanity checks!");
301
302 /* discard the data we've read! */
303 free_msa_record(names, aln, id, structure);
304
305 seq_num = 0;
306 }
307 }
308 }
309
310 msa_read_exit:
311
312 fclose(fp);
313
314 return seq_num;
315 }
316
317
318 PUBLIC int
vrna_file_msa_read_record(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,unsigned int options)319 vrna_file_msa_read_record(FILE *fp,
320 char ***names,
321 char ***aln,
322 char **id,
323 char **structure,
324 unsigned int options)
325 {
326 const char *parser_name;
327 int i, r, seq_num, verb_level;
328 aln_parser_function *parser;
329
330 verb_level = 1; /* we default to be very verbose */
331 seq_num = 0;
332 parser_name = NULL;
333 parser = NULL;
334
335 if (options & VRNA_FILE_FORMAT_MSA_QUIET)
336 verb_level = 0;
337
338 if (options & VRNA_FILE_FORMAT_MSA_SILENT)
339 verb_level = -1;
340
341 if (!fp) {
342 if (verb_level >= 0)
343 vrna_message_warning("Can't read alignment from file pointer!");
344
345 return seq_num;
346 }
347
348 if (names && aln) {
349 *names = NULL;
350 *aln = NULL;
351 } else {
352 return seq_num;
353 }
354
355 if (id)
356 *id = NULL;
357
358 if (structure)
359 *structure = NULL;
360
361 for (r = i = 0; i < NUM_PARSERS; i++) {
362 if ((options & known_parsers[i].code) && (known_parsers[i].parser)) {
363 if (!parser) {
364 parser = known_parsers[i].parser;
365 parser_name = known_parsers[i].name;
366 }
367
368 r++;
369 }
370 }
371
372 if (r == 0) {
373 if (verb_level >= 0)
374 vrna_message_warning("Did not find parser for specified MSA format!");
375 } else {
376 if ((r > 1) && (verb_level > 0))
377 vrna_message_warning("More than one MSA format parser specified!\n"
378 "Using parser for %s", parser_name);
379
380 seq_num = parser(fp, names, aln, id, structure, verb_level);
381
382 if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
383 if (!check_alignment((const char **)(*names), (const char **)(*aln), seq_num, verb_level)) {
384 if (verb_level >= 0)
385 vrna_message_warning("Alignment did not pass sanity checks!");
386
387 /* discard the data we've read! */
388 free_msa_record(names, aln, id, structure);
389
390 seq_num = -1;
391 }
392 }
393 }
394
395 return seq_num;
396 }
397
398
399 PUBLIC int
vrna_file_msa_write(const char * filename,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options)400 vrna_file_msa_write(const char *filename,
401 const char **names,
402 const char **aln,
403 const char *id,
404 const char *structure,
405 const char *source,
406 unsigned int options)
407 {
408 int ret, verb_level;
409
410 verb_level = 1; /* we default to be very verbose */
411 ret = 0; /* failure */
412
413 if (options & VRNA_FILE_FORMAT_MSA_QUIET)
414 verb_level = 0;
415
416 if (options & VRNA_FILE_FORMAT_MSA_SILENT)
417 verb_level = -1;
418
419 if (filename && names && aln) {
420 /* we require at least a filename, the sequence identifiers (names), and the alignment */
421 FILE *fp;
422 int i, r, seq_num;
423 const char *writer_name;
424 aln_writer_function *writer;
425
426 r = 0;
427 seq_num = 0;
428 writer_name = NULL;
429 writer = NULL;
430
431 /* find out the number of sequences in the alignment */
432 for (seq_num = 0; aln[seq_num]; seq_num++);
433
434 if (seq_num == 0) {
435 if (verb_level >= 0)
436 vrna_message_warning("Alignment did not pass sanity checks!");
437
438 return ret;
439 }
440
441 /* check the alignment itself for consistency */
442 if ((seq_num > 0) && (!(options & VRNA_FILE_FORMAT_MSA_NOCHECK))) {
443 if (!check_alignment(names, aln, seq_num, verb_level)) {
444 if (verb_level >= 0)
445 vrna_message_warning("Alignment did not pass sanity checks!");
446
447 return ret;
448 }
449 }
450
451 /* detect output format */
452 for (i = 0; i < NUM_WRITERS; i++) {
453 if ((options & known_writers[i].code) && (known_writers[i].writer)) {
454 if (!writer) {
455 writer = known_writers[i].writer;
456 writer_name = known_writers[i].name;
457 }
458
459 r++;
460 }
461 }
462
463 if (r == 0) {
464 if (verb_level >= 0)
465 vrna_message_warning("Did not find writer for specified MSA format!");
466 } else {
467 if ((r > 1) && (verb_level > 0))
468 vrna_message_warning("More than one MSA format writer specified!\n"
469 "Using writer for %s",
470 writer_name);
471
472 /* try opening the output stream */
473 if (options & VRNA_FILE_FORMAT_MSA_APPEND)
474 fp = fopen(filename, "a");
475 else
476 fp = fopen(filename, "w");
477
478 if (!fp) {
479 if (verb_level >= 0)
480 vrna_message_warning("Alignment file could not be opened for writing!");
481
482 return ret;
483 }
484
485 /* write output alignment to file */
486 ret = writer(fp, names, aln, id, structure, source, options, verb_level);
487
488 /* close output stream */
489 fclose(fp);
490 }
491 } else if (verb_level >= 0) {
492 vrna_message_warning("vrna_file_msa_write: insufficient input for writing anything!");
493 }
494
495 return ret;
496 }
497
498
499 PRIVATE int
parse_stockholm_alignment(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)500 parse_stockholm_alignment(FILE *fp,
501 char ***names,
502 char ***aln,
503 char **id,
504 char **structure,
505 int verbosity)
506 {
507 char *line = NULL;
508 int i, n, seq_num, seq_current;
509
510 seq_num = 0;
511 seq_current = 0;
512
513 if (!fp) {
514 if (verbosity >= 0)
515 vrna_message_warning(
516 "Can't read from filepointer while parsing Stockholm formatted sequence alignment!");
517
518 return -1;
519 }
520
521 if (names && aln) {
522 *names = NULL;
523 *aln = NULL;
524 } else {
525 return -1;
526 }
527
528 if (id)
529 *id = NULL;
530
531 if (structure)
532 *structure = NULL;
533
534 int inrecord = 0;
535 while ((line = vrna_read_line(fp))) {
536 if (strstr(line, "STOCKHOLM 1.0")) {
537 inrecord = 1;
538 free(line);
539 break;
540 }
541
542 free(line);
543 }
544
545 if (inrecord) {
546 while ((line = vrna_read_line(fp))) {
547 if (strncmp(line, "//", 2) == 0) {
548 /* end of alignment */
549 free(line);
550 line = NULL;
551 break;
552 }
553
554 n = (int)strlen(line);
555
556 switch (*line) {
557 /* we skip lines that start with whitespace */
558 case ' ':
559 case '\0':
560 seq_current = 0; /* reset number of current sequence */
561 goto stockholm_next_line;
562
563 /* Stockholm markup, or comment */
564 case '#':
565 if (strstr(line, "STOCKHOLM 1.0")) {
566 if (verbosity >= 0)
567 vrna_message_warning("Malformatted Stockholm record, missing // ?");
568
569 /* drop everything we've read so far and start new, blank record */
570 free_msa_record(names, aln, id, structure);
571
572 seq_num = 0;
573 } else if (strncmp(line, "#=GF", 4) == 0) {
574 /* found feature markup */
575 if ((id != NULL) && (strncmp(line, "#=GF ID", 7) == 0)) {
576 *id = (char *)vrna_alloc(sizeof(char) * n);
577 if (sscanf(line, "#=GF ID %s", *id) == 1) {
578 *id = (char *)vrna_realloc(*id, sizeof(char) * (strlen(*id) + 1));
579 } else {
580 free(*id);
581 *id = NULL;
582 }
583 }
584 } else if (strncmp(line, "#=GC", 4) == 0) {
585 /* found per-column annotation */
586 if ((structure != NULL) && (strncmp(line, "#=GC SS_cons ", 13) == 0)) {
587 char *ss = (char *)vrna_alloc(sizeof(char) * n);
588 if (sscanf(line, "#=GC SS_cons %s", ss) == 1) {
589 /* always append consensus structure */
590 unsigned int prev_len = (*structure) ? strlen(*structure) : 0;
591 unsigned int ss_len = strlen(ss);
592 *structure = (char *)vrna_realloc(*structure,
593 sizeof(char) *
594 (prev_len + ss_len + 1));
595 memcpy(*structure + prev_len,
596 ss,
597 sizeof(char) * ss_len);
598 (*structure)[prev_len + ss_len] = '\0';
599 }
600
601 free(ss);
602 }
603 } else if (strncmp(line, "#=GS", 4) == 0) {
604 /* found generic per-sequence annotation */
605 } else if (strncmp(line, "#=GR", 4) == 0) {
606 /* found generic per-Residue annotation */
607 } else {
608 /* may be comment? */
609 }
610
611 break;
612
613 /* should be sequence */
614 default:
615 {
616 char *tmp_name = (char *)vrna_alloc(sizeof(char) * (n + 1));
617 char *tmp_seq = (char *)vrna_alloc(sizeof(char) * (n + 1));
618 if (sscanf(line, "%s %s", tmp_name, tmp_seq) == 2) {
619 for (i = 0; i < strlen(tmp_seq); i++)
620 if (tmp_seq[i] == '.') /* replace '.' gaps with '-' */
621 tmp_seq[i] = '-';
622
623 if (seq_current == seq_num) {
624 /* first time */
625 add_sequence(tmp_name, tmp_seq,
626 names, aln,
627 seq_current + 1);
628 } else {
629 if (strcmp(tmp_name, (*names)[seq_current]) != 0) {
630 /* name doesn't match */
631 if (verbosity >= 0)
632 vrna_message_warning(
633 "Sorry, your file is messed up! Inconsistent (order of) sequence identifiers.");
634
635 free(line);
636 free(tmp_name);
637 free(tmp_seq);
638 return 0;
639 }
640 unsigned int tmp_seq_len = strlen(tmp_seq);
641 unsigned int current_len = strlen((*aln)[seq_current]);
642
643 (*aln)[seq_current] = (char *)vrna_realloc((*aln)[seq_current],
644 current_len +
645 tmp_seq_len +
646 1);
647 memcpy((*aln)[seq_current] + current_len,
648 tmp_seq,
649 sizeof(char) * tmp_seq_len);
650
651 (*aln)[seq_current][current_len + tmp_seq_len] = '\0';
652 }
653 }
654
655 seq_current++;
656 if (seq_current > seq_num)
657 seq_num = seq_current;
658
659 free(tmp_name);
660 free(tmp_seq);
661 }
662 break;
663 }
664
665 stockholm_next_line:
666
667 free(line);
668 }
669 } else {
670 /*
671 * if (verbosity >= 0)
672 * vrna_message_warning("Did not find any Stockholm 1.0 formatted record!");
673 */
674 return -1;
675 }
676
677 free(line);
678
679 endmarker_msa_record(names, aln, seq_num);
680
681 if ((seq_num > 0) && (verbosity > 0))
682 vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
683 (int)strlen((*aln)[0]));
684
685 return seq_num;
686 }
687
688
689 PRIVATE int
write_stockholm_alignment(FILE * fp,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options,int verbosity)690 write_stockholm_alignment(FILE *fp,
691 const char **names,
692 const char **aln,
693 const char *id,
694 const char *structure,
695 const char *source,
696 unsigned int options,
697 int verbosity)
698 {
699 int ret;
700
701 ret = 1; /* success */
702
703 if (fp) {
704 int s, seq_num, l, longest_name;
705
706 /* detect sequence number and longest sequence name in alignment */
707 for (longest_name = seq_num = 0; names[seq_num]; seq_num++) {
708 l = (int)strlen(names[seq_num]);
709 if (l > longest_name)
710 longest_name = l;
711 }
712
713 if (seq_num > 0) {
714 /* print header */
715 fprintf(fp, "# STOCKHOLM 1.0\n");
716
717 if (id) /* print the ID if available */
718 fprintf(fp, "#=GF ID %s\n", id);
719
720 if (structure) {
721 /* print structure source if structure is present */
722 if (!source)
723 source = "ViennaRNA Package prediction";
724
725 fprintf(fp, "#=GF SS %s\n", source);
726 /*
727 * in case we print a consensus structure, reset the longest_name
728 * variable if longest name is shorter than '#=GC SS_cons' which
729 * has 12 characters
730 */
731 if (longest_name < 12)
732 longest_name = 12;
733 }
734
735 /* print actual alignment */
736 for (s = 0; s < seq_num; s++)
737 fprintf(fp, "%-*s %s\n", longest_name, names[s], aln[s]);
738
739 /* output reference sequence */
740 char *cons = (options & VRNA_FILE_FORMAT_MSA_MIS) ?
741 vrna_aln_consensus_mis(aln, NULL) :
742 vrna_aln_consensus_sequence(aln, NULL);
743 fprintf(fp, "%-*s %s\n", longest_name, "#=GC RF", cons);
744 free(cons);
745
746 /* print consensus structure */
747 if (structure)
748 fprintf(fp, "%-*s %s\n", longest_name, "#=GC SS_cons", structure);
749
750 /* print record-end marker */
751 fprintf(fp, "//\n");
752 }
753 }
754
755 return ret;
756 }
757
758
759 PRIVATE int
parse_fasta_alignment(FILE * fp,char *** names,char *** aln,int verbosity)760 parse_fasta_alignment(FILE *fp,
761 char ***names,
762 char ***aln,
763 int verbosity)
764 {
765 unsigned int read_opt, rec_type;
766 int seq_num;
767 char *rec_id, *rec_sequence, **rec_rest;
768
769 rec_id = NULL;
770 rec_sequence = NULL;
771 rec_rest = NULL;
772 seq_num = 0;
773 read_opt = VRNA_INPUT_NO_REST; /* read sequence and header information only */
774
775 /* read until EOF or user abort */
776 while (
777 !((rec_type = vrna_file_fasta_read_record(&rec_id, &rec_sequence, &rec_rest, fp, read_opt))
778 & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))) {
779 if (rec_id) {
780 /* valid FASTA entry */
781 seq_num++;
782
783 char *id = (char *)vrna_alloc(sizeof(char) * strlen(rec_id));
784 (void)sscanf(rec_id, ">%s", id);
785
786 add_sequence(id, rec_sequence,
787 names, aln,
788 seq_num);
789
790 free(id);
791 }
792
793 free(rec_id);
794 free(rec_sequence);
795 free(rec_rest);
796 }
797
798 free(rec_id);
799 free(rec_sequence);
800 free(rec_rest);
801
802 endmarker_msa_record(names, aln, seq_num);
803
804 if (seq_num > 0) {
805 if (verbosity > 0)
806 vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
807 (int)strlen((*aln)[0]));
808 } else {
809 /*
810 * if (verbosity >= 0)
811 * vrna_message_warning("Did not find any FASTA formatted record!");
812 */
813 return -1;
814 }
815
816 return seq_num;
817 }
818
819
820 PRIVATE int
parse_clustal_alignment(FILE * clust,char *** names,char *** aln,int verbosity)821 parse_clustal_alignment(FILE *clust,
822 char ***names,
823 char ***aln,
824 int verbosity)
825 {
826 char *line, *name, *seq;
827 int n, nn = 0, seq_num = 0, i;
828
829 if ((line = vrna_read_line(clust)) == NULL)
830 return -1;
831
832 if (strncmp(line, "CLUSTAL", 7) != 0) {
833 if (verbosity >= 0)
834 vrna_message_warning("This doesn't look like a CLUSTALW file, sorry");
835
836 free(line);
837 return -1;
838 }
839
840 free(line);
841 line = vrna_read_line(clust);
842
843 while (line != NULL) {
844 n = strlen(line);
845
846 if ((n < 4) || isspace((int)line[0])) {
847 /* skip non-sequence line */
848 free(line);
849 line = vrna_read_line(clust);
850 nn = 0; /* reset sequence number */
851 continue;
852 }
853
854 /* skip comments */
855 if (line[0] == '#') {
856 free(line);
857 line = vrna_read_line(clust);
858 continue;
859 }
860
861 seq = (char *)vrna_alloc(sizeof(char) * (n + 1));
862 name = (char *)vrna_alloc(sizeof(char) * (n + 1));
863 if (sscanf(line, "%s %s", name, seq) == 2) {
864 /* realloc to actual sizes */
865 seq = (char *)vrna_realloc(seq, sizeof(char) * (strlen(seq) + 1));
866 name = (char *)vrna_realloc(name, sizeof(char) * (strlen(name) + 1));
867 for (i = 0; i < strlen(seq); i++)
868 if (seq[i] == '.') /* replace '.' gaps with '-' */
869 seq[i] = '-';
870
871 if (nn == seq_num) {
872 /* first time */
873 add_sequence(name, seq,
874 names, aln,
875 nn + 1);
876 } else {
877 if (strcmp(name, (*names)[nn]) != 0) {
878 /* name doesn't match */
879 if (verbosity >= 0)
880 vrna_message_warning(
881 "Sorry, your file is messed up! Inconsistent (order of) sequence identifiers.");
882
883 free(line);
884 free(seq);
885 return 0;
886 }
887
888 unsigned int current_len = strlen((*aln)[nn]);
889 unsigned int seq_len = strlen(seq);
890
891 (*aln)[nn] = (char *)vrna_realloc((*aln)[nn],
892 current_len +
893 seq_len +
894 1);
895 memcpy((*aln)[nn] + current_len,
896 seq,
897 sizeof(char) * seq_len);
898
899 (*aln)[nn][current_len + seq_len] = '\0';
900 }
901
902 nn++;
903 if (nn > seq_num)
904 seq_num = nn;
905
906 free(seq);
907 free(name);
908 }
909
910 free(line);
911
912 line = vrna_read_line(clust);
913 }
914
915 endmarker_msa_record(names, aln, seq_num);
916
917 if ((seq_num > 0) && (verbosity > 0))
918 vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
919 (int)strlen((*aln)[0]));
920
921 return seq_num;
922 }
923
924
925 PRIVATE int
parse_maf_alignment(FILE * fp,char *** names,char *** aln,int verbosity)926 parse_maf_alignment(FILE *fp,
927 char ***names,
928 char ***aln,
929 int verbosity)
930 {
931 char *line = NULL, *tmp_name, *tmp_sequence, strand;
932 int n, seq_num, start, length, src_length;
933
934 seq_num = 0;
935
936 if (!fp) {
937 if (verbosity >= 0)
938 vrna_message_warning(
939 "Can't read from filepointer while parsing MAF formatted sequence alignment!");
940
941 return -1;
942 }
943
944 if (names && aln) {
945 *names = NULL;
946 *aln = NULL;
947 } else {
948 return -1;
949 }
950
951 int inrecord = 0;
952 while ((line = vrna_read_line(fp))) {
953 if (*line == 'a') {
954 if ((line[1] == '\0') || isspace(line[1])) {
955 inrecord = 1;
956 free(line);
957 break;
958 }
959 }
960
961 free(line);
962 }
963
964 if (inrecord) {
965 while ((line = vrna_read_line(fp))) {
966 n = (int)strlen(line);
967
968 switch (*line) {
969 case '#': /* comment */
970 break;
971
972 case 'e': /* ignore and fall through */
973 case 'i': /* ignore and fall through */
974 case 'q': /* ignore */
975 break;
976
977 case 's': /* a sequence within the alignment block */
978 tmp_name = (char *)vrna_alloc(sizeof(char) * n);
979 tmp_sequence = (char *)vrna_alloc(sizeof(char) * n);
980 if (sscanf(line, "s %s %d %d %c %d %s",
981 tmp_name,
982 &start,
983 &length,
984 &strand,
985 &src_length,
986 tmp_sequence) == 6) {
987 seq_num++;
988 tmp_name = (char *)vrna_realloc(tmp_name, sizeof(char) * (strlen(tmp_name) + 1));
989 tmp_sequence =
990 (char *)vrna_realloc(tmp_sequence, sizeof(char) * (strlen(tmp_sequence) + 1));
991
992 add_sequence(tmp_name, tmp_sequence,
993 names, aln,
994 seq_num);
995
996 free(tmp_name);
997 free(tmp_sequence);
998 break;
999 }
1000
1001 free(tmp_name);
1002 free(tmp_sequence);
1003 /* all through */
1004
1005 default: /* something else that ends the block */
1006 free(line);
1007 goto maf_exit;
1008 }
1009
1010 free(line);
1011 }
1012 } else {
1013 /*
1014 * if (verbosity >= 0)
1015 * vrna_message_warning("Did not find any MAF formatted record!");
1016 */
1017 return -1;
1018 }
1019
1020 maf_exit:
1021
1022 endmarker_msa_record(names, aln, seq_num);
1023
1024 if ((seq_num > 0) && (verbosity > 0))
1025 vrna_message_info(stderr, "%d sequences; length of alignment %d.", seq_num,
1026 (int)strlen((*aln)[0]));
1027
1028 return seq_num;
1029 }
1030
1031
1032 PRIVATE void
free_msa_record(char *** names,char *** aln,char ** id,char ** structure)1033 free_msa_record(char ***names,
1034 char ***aln,
1035 char **id,
1036 char **structure)
1037 {
1038 int s, i;
1039
1040 s = 0;
1041 if (aln && (*aln))
1042 for (; (*aln)[s]; s++);
1043
1044 if (id != NULL) {
1045 free(*id);
1046 *id = NULL;
1047 }
1048
1049 if (structure != NULL) {
1050 free(*structure);
1051 *structure = NULL;
1052 }
1053
1054 for (i = 0; i < s; i++) {
1055 free((*names)[i]);
1056 free((*aln)[i]);
1057 }
1058
1059 if (names && (*names)) {
1060 free(*names);
1061 *names = NULL;
1062 }
1063
1064 if (aln && (*aln)) {
1065 free(*aln);
1066 *aln = NULL;
1067 }
1068 }
1069
1070
1071 PRIVATE int
parse_aln_stockholm(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1072 parse_aln_stockholm(FILE *fp,
1073 char ***names,
1074 char ***aln,
1075 char **id,
1076 char **structure,
1077 int verbosity)
1078 {
1079 return parse_stockholm_alignment(fp, names, aln, id, structure, verbosity);
1080 }
1081
1082
1083 PRIVATE int
parse_aln_clustal(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1084 parse_aln_clustal(FILE *fp,
1085 char ***names,
1086 char ***aln,
1087 char **id,
1088 char **structure,
1089 int verbosity)
1090 {
1091 /* clustal format doesn't contain id's or structure information */
1092 if (id)
1093 *id = NULL;
1094
1095 if (structure)
1096 *structure = NULL;
1097
1098 return parse_clustal_alignment(fp, names, aln, verbosity);
1099 }
1100
1101
1102 PRIVATE int
parse_aln_fasta(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1103 parse_aln_fasta(FILE *fp,
1104 char ***names,
1105 char ***aln,
1106 char **id,
1107 char **structure,
1108 int verbosity)
1109 {
1110 /* fasta alignments do not contain an id, or structure information */
1111 if (id)
1112 *id = NULL;
1113
1114 if (structure)
1115 *structure = NULL;
1116
1117 return parse_fasta_alignment(fp, names, aln, verbosity);
1118 }
1119
1120
1121 PRIVATE int
parse_aln_maf(FILE * fp,char *** names,char *** aln,char ** id,char ** structure,int verbosity)1122 parse_aln_maf(FILE *fp,
1123 char ***names,
1124 char ***aln,
1125 char **id,
1126 char **structure,
1127 int verbosity)
1128 {
1129 /* MAF alignments do not contain an id, or structure information */
1130 if (id)
1131 *id = NULL;
1132
1133 if (structure)
1134 *structure = NULL;
1135
1136 return parse_maf_alignment(fp, names, aln, verbosity);
1137 }
1138
1139
1140 PRIVATE int
write_aln_stockholm(FILE * fp,const char ** names,const char ** aln,const char * id,const char * structure,const char * source,unsigned int options,int verbosity)1141 write_aln_stockholm(FILE *fp,
1142 const char **names,
1143 const char **aln,
1144 const char *id,
1145 const char *structure,
1146 const char *source,
1147 unsigned int options,
1148 int verbosity)
1149 {
1150 return write_stockholm_alignment(fp, names, aln, id, structure, source, options, verbosity);
1151 }
1152
1153
1154 PRIVATE void
add_sequence(const char * id,const char * seq,char *** names,char *** aln,int seq_num)1155 add_sequence(const char *id,
1156 const char *seq,
1157 char ***names,
1158 char ***aln,
1159 int seq_num)
1160 {
1161 (*names) = (char **)vrna_realloc(*names, sizeof(char *) * (seq_num));
1162 (*names)[seq_num - 1] = strdup(id);
1163 (*aln) = (char **)vrna_realloc(*aln, sizeof(char *) * (seq_num));
1164 (*aln)[seq_num - 1] = strdup(seq);
1165 }
1166
1167
1168 PRIVATE void
endmarker_msa_record(char *** names,char *** aln,int seq_num)1169 endmarker_msa_record(char ***names,
1170 char ***aln,
1171 int seq_num)
1172 {
1173 /*
1174 * append additional entry in 'aln' and 'names' pointing to NULL (this may be
1175 * used as an indication for the end of the sequence list)
1176 */
1177 if (seq_num > 0) {
1178 (*aln) = (char **)vrna_realloc(*aln, sizeof(char *) * (seq_num + 1));
1179 (*names) = (char **)vrna_realloc(*names, sizeof(char *) * (seq_num + 1));
1180 (*aln)[seq_num] = NULL;
1181 (*names)[seq_num] = NULL;
1182 }
1183 }
1184
1185
1186 PRIVATE int
check_alignment(const char ** names,const char ** aln,int seq_num,int verbosity)1187 check_alignment(const char **names,
1188 const char **aln,
1189 int seq_num,
1190 int verbosity)
1191 {
1192 int i, j, l, pass = 1;
1193
1194 /* check for unique names */
1195 for (i = 0; i < seq_num; i++) {
1196 for (j = i + 1; j < seq_num; j++) {
1197 if (!strcmp(names[i], names[j])) {
1198 if (verbosity >= 0)
1199 vrna_message_warning("Sequence IDs in input alignment are not unique!");
1200
1201 pass = 0;
1202 }
1203 }
1204 }
1205
1206 /* check for equal lengths of sequences */
1207 l = (int)strlen(aln[0]);
1208 for (i = 1; i < seq_num; i++)
1209 if ((int)strlen(aln[i]) != l) {
1210 if (verbosity >= 0)
1211 vrna_message_warning("Sequence lengths in input alignment do not match!");
1212
1213 pass = 0;
1214 }
1215
1216 return pass;
1217 }
1218