1 /* Merge alignments into a single alignment based on their reference (RF) annotation.
2  *
3  * Special care is taken to be consistent with missing data '~'
4  * annotation in two different contexts.
5  *
6  * First, from Infernal 1.1rc2 and later cmalign and cmsearch -A
7  * output alignments: '~' can occur in GC RF and SS_cons
8  * annotation. The '~' after a given nongap/nonmissing RF position x
9  * (and before nongap/nonmissing RF position x+1) must all occur 5' of
10  * all inserted positions (gap: '.') that occur between x and x+1.
11  *
12  * Second, from HMMER3, hmmbuild reads and writes '~' in sequences to
13  * mark the ends of fragments. Any fragment must begin and end with a
14  * contiguous string of 1 or more '~'. '~' characters are not allowed
15  * within sequences anywhere except within these leading and trailing
16  * contiguous stretches.
17  *
18  * esl-alimerge will accept as input and produce as output alignments
19  * that are consistent with both of these conventions.
20  *
21  * EPN, Fri Nov 20 16:28:59 2009
22  */
23 #include "esl_config.h"
24 
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <string.h>
28 
29 #include "easel.h"
30 #include "esl_alphabet.h"
31 #include "esl_distance.h"
32 #include "esl_fileparser.h"
33 #include "esl_getopts.h"
34 #include "esl_msa.h"
35 #include "esl_msafile.h"
36 #include "esl_msafile2.h"
37 #include "esl_sqio.h"
38 #include "esl_stopwatch.h"
39 #include "esl_vectorops.h"
40 
41 static char banner[] = "merge alignments based on their reference (RF) annotation";
42 static char usage1[]  = "[options] <alignment file 1> <alignment file 2>";
43 static char usage2[]  = "[options] --list <file listing n > 1 ali files to merge>\n\
44 \n\
45   Input alignments must be in Stockholm or Pfam format.\n\
46   Ouput format choices\n\
47   --------------------\n\
48   stockholm [default]\n\
49   pfam\n\
50   a2m\n\
51   psiblast\n\
52   afa";
53 
54 static void read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile);
55 static int  update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis);
56 static int  validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int clen, int nmsa, int alen_merged, int *maxgap, int *maxmis, char *errbuf);
57 static int  add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged);
58 static int  inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str);
59 static int  inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str);
60 static int  validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len);
61 static int  determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf);
62 static void write_pfam_msa_top(FILE *fp, ESL_MSA *msa);
63 static void write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int maxwidth);
64 static int64_t maxwidth(char **s, int n);
65 static int  rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar);
66 
67 static ESL_OPTIONS options[] = {
68   /* name         type          default  env   range togs reqs  incomp           help                                                             docgroup */
69   { "--list",     eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL, NULL,            "command-line argument is a file that lists ali files to merge",  99 },
70   { "-h",         eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL, NULL,            "help; show brief info on version and usage",                     1 },
71   { "-o",         eslARG_OUTFILE,  NULL, NULL, NULL, NULL,NULL, NULL,            "output the final alignment to file <f>, not stdout",             1 },
72   { "-v",         eslARG_NONE,    FALSE, NULL, NULL, NULL,"-o", NULL,            "print info on merge to stdout; requires -o",                     1 },
73   { "--small",    eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL, NULL,            "use minimal RAM (RAM usage will be independent of aln sizes)",   1 },
74   { "--rfonly",   eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL, NULL,            "remove all columns that are gaps in GC RF annotation",           1 },
75   { "--informat", eslARG_STRING,  FALSE, NULL, NULL, NULL,NULL, NULL,            "NOT YET DISPLAYED",                                              99 },
76   { "--outformat",eslARG_STRING,  FALSE, NULL, NULL, NULL,NULL, NULL,            "specify that output aln be format <s> (see choices above)",      1 },
77   { "--rna",      eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL,"--amino,--rna",  "alignments to merge are RNA alignments",                         1 },
78   { "--dna",      eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL,"--amino,--rna",  "alignments to merge are DNA alignments",                         1 },
79   { "--amino",    eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL,"--dna,--rna",    "alignments to merge are protein alignments",                     1 },
80   { "--stall",    eslARG_NONE,    FALSE, NULL, NULL, NULL,NULL, NULL,            "arrest after start: for debugging under gdb",                    99 },
81   { 0,0,0,0,0,0,0,0,0,0 },
82 };
83 
84 int
main(int argc,char ** argv)85 main(int argc, char **argv)
86 {
87   int           status;		               /* easel return code               */
88   ESL_GETOPTS  *go      = NULL;	               /* application configuration       */
89   ESL_ALPHABET *abc     = NULL;      	       /* biological alphabet             */
90   char         *msafile1 = NULL;	       /* msa file 1 (stays NULL if --list) */
91   char         *msafile2 = NULL;	       /* msa file 2 (stays NULL if --list) */
92   char         *listfile = NULL;	       /* list file name (stays NULL unless --list) */
93   int           infmt   = eslMSAFILE_UNKNOWN;  /* format code for input alifiles  */
94   int           outfmt  = eslMSAFILE_UNKNOWN;  /* format code for output ali      */
95   ESL_MSAFILE  *afp     = NULL;	               /* open alignment file, normal interface */
96   ESL_MSAFILE2 *afp2    = NULL;	               /* open alignment file, small-mem interface */
97   FILE         *ofp;		               /* output file (default is stdout) */
98   char        **alifile_list = NULL;           /* list of alignment files to merge */
99   int           nalifile;                      /* size of alifile_list             */
100   int           do_stall;                      /* used to stall when debugging     */
101   int           fi;                            /* counter over alignment files */
102   int           ai, ai2;                       /* counters over alignments */
103   int           nali_tot;                      /* number of alignments in all files */
104   int          *nali_per_file = NULL;          /* [0..nalifile-1] number of alignments per file */
105   int           nseq_tot;                      /* number of sequences in all alignments */
106   int           nseq_cur;                      /* number of sequences in current alignment */
107   int64_t       alen_cur;                      /* length of current alignment */
108   int64_t      *alenA = NULL;                  /* [0..nali_tot-1] alignment length of input msas (even after
109 						* potentially removingeinserts (--rfonly)) */
110   ESL_MSA     **msaA = NULL;                   /* [0..nali_tot-1] all msas read from all files */
111   int          *maxgap = NULL;                 /* [0..cpos..cm->clen+1] max number of gap columns
112 						* before each consensus position in all alignments */
113   int          *maxmis = NULL;                 /* [0..cpos..cm->clen+1] max number of missing data columns
114 						* before each consensus position in all alignments */
115   int           nalloc = 0;                    /* current size of msaA */
116   int           chunksize = 10;                /* size to increase nalloc by when realloc'ing */
117   void         *tmp;                           /* for ESL_RALLOC() */
118   int           clen;                          /* consensus length (non-gap #=GC RF length) of all alignments */
119   int           cur_clen;                      /* consensus length (non-gap #=GC RF length) of current alignments */
120   int           apos;                          /* alignment position */
121   ESL_MSA      *mmsa = NULL;                   /* the merged alignment created by merging all alignments in msaA */
122   int           alen_mmsa;                     /* number of columns in merged MSA */
123   char          errbuf[eslERRBUFSIZE];         /* buffer for error messages */
124   char         *tmpstr;                        /* used if -v, for printing file names */
125   int         **usemeA = NULL;                 /* [0..nali_tot-1][0..alen]  used only if --rfonly enabled, for removing gap RF columns */
126   ESL_STOPWATCH *w  = NULL;                    /* for timing the merge, only used if -o enabled */
127   int           do_small;                      /* TRUE if --small, operate in special small memory mode, aln seq data is not stored */
128   int           do_rfonly;                     /* TRUE if --rfonly, output only non-gap RF columns (remove all insert columns) */
129   int          *ngapA = NULL;              /* [0..alen] number of insert gap columns to add after each alignment column when merging */
130   int          *nmisA = NULL;               /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
131   int          *neitherA = NULL;           /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
132 
133   /* output formatting, only relevant if -v */
134   char      *namedashes = NULL;                /* string of dashes, an underline */
135   int        ni;                               /* counter                        */
136   int        namewidth;                        /* max width of file name         */
137   int        tmp_len;                          /* current width of merged aln    */
138 
139   /* variables only used in small mode (--small) */
140   int           ngs_cur;                       /* number of GS lines in current alignment (only used if do_small) */
141   int           gs_exists = FALSE;             /* set to TRUE if do_small and any input aln has >= 1 GS line */
142   int           maxname, maxgf, maxgc, maxgr;  /* max length of seqname, GF tag, GC tag, GR tag in all input alignments */
143   int           maxname_cur, maxgf_cur, maxgc_cur, maxgr_cur; /* max length of seqname, GF tag, GC tag, GR tag in current input alignment */
144   int           margin;                        /* total margin length for output msa */
145 
146   /***********************************************
147    * Parse command line
148    ***********************************************/
149 
150   go = esl_getopts_Create(options);
151   if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
152       esl_opt_VerifyConfig(go)               != eslOK)
153     {
154       printf("Failed to parse command line: %s\n", go->errbuf);
155       esl_usage(stdout, argv[0], usage1);
156       esl_usage(stdout, argv[0], usage2);
157       printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
158       exit(1);
159     }
160 
161   if (esl_opt_GetBoolean(go, "-h") )
162     {
163       esl_banner(stdout, argv[0], banner);
164       esl_usage (stdout, argv[0], usage1);
165       esl_usage (stdout, argv[0], usage2);
166       puts("\n where options are:");
167       esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
168       exit(0);
169     }
170 
171   if(((! esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 2)) ||
172      ((  esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 1)))
173     {
174       printf("Incorrect number of command line arguments.\n");
175       esl_usage(stdout, argv[0], usage1);
176       esl_usage(stdout, argv[0], usage2);
177       printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
178       exit(1);
179     }
180 
181   if(esl_opt_GetBoolean(go, "--list")) {
182     listfile = esl_opt_GetArg(go, 1);
183   }
184   else {
185     msafile1 = esl_opt_GetArg(go, 1);
186     msafile2 = esl_opt_GetArg(go, 2);
187   }
188 
189   do_small  = (esl_opt_IsOn(go, "--small")) ? TRUE : FALSE;
190   do_rfonly = (esl_opt_IsOn(go, "--rfonly"))  ? TRUE : FALSE;
191 
192   /* open output file */
193   if (esl_opt_GetString(go, "-o") != NULL) {
194     if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
195       esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
196   } else ofp = stdout;
197 
198   if (esl_opt_IsOn(go, "--informat")) {
199     infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"));
200     if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
201     if (do_small && infmt != eslMSAFILE_PFAM) esl_fatal("small memory mode requires Pfam formatted alignments");
202   }
203   if (esl_opt_IsOn(go, "--outformat")) {
204     outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
205     if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --outformat", esl_opt_GetString(go, "--outformat"));
206     if (do_small && outfmt != eslMSAFILE_PFAM) esl_fatal("we can only output Pfam formatted alignments in small memory mode");
207   }
208   else outfmt = eslMSAFILE_STOCKHOLM;
209 
210   if (do_small) {
211     infmt  = eslMSAFILE_PFAM; /* this must be true, else we can't do small memory mode */
212     outfmt = eslMSAFILE_PFAM;
213   }
214 
215   do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
216   while (do_stall);
217 
218   /* determine file names to merge */
219   if(listfile != NULL) { /* read list file */
220     read_list_file(listfile, &alifile_list, &nalifile);
221     if(nalifile == 0) esl_fatal("Failed to read a single alignment file name from %s\n", listfile);
222   }
223   else { /* we're merging two alignment files from command-line */
224     nalifile = 2;
225     ESL_ALLOC(alifile_list, sizeof(char *) * nalifile);
226     if((status = esl_strdup(msafile1, -1, &(alifile_list[0]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile1, status);
227     if((status = esl_strdup(msafile2, -1, &(alifile_list[1]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile2, status);
228   }
229 
230   /* create and start stopwatch */
231   if(ofp != stdout) {
232     w = esl_stopwatch_Create();
233     esl_stopwatch_Start(w);
234   }
235 
236   if(esl_opt_GetBoolean(go, "-v")) {
237     /* determine the longest file name in alifile_list */
238     namewidth = 9; /* length of 'file name' */
239     for(fi = 0; fi < nalifile; fi++) {
240       if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error.");
241       namewidth = ESL_MAX(namewidth, strlen(tmpstr));
242       free(tmpstr);
243     }
244 
245     ESL_ALLOC(namedashes, sizeof(char) * (namewidth+1));
246     namedashes[namewidth] = '\0';
247     for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-';
248     fprintf(stdout, "# Reading %d alignment files...\n", nalifile);
249     fprintf(stdout, "#\n");
250     fprintf(stdout, "# %7s  %-*s  %7s  %9s  %9s  %13s  %8s\n", "",        namewidth,"",          "",        "",          "",            "",               "ncols");
251     fprintf(stdout, "# %7s  %-*s  %7s  %9s  %9s  %13s  %8s\n", "file #",  namewidth,"file name", "ali #",   "#seq/ali",  "ncols/ali",   "# seq total",    "required");
252     fprintf(stdout, "# %7s  %*s  %7s  %9s  %9s  %13s  %8s\n", "-------", namewidth, namedashes,  "-------", "---------", "---------",   "-------------", "--------");
253   }
254 
255   /* Allocate and initialize */
256   nalloc = chunksize;
257   ESL_ALLOC(msaA,   sizeof(ESL_MSA *) * nalloc);
258   ESL_ALLOC(alenA,  sizeof(int64_t) * nalloc);
259   if(do_rfonly) ESL_ALLOC(usemeA, sizeof(int *) * nalloc);
260 
261   /* abc handling is weird. We only use alphabet to define gap characters in this miniapp.
262    * msa's are actually read in text mode. Thus eslRNA suffices for anything.
263    */
264   if      (esl_opt_GetBoolean(go, "--amino"))   abc = esl_alphabet_Create(eslAMINO);
265   else if (esl_opt_GetBoolean(go, "--dna"))     abc = esl_alphabet_Create(eslDNA);
266   else if (esl_opt_GetBoolean(go, "--rna"))     abc = esl_alphabet_Create(eslRNA);
267   else if (do_small)                      	abc = esl_alphabet_Create(eslRNA); /* alphabet is only used to define gap characters, so (in this miniapp) we're okay specifying RNA for any alignment (even non-RNA ones) */
268   else                                          abc = esl_alphabet_Create(eslRNA); /* ditto */
269 
270   /****************************************************************
271    *  Read alignments one at a time, storing them all, separately *
272    ****************************************************************/
273 
274   ai = 0;
275   nseq_tot = 0;
276   maxname = maxgf = maxgc = maxgr = 0;
277   ESL_ALLOC(nali_per_file, sizeof(int) * nalifile);
278   esl_vec_ISet(nali_per_file, nalifile, 0);
279 
280   for (fi = 0; fi < nalifile; fi++) {
281 
282     if (do_small)
283       {
284 	status  = esl_msafile2_Open(alifile_list[fi], NULL, &afp2);
285 	if      (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
286 	else if (status != eslOK)        esl_fatal("Alignment file %s open failed with error %d\n", alifile_list[fi], status);
287       }
288     else
289       {
290 	status = esl_msafile_Open(NULL, alifile_list[fi], NULL, infmt, NULL, &afp);
291 	if (status != eslOK) esl_msafile_OpenFailure(afp, status);
292       }
293 
294 
295     /* while loop: while we have an alignment in current alignment file, (statement looks weird b/c we use a different function if --small) */
296     while((status = (do_small) ?
297 	   esl_msafile2_ReadInfoPfam(afp2, NULL, NULL, -1, NULL,NULL, &(msaA[ai]), &nseq_cur, &alen_cur, &ngs_cur, &maxname_cur, &maxgf_cur, &maxgc_cur, &maxgr_cur, NULL, NULL, NULL, NULL, NULL) :
298 	   esl_msafile_Read        (afp, &(msaA[ai]))) == eslOK) {
299 
300       if(msaA[ai]->rf == NULL) esl_fatal("Error, all alignments must have #=GC RF annotation; alignment %d of file %d does not (%s)\n", nali_per_file[fi], (fi+1), alifile_list[fi]);
301       msaA[ai]->abc = abc; /* msa's are read in text mode, so this is (currently) only used to define gap characters, it doesn't even have to be the correct alphabet. if --small, this is set as RNA regardless of input */
302 
303       if (do_small) {
304 	maxname = ESL_MAX(maxname, maxname_cur);
305 	maxgf   = ESL_MAX(maxgf, maxgf_cur);
306 	maxgc   = ESL_MAX(maxgc, maxgc_cur);
307 	maxgr   = ESL_MAX(maxgr, maxgr_cur);
308 	msaA[ai]->alen = alen_cur;
309 	if(ngs_cur > 0) gs_exists = TRUE;
310       }
311       else {
312 	nseq_cur = msaA[ai]->nseq;
313       }
314       alenA[ai] = msaA[ai]->alen; /* impt if --small and --rfonly, to remember total width of aln to expect in second pass */
315       nali_per_file[fi]++;
316       nseq_tot += nseq_cur;
317 
318       /* reallocate per-alignment data, if nec */
319       if((ai+1) == nalloc) {
320 	nalloc += chunksize;
321 	ESL_RALLOC(msaA,   tmp, sizeof(ESL_MSA *) * nalloc);
322 	ESL_RALLOC(alenA,  tmp, sizeof(int64_t) * nalloc);
323 	for(ai2 = ai+1; ai2 < nalloc; ai2++) { msaA[ai2] = NULL; }
324 	if(do_rfonly) {
325 	  ESL_RALLOC(usemeA, tmp, sizeof(int *) * nalloc);
326 	  for(ai2 = ai+1; ai2 < nalloc; ai2++) { usemeA[ai2] = NULL; }
327 	}
328       }
329 
330       /* either store consensus (non-gap RF) length (if first aln), or verify it is what we expect */
331       cur_clen = 0;
332       for(apos = 0; apos < (int) msaA[ai]->alen; apos++) {
333 	if(rfchar_is_nongap_nonmissing(msaA[ai]->abc, msaA[ai]->rf[apos])) cur_clen++;
334       }
335       if(ai == 0) { /* first alignment, store clen, allocate maxgap, maxmis */
336 	clen = cur_clen;
337 	ESL_ALLOC(maxgap, sizeof(int) * (clen+1));
338 	esl_vec_ISet(maxgap, (clen+1), 0);
339 	ESL_ALLOC(maxmis, sizeof(int) * (clen+1));
340 	esl_vec_ISet(maxmis, (clen+1), 0); /* these will all stay 0 unless we see '~' in the alignments */
341       }
342       else if(cur_clen != clen) {
343 	esl_fatal("Error, all alignments must have identical non-gap #=GC RF lengths; expected (RF length of first ali read): %d,\nalignment %d of file %d length is %d (%s))\n", clen, nali_per_file[fi], (fi+1), cur_clen, alifile_list[fi]);
344       }
345 
346 
347       if(do_rfonly) {
348 	/* Remove all columns that are gaps/missing data in the RF annotation, we keep an array of usemes,
349 	 * one per aln, in case of --small, so we know useme upon second pass of alignment files */
350 	ESL_ALLOC(usemeA[ai], sizeof(int) * (msaA[ai]->alen));
351 	for(apos = 0; apos < msaA[ai]->alen; apos++) { usemeA[ai][apos] = (rfchar_is_nongap_nonmissing(abc, msaA[ai]->rf[apos])) ? TRUE : FALSE; }
352 	if((status = esl_msa_ColumnSubset(msaA[ai], errbuf, usemeA[ai])) != eslOK) {
353 	  esl_fatal("status code: %d removing gap RF columns for msa %d from file %s:\n%s", status, (ai+1), alifile_list[fi], errbuf);
354 	}
355       }
356       else { /* --rfonly not enabled, determine max number inserts between each position */
357 	/* update_maxgap_and_maxmis checks to make sure the msa follows our rule for missing data and gaps:
358 	 * for all positions x..y b/t any two adjacent nongap RF positions (x-1 and y+1 are nongap RF positions)
359 	 * all missing data columns '~' must come before all gap positions ('.', '-', or '_')
360 	 */
361       	if((status = update_maxgap_and_maxmis(msaA[ai], errbuf, clen, msaA[ai]->alen, maxgap, maxmis)) != eslOK) esl_fatal(errbuf);
362       }
363       if(esl_opt_GetBoolean(go, "-v")) {
364 	if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error.");
365 	tmp_len = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1);
366 	fprintf(stdout, "  %7d  %-*s  %7d  %9d  %9" PRId64 "  %13d  %8d\n", (fi+1),  namewidth, tmpstr, (ai+1), nseq_cur, msaA[ai]->alen, nseq_tot, tmp_len);
367 	free(tmpstr);
368       }
369       ai++;
370     } /* end of while esl_msafile_Read() loop */
371 
372     if (do_small)
373       {
374 	if      (status == eslEFORMAT) esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf);
375 	else if (status == eslEINVAL)  esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf);
376 	else if (status != eslEOF)     esl_fatal("Alignment file %s, read failed with error code %d\n", alifile_list[fi], status);
377       }
378     else if (status != eslEOF) esl_msafile_ReadFailure(afp, status);
379 
380     if(nali_per_file[fi] == 0)     esl_fatal("Failed to read any alignments from file %s\n", alifile_list[fi]);
381 
382     if (do_small) esl_msafile2_Close(afp2);
383     else          esl_msafile_Close(afp);
384   } /* end of for (fi=0; fi < nalifile; fi++) */
385   nali_tot = ai;
386 
387   /*********************************************
388    *  Merge all alignments into the merged MSA *
389    *********************************************/
390 
391   /* We allocate space for all sequences, but leave sequences as NULL (nseq = -1).
392    * If (do_small) we didn't store the sequences on the first pass through the
393    * alignment files, and we'll never allocate space for the sequences in mmsa,
394    * we'll just output them as we reread them on another pass through the
395    * individual alignments. If we read >= 1 GS line in any of the input alignments,
396    * we need to do an additional pass through the files, outputting only GS
397    * data. Then, in a final (3rd) pass we'll output aligned data.
398    *
399    * if (!do_small),  we have the sequences in memory, we'll copy these
400    * to the merged alignment, freeing them in the orignal msaA alignments
401    * as we go so we never need to allocate the full mmsa while we still have
402    * the individual msas (in msaA[]) in memory.
403    */
404   mmsa = esl_msa_Create(nseq_tot, -1);
405   alen_mmsa = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1);
406 
407   /* Determine what annotation from the input alignments
408    * we will include in the merged MSA.
409    * See comments in header of validate_and_copy_msa_annotation()
410    * for rules on what we include.
411    */
412   if((status = validate_and_copy_msa_annotation(go, outfmt, mmsa, msaA, nali_tot, clen, alen_mmsa, maxgap, maxmis, errbuf)) != eslOK)
413     esl_fatal("Error while checking and copying individual MSA annotation to merged MSA:%s\n", errbuf);
414 
415   if (do_small) {
416     /* Small memory mode, do up to 2 more passes through the input alignments:
417      * pass 2 will output only GS data at top of output alignment file (only performed if >= 1 GS line read in input files
418      * pass 3 will output all aligned data at to output alignment file (always performed)
419      */
420 
421     /* output header, comments, and #=GF data */
422     write_pfam_msa_top(ofp, mmsa);
423 
424     if(ofp != stdout) {
425       if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n"); }
426       fprintf(stdout, "# Outputting merged alignment to file %s ... ", esl_opt_GetString(go, "-o"));
427       fflush(stdout);
428     }
429 
430     /* if there was any GS annotation in any of the individual alignments,
431      * do second pass through alignment files, outputting GS annotation as we go. */
432     if(gs_exists) {
433       ai = 0;
434       for(fi = 0; fi < nalifile; fi++) {
435 	/* we're in small memory mode... */
436 	status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */
437 	if      (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
438 	else if (status == eslEFORMAT)   esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]);
439 	else if (status != eslOK)        esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status);
440 
441 	for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) {
442 	  status = esl_msafile2_RegurgitatePfam(afp2, ofp,
443 						maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag (irrelevant here) */
444 						FALSE, /* regurgitate stockholm header ? */
445 						FALSE, /* regurgitate // trailer ? */
446 						FALSE, /* regurgitate blank lines */
447 						FALSE, /* regurgitate comments */
448 						FALSE, /* regurgitate GF ? */
449 						TRUE,  /* regurgitate GS ? */
450 						FALSE, /* regurgitate GC ? */
451 						FALSE, /* regurgitate GR ? */
452 						FALSE, /* regurgitate aseq ? */
453 						NULL,  /* regurgitate all seqs, not a subset */
454 						NULL,  /* regurgitate all seqs, not a subset */
455 						NULL,  /* we're not keeping a subset of columns */
456 						NULL,  /* we're not adding all gap columns */
457 						alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */
458 						'.',   /* gap char, irrelevant */
459 						NULL,  /* don't return num seqs read */
460 						NULL); /* don't return num seqs regurgitated */
461 	  if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]);
462 	  if(status != eslOK)  esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf);
463 	  ai++;
464 	  fflush(ofp);
465 	}
466 	esl_msafile2_Close(afp2);
467       }
468       fprintf(ofp, "\n"); /* a single blank line to separate GS annotation from aligned data */
469     }
470 
471     /* do another (either second or third) pass through alignment files, outputting aligned sequence data (and GR) as we go */
472     ai = 0;
473     for(fi = 0; fi < nalifile; fi++) {
474       status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */
475       if      (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
476       else if (status == eslEFORMAT)   esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]);
477       else if (status != eslOK)        esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status);
478 
479       for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) {
480 	/* determine how many all gap columns to insert after each alignment position
481 	 * of the child msa when copying it to the merged msa */
482 	if(! do_rfonly) {
483 	  if((status = determine_gap_columns_to_add(msaA[ai], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
484 	    esl_fatal("error determining number of all gap columns to add to alignment %d of file %s", ai2, alifile_list[fi]);
485 	}
486 	status = esl_msafile2_RegurgitatePfam(afp2, ofp,
487 					      maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag */
488 					      FALSE, /* regurgitate stockholm header ? */
489 					      FALSE, /* regurgitate // trailer ? */
490 					      FALSE, /* regurgitate blank lines */
491 					      FALSE, /* regurgitate comments */
492 					      FALSE, /* regurgitate GF ? */
493 					      FALSE, /* regurgitate GS ? */
494 					      FALSE, /* regurgitate GC ? */
495 					      TRUE,  /* regurgitate GR ? */
496 					      TRUE,  /* regurgitate aseq ? */
497 					      NULL,  /* regurgitate all seqs, not a subset */
498 					      NULL,  /* regurgitate all seqs, not a subset */
499 					      (do_rfonly) ? usemeA[ai] : NULL,
500 					      (do_rfonly) ? NULL       : neitherA,
501 					      alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */
502 					      '.',
503 					      NULL,  /* don't return num seqs read */
504 					      NULL); /* don't return num seqs regurgitated */
505 
506 	if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]);
507 	if(status != eslOK)  esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf);
508 	free(ngapA);
509 	free(nmisA);
510 	free(neitherA);
511 	esl_msa_Destroy(msaA[ai]);
512 	msaA[ai] = NULL;
513 	ai++;
514 	fflush(ofp);
515       }
516       esl_msafile2_Close(afp2);
517     }
518     /* finally, write GC annotation and '//' line */
519     margin = maxname + 1;
520     if (maxgc > 0 && maxgc+6 > margin) margin = maxgc+6;
521     if (maxgr > 0 && maxname+maxgr+7 > margin) margin = maxname+maxgr+7;
522     write_pfam_msa_gc(ofp, mmsa, margin);
523   } /* end of if(do_small) */
524 
525   else { /* ! do_small: for each input alignment in msaA[], add all aligned data to mmsa, then free it  */
526     if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n# Merging alignments ... "); fflush(stdout); }
527     for(ai = 0; ai < nali_tot; ai++) {
528       if((status = add_msa(mmsa, errbuf, msaA[ai], maxgap, maxmis, clen, alen_mmsa)) != eslOK)
529 	esl_fatal("Error, merging alignment %d of %d:\n%s.", (ai+1), nali_tot, errbuf);
530       esl_msa_Destroy(msaA[ai]); /* note: the aligned sequences will have already been freed in add_msa() */
531       msaA[ai] = NULL;
532     }
533     if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "done.\n#\n"); fflush(stdout); }
534     mmsa->alen = alen_mmsa; /* it was -1, b/c we filled in each seq as we marched through each msaA[] alignment */
535     if(ofp != stdout) { fprintf(stdout, "# Saving alignment to file %s ... ", esl_opt_GetString(go, "-o")); }
536     status = esl_msafile_Write(ofp, mmsa, outfmt);
537     if(status != eslOK) esl_fatal("Error, during alignment output; status code: %d\n", status);
538   }
539   if(ofp != stdout) {
540     fflush(stdout);
541     fclose(ofp);
542     fprintf(stdout, "done\n#\n");
543     fflush(stdout);
544     esl_stopwatch_Stop(w);
545     esl_stopwatch_Display(stdout, w, "# CPU time: ");
546   }
547 
548   /* clean up and exit */
549   if(alifile_list != NULL) {
550     for(fi = 0; fi < nalifile; fi++) {
551       if(alifile_list[fi] != NULL) free(alifile_list[fi]);
552     }
553     free(alifile_list);
554   }
555   if(usemeA != NULL) {
556     for(ai = 0; ai < nali_tot; ai++) {
557       free(usemeA[ai]);
558     }
559     free(usemeA);
560   }
561 
562   if(nali_per_file != NULL) free(nali_per_file);
563   if(alenA != NULL)         free(alenA);
564   if(namedashes != NULL)    free(namedashes);
565   if(msaA != NULL)          free(msaA);
566   if(maxgap != NULL)        free(maxgap);
567   if(maxmis != NULL)        free(maxmis);
568   if(mmsa != NULL)          esl_msa_Destroy(mmsa);
569   if(abc  != NULL)          esl_alphabet_Destroy(abc);
570   if(w    != NULL)          esl_stopwatch_Destroy(w);
571   esl_getopts_Destroy(go);
572   return 0;
573 
574  ERROR:
575   esl_fatal("Out of memory. Reformat to Pfam with esl-reformat and try esl-alimerge --small.");
576   return eslEMEM; /*NEVERREACHED*/
577 }
578 
579 /* Function: read_list_file
580  * Date:     EPN, Fri Nov 20 16:41:32 2009
581  *
582  * Read a file listing alignment files to merge.
583  * Store file names in *ret_alifile_list and return it,
584  * return number of files in ret_nalifile and return it.
585  * Each white-space delimited token is considered a
586  * different alignment name.
587  *
588  * Dies if we encounter an error.
589  *
590  * Returns: void.
591  */
592 void
read_list_file(char * listfile,char *** ret_alifile_list,int * ret_nalifile)593 read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile)
594 {
595   int             status;
596   ESL_FILEPARSER *efp;
597   char           *tok;
598   int nalloc     = 10;
599   int chunksize  = 10;
600   char **alifile_list = NULL;
601   int n = 0;
602   void *tmp;
603 
604   ESL_ALLOC(alifile_list, sizeof(char *) * nalloc);
605   status = esl_fileparser_Open(listfile, NULL,  &efp);
606   if     (status == eslENOTFOUND) esl_fatal("List file %s does not exist or is not readable\n", listfile);
607   else if(status == eslEMEM)      esl_fatal("Ran out of memory when opening list file %s\n", listfile);
608   else if(status != eslOK)        esl_fatal("Error opening list file %s\n", listfile);
609 
610   esl_fileparser_SetCommentChar(efp, '#');
611   while((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslEOF) {
612     if(n == nalloc) { nalloc += chunksize; ESL_RALLOC(alifile_list, tmp, sizeof(char *) * nalloc); }
613     if((status = esl_strdup(tok, -1, &(alifile_list[n++]))) != eslOK) {
614       esl_fatal("Error storing alignment file name while reading list file %s, error status: %d\n", listfile, status);
615     }
616   }
617   esl_fileparser_Close(efp);
618   *ret_alifile_list = alifile_list;
619   *ret_nalifile = n;
620 
621   return;
622 
623  ERROR:
624   esl_fatal("Out of memory.");
625   return; /*NOTREACHED*/
626 }
627 
628 
629 /* Function: update_maxgap_and_maxmis
630  * Date:     EPN, Sun Nov 22 09:40:48 2009
631  *           (stolen from Infernal's cmalign.c)
632  *
633  * Update maxgap[] and maxmis[], arrays that keeps track of the max
634  * number of gap columns and missing data columns ('~' gap #=GC RF)
635  * columns before each cpos (consensus (non-gap #=GC RF) column)
636  *
637  * We require that all missing columns between cpos x and x+1 occur before
638  * all gap columns between cpos x and x+1.
639  *
640  * Consensus columns are index [0..cpos..clen].
641  *
642  * max{gap,mis}[0]      is number of gap/missing columns before 1st cpos.
643  * max{gap,mis}[clen-1] is number of gap/missing columns before final cpos.
644  * max{gap,mis}[clen]   is number of gap/missing columns after  final cpos.
645  *
646  * Returns: eslOK on success.
647  *          eslEINVAL if msa->rf is NULL, nongap RF length is not clen of
648  *                    for any two cpos x and x+1, a gap column precedes a
649  *                    missing data column.
650  *
651  */
652 int
update_maxgap_and_maxmis(ESL_MSA * msa,char * errbuf,int clen,int64_t alen,int * maxgap,int * maxmis)653 update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis)
654 {
655   int apos;
656   int cpos = 0;
657   int ngap = 0;
658   int nmis = 0;
659 
660   for(apos = 0; apos < alen; apos++) {
661     if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) {
662       ngap++;
663     }
664     else if (esl_abc_CIsMissing(msa->abc, msa->rf[apos])) {
665       nmis++;
666       if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap);
667     }
668     else {
669       maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap);
670       maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis);
671       cpos++;
672       ngap = 0;
673       nmis = 0;
674     }
675   }
676 
677   /* update final value, max{ins,el}[clen+1], the number of inserts
678    * after the final consensus position */
679   maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap);
680   maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis);
681   if(cpos != clen) ESL_FAIL(eslEINVAL, errbuf, "second pass expected clen (%d) not equal to actual clen (%d).\n", clen, cpos);
682 
683   return eslOK;
684 }
685 
686 /* Function: validate_and_copy_msa_annotation
687  * Date:     EPN, Tue Nov 24 05:35:50 2009
688  *
689  * Decide what individual MSA annotation from
690  * the input alignments in msaA[], if any, will be
691  * included in the merged alignment (mmsa) and
692  * add that info to it.
693  *
694  * Rules for what to include in mmsa:
695  *
696  * We include name,desc,acc,author annotation in merged alignment
697  * if it is identical in all msaA[] input alignments.
698  *
699  * We include comments and per-file (GF) annotation if they
700  * are present and identical in all input msaA[] alignments.
701  *
702  * We include per-column (GC) annotation if it is present and
703  * identical *with-respect-to* #=GC RF annotation AND all
704  * the annotation in gap  #=GC RF columns in all msaA[] are gaps
705  * ('.'). This also pertains to the following parsed per-column
706  * annotation: ss_cons, sa_cons, pp_cons, and rf. With rf,
707  * de-gapped rf annotation must be identical in all input
708  * alignments period, if it is not we'll die with an error message.
709  *
710  * Per-sequence information and per-residue information is always
711  * included in merged alignment. This is done by add_msa() function.
712  *
713  * Returns: eslOK on success.
714  *          eslEINCONCEIVABLE if input/msa is corrupt in some way (example: ngf>0 but gf_tag[0] == NULL)
715  *          eslEMEM on memory error
716  *          if !eslOK, errbuf is filled before return
717  */
718 int
validate_and_copy_msa_annotation(const ESL_GETOPTS * go,int outfmt,ESL_MSA * mmsa,ESL_MSA ** msaA,int nmsa,int clen,int alen_merged,int * maxgap,int * maxmis,char * errbuf)719 validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int nmsa, int clen, int alen_merged, int *maxgap, int *maxmis, char *errbuf)
720 {
721   int status;
722   int j;                    /* counter over alignment annotations */
723   int j2;                   /* counter over alignment annotations */
724   int ai;                   /* counter over alignments */
725   char *dealigned   = NULL; /* a temporary, dealigned string */
726   char *dealigned2  = NULL; /* another temporary, dealigned string */
727   char *gapped_out  = NULL; /* a temporary string with gaps added to fit into merged aln */
728   int *ngapA    = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */
729   int *nmisA     = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
730   int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
731   int do_add;
732   int found_tag;
733   int be_verbose = FALSE;
734 
735   /* we only print info about annotation if -v AND we'll actually
736    * output it (as stockholm or pfam)
737    * (we actually don't even need to be in this function if we're
738    * not output in stockholm or pfam...)
739    */
740   if((esl_opt_GetBoolean(go, "-v")) &&
741      (outfmt == eslMSAFILE_STOCKHOLM || outfmt == eslMSAFILE_PFAM))
742     { be_verbose = TRUE; }
743 
744   if(be_verbose) fprintf(stdout, "#\n");
745 
746   if(nmsa == 0) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "in validate_and_copy_msa_annotation(): zero child alignments.");
747 
748   /* First, determine how many all gap columns to insert after each alignment position
749    * of the first child msa, so we can (possibly) gap out GC,SS_cons,SA_cons,PP_cons annotation
750    * to appropriate length when adding it to the merged MSA. */
751   if((status = determine_gap_columns_to_add(msaA[0], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
752     return status;
753 
754   /* Note: esl_strcmp() can handle NULL strings (they are not identical to non-NULL strings) */
755 
756   /*********************************************************************/
757   /* Check if name annotation is identical in all alignments */
758   do_add = TRUE; /* until proven otherwise */
759   if(msaA[0]->name != NULL) {
760     for(ai = 1; ai < nmsa; ai++) {
761       if(esl_strcmp(msaA[0]->name, msaA[ai]->name) != 0) { do_add = FALSE; break; }
762     }
763     if(do_add) {
764       if(be_verbose) fprintf(stdout, "# Identical name annotation from all alignments transferred to merged alignment.\n");
765       if((status = esl_strdup(msaA[0]->name, -1, &(mmsa->name))) != eslOK) goto ERROR;
766     }
767     else if(be_verbose) fprintf(stdout, "# Name annotation is not identical in all alignments; not included in merged alignment.\n");
768   }
769   else if(be_verbose) fprintf(stdout, "# Name annotation absent from (at least) first alignment; not included in merged alignment.\n");
770 
771   /*********************************************************************/
772   /* Check if description annotation is identical in all alignments */
773   do_add = TRUE; /* until proven otherwise */
774   if(msaA[0]->desc != NULL) {
775     for(ai = 1; ai < nmsa; ai++) {
776       if(esl_strcmp(msaA[0]->desc, msaA[ai]->desc) != 0) { do_add = FALSE; break; }
777     }
778     if(do_add) {
779       if(be_verbose) fprintf(stdout, "# Identical description annotation from all alignments transferred to merged alignment.\n");
780       if((status = esl_strdup(msaA[0]->desc, -1, &(mmsa->desc))) != eslOK) goto ERROR;
781     }
782     else if(be_verbose) fprintf(stdout, "# Description annotation is not identical in all alignments; not included in merged alignment.\n");
783   }
784   else if(be_verbose) fprintf(stdout, "# Description annotation absent from (at least) first alignment; not included in merged alignment.\n");
785 
786   /*********************************************************************/
787   /* Check if accession annotation is identical in all alignments */
788   do_add = TRUE; /* until proven otherwise */
789   if(msaA[0]->acc != NULL) {
790     for(ai = 1; ai < nmsa; ai++) {
791       if(esl_strcmp(msaA[0]->acc, msaA[ai]->acc) != 0) { do_add = FALSE; break; }
792     }
793     if(do_add) {
794       if(be_verbose) fprintf(stdout, "# Identical accession annotation from all alignments transferred to merged alignment.\n");
795       if((status = esl_strdup(msaA[0]->acc, -1, &(mmsa->acc))) != eslOK) goto ERROR;
796     }
797     else if(be_verbose) fprintf(stdout, "# Accession annotation is not identical in all alignments; not included in merged alignment.\n");
798   }
799   else if(be_verbose) fprintf(stdout, "# Accession annotation absent from (at least) first alignment; not included in merged alignment.\n");
800 
801   /*********************************************************************/
802   /* Check if author annotation is identical in all alignments */
803   do_add = TRUE; /* until proven otherwise */
804   if(msaA[0]->au != NULL) {
805     for(ai = 1; ai < nmsa; ai++) {
806       if(esl_strcmp(msaA[0]->au, msaA[ai]->au) != 0) { do_add = FALSE; break; }
807     }
808     if(do_add) {
809       if(be_verbose) fprintf(stdout, "# Identical author annotation from all alignments transferred to merged alignment.\n");
810       if((status = esl_strdup(msaA[0]->au, -1, &(mmsa->au))) != eslOK) goto ERROR;
811     }
812     else if(be_verbose) fprintf(stdout, "# Author annotation is not identical in all alignments; not included in merged alignment.\n");
813   }
814   else if(be_verbose) fprintf(stdout, "# Author annotation absent from (at least) first alignment; not included in merged alignment.\n");
815 
816   /*********************************************************************/
817   /* Check per-file (GF) annotation, must be present and identical in all msaA[] alignments to be included */
818   if(msaA[0]->ngf > 0) {
819     for(j = 0; j < msaA[0]->ngf; j++) {
820       do_add = TRUE; /* until proven otherwise */
821       /* verify that what we think is true is true */
822       if(msaA[0]->gf_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf);
823       if(msaA[0]->gf[j]    == NULL)  ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf);
824       for(ai = 1; ai < nmsa; ai++) {
825 	found_tag = FALSE;
826 	for(j2 = 0; j2 < msaA[ai]->ngf; j2++) {
827 	  if(esl_strcmp(msaA[0]->gf_tag[j], msaA[ai]->gf_tag[j2]) == 0) {
828 	    found_tag = TRUE;
829 	    if(esl_strcmp(msaA[0]->gf[j], msaA[ai]->gf_tag[j2]) != 0) {
830 	      do_add = FALSE;
831 	    }
832 	    break; /* if we found a match, do_add remains TRUE */
833 	  }
834 	}
835 	if(found_tag && do_add) {
836 	  if(be_verbose) fprintf(stdout, "# Identical GF tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gf_tag[j]);
837 	  if((status = esl_msa_AddGF(mmsa, msaA[0]->gf_tag[j], -1,  msaA[0]->gf[j], -1)) != eslOK) goto ERROR;
838 	}
839 	else {
840 	  if(be_verbose) fprintf(stdout, "# GF tag %s annotation from first alignment absent from >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gf_tag[j]);
841 	}
842       }
843     }
844   }
845   else if(be_verbose) fprintf(stdout, "# Unparsed GF annotation absent from (at least) first alignment; not included in merged alignment.\n");
846 
847   /*********************************************************************/
848   /* Check comments, all must be identically ordered and identical in all msaA[] aligments to include them */
849   if(msaA[0]->ncomment > 0) {
850     do_add = TRUE; /* until proven otherwise */
851     /* make sure all alignments have same number of comments */
852     for(ai = 1; ai < nmsa; ai++) {
853       if(msaA[ai]->ncomment != msaA[0]->ncomment) {
854 	do_add = FALSE;
855 	break;
856       }
857     }
858     if(do_add) {
859       /* make sure all alignments have identical comments */
860       for(j = 0; j < msaA[0]->ncomment; j++) {
861 	/* verify that what we think is true is true */
862 	if(msaA[0]->comment[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, comment %d of msaA[0] is NULL, but msaA[0]->ncomment is %d.\n", j, msaA[0]->ncomment);
863 	for(ai = 1; ai < nmsa; ai++) {
864 	  if(esl_strcmp(msaA[0]->comment[j], msaA[ai]->comment[j]) != 0) { /* comment doesn't match */
865 	    do_add = FALSE;
866 	    break;
867 	  }
868 	}
869       }
870     }
871     if(do_add) {
872       for(j = 0; j < msaA[0]->ncomment; j++) {
873 	if((status = esl_msa_AddComment(mmsa, msaA[0]->comment[j], -1))!= eslOK) goto ERROR;
874       }
875       if(be_verbose) fprintf(stdout, "# All alignments have identical comments in the same order. These were transferred to merged alignment.\n");
876     }
877     else {
878       if(be_verbose) fprintf(stdout, "# Comments are not identical in all alignments; not included in merged alignment.\n");
879     }
880   }
881   else if(be_verbose) fprintf(stdout, "# No comments in (at least) first alignment; not included in merged alignment.\n");
882 
883   /*********************************************************************/
884   /* Check unparsed per-column (GC) annotation, it must include all gaps in gap RF columns and
885    * be identical once gap RF columns are removed in all msaA[] alignments to be included. */
886 
887   if(msaA[0]->ngc > 0) {
888     for(j = 0; j < msaA[0]->ngc; j++) {
889       do_add = TRUE; /* until proven otherwise */
890       /* verify that what we think is true is true */
891       if(msaA[0]->gc_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc);
892       if(msaA[0]->gc[j]     == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc);
893 
894       /* ensure it does not have non-gaps in gap RF columns */
895       if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->gc[j], msaA[0]->alen)) { /* returns TRUE if gc[j] has 0 non-gap characters in gap columns of RF annotation */
896 	/* dealign gc line */
897 	if((status = esl_strdup(msaA[0]->gc[j], msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
898 	if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[0]", msaA[0]->gc_tag[j]);
899 
900 	for(ai = 1; ai < nmsa; ai++) {
901 	  found_tag = FALSE;
902 	  for(j2 = 0; j2 < msaA[ai]->ngc; j2++) {
903 	    if(esl_strcmp(msaA[0]->gc_tag[j], msaA[ai]->gc_tag[j2]) == 0) {
904 	      found_tag = TRUE;
905 	      /* ensure it does not have non-gaps in gap RF columns */
906 	      if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->gc[j2], msaA[ai]->alen)) { /* returns TRUE if gc[j2] has 0 non-gap characters in gap columns of RF annotation */
907 		/* dealign */
908 		if((status = esl_strdup(msaA[ai]->gc[j2], msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
909 		if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[%d]", msaA[ai]->gc_tag[j2], ai);
910 		/* check identity */
911 		if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
912 		free(dealigned2);
913 		dealigned2 = NULL;
914 		break; /* if we matched, do_add remains TRUE */
915 	      }
916 	    }
917 	  }
918 	} /* end of (for(ai = 1...)) */
919 	if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
920 	if(found_tag && do_add) {
921 	  /* gap out the the GC annotation to fit in merged alignment */
922 	  if((status = inflate_string_with_gaps_and_missing(msaA[0]->gc[j], msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create GC tag %s annotation for merged alignment.", msaA[0]->gc_tag[j]);
923 	  if((status = esl_msa_AppendGC(mmsa, msaA[0]->gc_tag[j], gapped_out)) != eslOK) goto ERROR;
924 	  free(gapped_out);
925 	  gapped_out = NULL;
926 	  if(be_verbose) fprintf(stdout, "# Identical GC tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gc_tag[j]);
927 	}
928 	else {
929 	  if(be_verbose) fprintf(stdout, "# GC tag %s annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gc_tag[j]);
930 	}
931       }
932     } /* end of for(j = 0 j < msaA[0]->ngc... */
933   } /* end of if(msaA[0]->ngc > 0) */
934   else if(be_verbose) fprintf(stdout, "# Unparsed GC annotation absent from (at least) first alignment; not included in merged alignment.\n");
935 
936   /*********************************************************************/
937   /* Check ss_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
938    * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
939   if(msaA[0]->ss_cons != NULL) {
940     do_add = TRUE; /* until proven otherwise */
941     /* ensure it does not have non-gaps in gap RF columns */
942     if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->ss_cons, msaA[0]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */
943       /* dealign ss_cons */
944       if((status = esl_strdup(msaA[0]->ss_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
945       if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[0]");
946       for(ai = 1; ai < nmsa; ai++) {
947 	if(msaA[ai]->ss_cons == NULL) {
948 	  do_add = FALSE;
949 	  break;
950 	}
951 	/* ss_cons != NULL, ensure it does not have non-gaps in gap RF columns */
952 	if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->ss_cons, msaA[ai]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */
953 	  /* dealign */
954 	  if((status = esl_strdup(msaA[ai]->ss_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
955 	  if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[%d]", ai);
956 	  /* check identity */
957 	  if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
958 	  free(dealigned2);
959 	  dealigned2 = NULL;
960 	  break; /* if we matched, do_add remains TRUE */
961 	}
962       } /* end of (for(ai = 1...)) */
963       if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
964       if(do_add) {
965 	/* gap out the the ss_cons to fit in merged alignment,
966 	 * as a special case, we use '~' in SS_cons as we add missing data and gaps
967 	 * like we do in RF (for all other annotation we only add gaps) */
968 	if((status = inflate_string_with_gaps_and_missing(msaA[0]->ss_cons, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SS_cons annotation for merged alignment.");
969 	if(mmsa->ss_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SS_cons to merged alignment, it is already non-NULL.");
970 	if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->ss_cons))) != eslOK) goto ERROR;
971 	free(gapped_out);
972 	gapped_out = NULL;
973 	if(be_verbose) fprintf(stdout, "# Identical SS_cons annotation from all alignments transferred to merged alignment.\n");
974       }
975       else {
976 	if(be_verbose) fprintf(stdout, "# SS_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
977       }
978     }
979   } /* end of if(msaA[0]->ss_cons != NULL) */
980   else if(be_verbose) fprintf(stdout, "# SS_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
981 
982   /*********************************************************************/
983   /* Check sa_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
984    * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
985   if(msaA[0]->sa_cons != NULL) {
986     do_add = TRUE; /* until proven otherwise */
987     /* ensure it does not have non-gaps in gap RF columns */
988     if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->sa_cons, msaA[0]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */
989       /* dealign sa_cons */
990       if((status = esl_strdup(msaA[0]->sa_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
991       if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[0]");
992       for(ai = 1; ai < nmsa; ai++) {
993 	if(msaA[ai]->sa_cons == NULL) {
994 	  do_add = FALSE;
995 	  break;
996 	}
997 	/* sa_cons != NULL, ensure it does not have non-gaps in gap RF columns */
998 	if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->sa_cons, msaA[ai]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */
999 	  /* dealign */
1000 	  if((status = esl_strdup(msaA[ai]->sa_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1001 	  if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[%d]", ai);
1002 	  /* check identity */
1003 	  if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
1004 	  free(dealigned2);
1005 	  dealigned2 = NULL;
1006 	  break; /* if we matched, do_add remains TRUE */
1007 	}
1008       } /* end of (for(ai = 1...)) */
1009       if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
1010       if(do_add) {
1011 	/* gap out the the sa_cons to fit in merged alignment */
1012 	if((status = inflate_string_with_gaps_and_missing(msaA[0]->sa_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SA_cons annotation for merged alignment.");
1013 	if(mmsa->sa_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SA_cons to merged alignment, it is already non-NULL.");
1014 	if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->sa_cons))) != eslOK) goto ERROR;
1015 	free(gapped_out);
1016 	gapped_out = NULL;
1017 	if(be_verbose) fprintf(stdout, "# Identical SA_cons annotation from all alignments transferred to merged alignment.\n");
1018       }
1019       else {
1020 	if(be_verbose) fprintf(stdout, "# SA_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
1021       }
1022     }
1023   } /* end of if(msaA[0]->sa_cons != NULL) */
1024   else if(be_verbose) fprintf(stdout, "# SA_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
1025 
1026   /*********************************************************************/
1027   /* Check pp_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
1028    * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
1029   if(msaA[0]->pp_cons != NULL) {
1030     do_add = TRUE; /* until proven otherwise */
1031     /* ensure it does not have non-gaps in gap RF columns */
1032     if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->pp_cons, msaA[0]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */
1033       /* dealign pp_cons */
1034       if((status = esl_strdup(msaA[0]->pp_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
1035       if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[0]");
1036       for(ai = 1; ai < nmsa; ai++) {
1037 	if(msaA[ai]->pp_cons == NULL) {
1038 	  do_add = FALSE;
1039 	  break;
1040 	}
1041 	/* pp_cons != NULL, ensure it does not have non-gaps in gap RF columns */
1042 	if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->pp_cons, msaA[ai]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */
1043 	  /* dealign */
1044 	  if((status = esl_strdup(msaA[ai]->pp_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1045 	  if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[%d]", ai);
1046 	  /* check identity */
1047 	  if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
1048 	  free(dealigned2);
1049 	  dealigned2 = NULL;
1050 	  break; /* if we matched, do_add remains TRUE */
1051 	}
1052       } /* end of (for(ai = 1...)) */
1053       if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
1054       if(do_add) {
1055 	/* gap out the the pp_cons to fit in merged alignment */
1056 	if((status = inflate_string_with_gaps_and_missing(msaA[0]->pp_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create PP_cons annotation for merged alignment.");
1057 	if(mmsa->pp_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding PP_cons to merged alignment, it is already non-NULL.");
1058 	if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->pp_cons))) != eslOK) goto ERROR;
1059 	free(gapped_out);
1060 	gapped_out = NULL;
1061 	if(be_verbose) fprintf(stdout, "# Identical PP_cons annotation from all alignments transferred to merged alignment.\n");
1062       }
1063       else {
1064 	if(be_verbose) fprintf(stdout, "# PP_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
1065       }
1066     }
1067   } /* end of if(msaA[0]->pp_cons != NULL) */
1068   else if(be_verbose) fprintf(stdout, "# PP_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
1069 
1070   /*********************************************************************/
1071   /* Finally, validate that RF annotation is identical in all alignments after removing gaps. */
1072 
1073   if(msaA[0]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation.");
1074   /* dealign rf */
1075   if((status = esl_strdup(msaA[0]->rf, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
1076   if((status = esl_strdealign(dealigned, dealigned, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[0]");
1077   for(ai = 1; ai < nmsa; ai++) {
1078     if(msaA[ai]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation.");
1079     /* dealign */
1080     if((status = esl_strdup(msaA[ai]->rf, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1081     if((status = esl_strdealign(dealigned2, dealigned2, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[%d]", ai);
1082     /* check identity */
1083     if(esl_strcmp(dealigned, dealigned2) != 0) {
1084       printf("%s\n%s\n", dealigned, dealigned2);
1085       ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have identical #=GC RF annotation, once gaps (\".\",\"-\",\"_\") are removed.\nAlignment %d de-gapped RF annotation differs from that of alignment 1.\n%s\n%s", ai+1, dealigned, dealigned2);
1086     }
1087     if(dealigned2 != NULL) { free(dealigned2); dealigned2 = NULL; }
1088   }
1089   if(dealigned  != NULL) { free(dealigned);  dealigned = NULL; }
1090   /* gap out the the RF to fit in merged alignment */
1091   if((status = inflate_string_with_gaps_and_missing(msaA[0]->rf, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create RF annotation for merged alignment.");
1092   if(mmsa->rf != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding RF to merged alignment, it is already non-NULL.");
1093   if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->rf))) != eslOK) goto ERROR;
1094   free(gapped_out);
1095   gapped_out = NULL;
1096   if(be_verbose) fprintf(stdout, "# Identical RF annotation from all alignments transferred to merged alignment.\n");
1097 
1098   if(dealigned  != NULL) free(dealigned);
1099   if(dealigned2 != NULL) free(dealigned2);
1100   if(gapped_out != NULL) free(gapped_out);
1101   if(ngapA      != NULL) free(ngapA);
1102   if(nmisA      != NULL) free(nmisA);
1103   if(neitherA   != NULL) free(neitherA);
1104   return eslOK;
1105 
1106  ERROR:
1107   if(dealigned  != NULL) free(dealigned);
1108   if(dealigned2 != NULL) free(dealigned2);
1109   if(gapped_out != NULL) free(gapped_out);
1110   if(ngapA      != NULL) free(ngapA);
1111   if(nmisA      != NULL) free(nmisA);
1112   if(neitherA   != NULL) free(neitherA);
1113   return status;
1114 }
1115 
1116 /* Function: add_msa
1117  * Date:     EPN, Mon Nov 23 05:54:37 2009
1118  *
1119  * Add a "child" MSA we read from a file to the merged
1120  * MSA - the merged alignment that we'll eventually
1121  * output. We free each string in the child as soon
1122  * as we've added it to the merged, to save memory.
1123  *
1124  * We add all sequence data (aseq), and per sequence
1125  * annotation, including sqname, sqdesc, sqacc, pp, ss,
1126  * sa, as well as non-parsed GS and GR annotation.
1127  *
1128  * <maxgap>[0..clen] is an array specifying the
1129  * number of inserted columns necessary between
1130  * each consensus position.
1131  *
1132  * max{gap,mis}[0]      is number of gaps/missing columns before 1st cpos.
1133  * max{gap,mis}[clen-1] is number of gaps/missing columns before final cpos.
1134  * max{gap,mis}[clen]   is number of gaps/missing columns after  final cpos.
1135  *
1136  * <alen_merged> is the number of columns in the merged
1137  * alignment. This is the non-gap RF length plus the
1138  * sum of the maxgap vector.
1139  *
1140  * Returns: eslOK on success.
1141  *          eslEMEM on memory allocation failure.
1142  */
1143 int
add_msa(ESL_MSA * mmsa,char * errbuf,ESL_MSA * msa_to_add,int * maxgap,int * maxmis,int clen,int alen_merged)1144 add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged)
1145 {
1146   int status;
1147   int i;                 /* counter over sequences in msa_to_add */
1148   int j;                 /* counter over alignment annotations */
1149   int mi;                /* counter over sequences in mmsa */
1150   void *tmp;             /* for reallocations */
1151   char *tmpstr;          /* used for copying GR annotation */
1152   int  nseq_existing;    /* number of sequences already added to mmsa, by previous calls of this function */
1153   int *ngapA = NULL;     /* [0..alen] number of insert gap columns to add after each alignment column when merging */
1154   int *nmisA = NULL;     /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
1155   int *neitherA = NULL;  /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
1156 
1157   nseq_existing = mmsa->nseq;
1158 
1159   /* determine how many all gap columns to insert after each alignment position
1160    * of the child msa when copying it to the merged msa */
1161   if((status = determine_gap_columns_to_add(msa_to_add, maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
1162     return status;
1163 
1164   /* Append msa_to_add's sequence data and per-sequence annotation to mmsa after adding necessary gaps */
1165   /* sequence names and aligned sequence data (digitized) */
1166   for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1167       esl_strdup(msa_to_add->sqname[i], -1, &(mmsa->sqname[mi]));
1168 
1169       status = inflate_seq_with_gaps(msa_to_add->abc, msa_to_add->aseq[i], msa_to_add->alen, alen_merged, neitherA, '.', &(mmsa->aseq[mi]));
1170       if     (status == eslEMEM)   ESL_XFAIL(status, errbuf, "Out of memory adding sequence %s", mmsa->sqname[mi]);
1171       else if(status != eslOK)     ESL_XFAIL(status, errbuf, "Found internal missing data symbols in seq: %s", mmsa->sqname[mi]);
1172       free(msa_to_add->aseq[i]); /* free immediately */
1173       msa_to_add->aseq[i] = NULL;
1174   }
1175 
1176   /* parsed annotation that is optional */
1177   /* sqacc */
1178   if(msa_to_add->sqacc != NULL) {
1179     if(mmsa->sqacc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1180       ESL_ALLOC(mmsa->sqacc, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1181       for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqacc[mi] = NULL; }
1182     }
1183     else { /* reallocate; to add space for new seqs */
1184       ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1185     }
1186     for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1187       if(msa_to_add->sqacc[i] != NULL) {
1188 	if((status = esl_strdup(msa_to_add->sqacc[i], -1, &(mmsa->sqacc[mi]))) != eslOK)
1189 	  ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence accession number %d.\n", i+1);
1190 	free(msa_to_add->sqacc[i]); /* free immediately */
1191 	msa_to_add->sqacc[i] = NULL;
1192       }
1193       else {
1194 	mmsa->sqacc[mi] = NULL;
1195       }
1196     }
1197   } /* end of if(msa_to_add->sqacc != NULL */
1198   else if(mmsa->sqacc != NULL) {
1199     /* msa_to_add->sqacc == NULL, but mmsa->sqacc != NULL, reallocate mmsa->sqacc, and set new ones to NULL */
1200     ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1201     for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1202       mmsa->sqacc[mi] = NULL;
1203     }
1204   }
1205 
1206   /* sqdesc */
1207   if(msa_to_add->sqdesc != NULL) {
1208     if(mmsa->sqdesc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1209       ESL_ALLOC(mmsa->sqdesc, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1210       for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqdesc[mi] = NULL; }
1211     }
1212     else { /* reallocate; to add space for new seqs */
1213       ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1214     }
1215     for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1216       if(msa_to_add->sqdesc[i] != NULL) {
1217 	if((status = esl_strdup(msa_to_add->sqdesc[i], -1, &(mmsa->sqdesc[mi]))) != eslOK)
1218 	  ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence description number %d.\n", i+1);
1219 	free(msa_to_add->sqdesc[i]); /* free immediately */
1220 	msa_to_add->sqdesc[i] = NULL;
1221       }
1222       else {
1223 	mmsa->sqdesc[mi] = NULL;
1224       }
1225     }
1226   } /* end of if(msa_to_add->sqdesc != NULL */
1227   else if(mmsa->sqdesc != NULL) {
1228     /* msa_to_add->sqdesc == NULL, but mmsa->sqdesc != NULL, reallocate mmsa->sqdesc, and set new ones to NULL */
1229     ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1230     for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1231       mmsa->sqdesc[mi] = NULL;
1232     }
1233   }
1234 
1235   /* per-seq posterior probabilities */
1236   if(msa_to_add->pp != NULL) {
1237     if(mmsa->pp == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1238       ESL_ALLOC(mmsa->pp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1239       for(mi = 0; mi < nseq_existing; mi++) { mmsa->pp[mi] = NULL; }
1240     }
1241     else { /* reallocate; to add space for new seqs */
1242       ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1243     }
1244     for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1245       if(msa_to_add->pp[i] != NULL) {
1246 	if((status = inflate_string_with_gaps_and_missing(msa_to_add->pp[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->pp[mi]))) != eslOK)
1247 	  ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d posterior probabilities.\n", i+1);
1248 	free(msa_to_add->pp[i]); /* free immediately */
1249 	msa_to_add->pp[i] = NULL;
1250       }
1251       else {
1252 	mmsa->pp[mi] = NULL;
1253       }
1254     }
1255   } /* end of if(msa_to_add->pp != NULL */
1256   else if(mmsa->pp != NULL) {
1257     /* msa_to_add->pp == NULL, but mmsa->pp != NULL, reallocate mmsa->pp, and set new ones to NULL */
1258     ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1259     for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1260       mmsa->pp[mi] = NULL;
1261     }
1262   }
1263 
1264   /* per-seq secondary structure */
1265   if(msa_to_add->ss != NULL) {
1266     if(mmsa->ss == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1267       ESL_ALLOC(mmsa->ss, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1268       for(mi = 0; mi < nseq_existing; mi++) { mmsa->ss[mi] = NULL; }
1269     }
1270     else { /* reallocate; to add space for new seqs */
1271       ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1272     }
1273     for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1274       if(msa_to_add->ss[i] != NULL) {
1275 	if((status = inflate_string_with_gaps_and_missing(msa_to_add->ss[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->ss[mi]))) != eslOK)
1276 	  ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d secondary structure.\n", i+1);
1277 	free(msa_to_add->ss[i]); /* free immediately */
1278 	msa_to_add->ss[i] = NULL;
1279       }
1280       else {
1281 	mmsa->ss[mi] = NULL;
1282       }
1283     }
1284   } /* end of if(msa_to_add->ss != NULL */
1285   else if(mmsa->ss != NULL) {
1286     /* msa_to_add->ss == NULL, but mmsa->ss != NULL, reallocate mmsa->ss, and set new ones to NULL */
1287     ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1288     for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1289       mmsa->ss[mi] = NULL;
1290     }
1291   }
1292 
1293   /* per-seq surface accessibility */
1294   if(msa_to_add->sa != NULL) {
1295     if(mmsa->sa == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1296       ESL_ALLOC(mmsa->sa, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1297       for(mi = 0; mi < nseq_existing; mi++) { mmsa->sa[mi] = NULL; }
1298     }
1299     else { /* reallocate; to add space for new seqs */
1300       ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1301     }
1302     for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1303       if(msa_to_add->sa[i] != NULL) {
1304 	if((status = inflate_string_with_gaps_and_missing(msa_to_add->sa[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->sa[mi]))) != eslOK)
1305 	  ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d surface accessibility.\n", i+1);
1306 	free(msa_to_add->sa[i]); /* free immediately */
1307 	msa_to_add->sa[i] = NULL;
1308       }
1309       else {
1310 	mmsa->sa[mi] = NULL;
1311       }
1312     }
1313   } /* end of if(msa_to_add->sa != NULL */
1314   else if(mmsa->sa != NULL) {
1315     /* msa_to_add->sa == NULL, but mmsa->sa != NULL, reallocate mmsa->sa, and set new ones to NULL */
1316     ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1317     for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1318       mmsa->sa[mi] = NULL;
1319     }
1320   }
1321 
1322   /* Unparsed per-sequence (GS) annotation */
1323   if(msa_to_add->ngs > 0) {
1324     for(j = 0; j < msa_to_add->ngs; j++) {
1325       for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1326 	if(msa_to_add->gs[j][i] != NULL)
1327 	  if((status =esl_msa_AddGS(mmsa, msa_to_add->gs_tag[j], -1, mi, msa_to_add->gs[j][i], -1)) != eslOK)
1328 	    ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GS annotation.\n", i+1);
1329       }
1330       free(msa_to_add->gs[j][i]); /* free immediately */
1331       msa_to_add->gs[j][i] = NULL;
1332     }
1333   }
1334   /* caller will free the rest of gs via esl_msa_Destroy() */
1335 
1336   /* unparsed per-residue (GR) annotation */
1337   if(msa_to_add->gr != NULL) {
1338     for(j = 0; j < msa_to_add->ngr; j++) {
1339       for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1340 	if(msa_to_add->gr[j][i] != NULL) {
1341 	  if((status = inflate_string_with_gaps_and_missing(msa_to_add->gr[j][i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(tmpstr))) != eslOK)
1342 	    ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1);
1343 	  if((status = esl_msa_AppendGR(mmsa, msa_to_add->gr_tag[j], mi, tmpstr)) != eslOK)
1344 	    ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1);
1345 	  free(tmpstr);
1346 	  free(msa_to_add->gr[j][i]); /* free immediately */
1347 	  msa_to_add->gr[j][i] = NULL;
1348 	}
1349       }
1350     }
1351   }
1352   /* caller will free the rest of gr in esl_msa_Destroy() */
1353 
1354   /* msa_to_add should be destroyed by caller */
1355 
1356   /* update nseq in mmsa */
1357   mmsa->nseq += msa_to_add->nseq;
1358 
1359   if(ngapA    != NULL) free(ngapA);
1360   if(nmisA    != NULL) free(nmisA);
1361   if(neitherA != NULL) free(neitherA);
1362   return eslOK;
1363 
1364  ERROR:
1365   if(ngapA    != NULL) free(ngapA);
1366   if(nmisA    != NULL) free(nmisA);
1367   if(neitherA != NULL) free(neitherA);
1368   return status;
1369 }
1370 
1371 /* inflate_string_with_gaps_and_missing
1372  *
1373  * Given a string, create a new one that is a copy of it,
1374  * but with missing data and gaps added before each position (apos)
1375  * as specified by n{gap,mis}A[0..apos..len]. <gapchar> and <mischar>
1376  * specify the gap character and missing data character. Either
1377  * ngapA or nmisA can be either be NULL, if so they're treated as if
1378  * they're all zeroes (no gaps/missing data will be added anywhere).
1379  *
1380  * n{gap,mis}A[0]       - number of gaps/missing to add before first posn
1381  * n{gap,mis}A[apos]    - number of gaps/missing to add before posn apos
1382  * n{gap,mis}A[src_len] - number of gaps/missing to add after  final posn
1383  *
1384  * By convention, missing data symbols are always put before gap symbols
1385  * when more than 1 of each are to be placed after the some nongap RF position.
1386  *
1387  * ret_str is allocated here.
1388  *
1389  * Returns eslOK on success.
1390  *         eslEMEM on memory error.
1391  */
1392 int
inflate_string_with_gaps_and_missing(char * src_str,int64_t src_len,int64_t dst_len,int * ngapA,char gapchar,int * nmisA,char mischar,char ** ret_dst_str)1393 inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str)
1394 {
1395   int status;
1396   int src_apos = 0;
1397   int dst_apos  = 0;
1398   int i;
1399   char *dst_str;
1400 
1401   ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1));
1402   dst_str[dst_len] = '\0';
1403 
1404   /* add gaps before first position */
1405   if(nmisA) for(i = 0; i < nmisA[0]; i++) dst_str[dst_apos++] = mischar;
1406   if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = gapchar;
1407 
1408   /* add gaps after every position */
1409   for(src_apos = 0; src_apos < src_len; src_apos++) {
1410     dst_str[dst_apos++] = src_str[src_apos];
1411     if(nmisA) for(i = 0; i < nmisA[(src_apos+1)]; i++) dst_str[dst_apos++] = mischar;
1412     if(ngapA) for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = gapchar;
1413   }
1414 
1415   *ret_dst_str = dst_str;
1416   return eslOK;
1417 
1418  ERROR:
1419   return eslEMEM;
1420 }
1421 
1422 
1423 /* inflate_seq_with_gaps()
1424  *
1425  * Given an aligned sequence, create a new one that is a copy of it,
1426  * but with gaps added before each position (apos) as specified by
1427  * ngapA[0..apos..len]. <gapchar> specifies the gap character.
1428  *
1429  * This function follows the HMMER3 convention of using '~' to mark
1430  * fragment sequences. If a sequence is a fragment, the first and final
1431  * string of gaps (contiguous gaps) will be marked as '~'. That is,
1432  * the 1st column will necessarily be a '~' and the last will be a '~'.
1433  * This function takes care to add '~' as gap characters in fragments,
1434  * where appropriate, to follow HMMER3 convention.
1435  *
1436  * This function is importantly different from inflate_string_with_gaps_and_missing()
1437  * in that it does not allow internal '~' missing data characters to
1438  * be added (which are dictated by nmisA[] in inflate_string_with_gaps_and_missing())
1439  * Those '~' are possible in Infernal output alignments only in
1440  * #=GC RF and #=GC SS_cons alignment.
1441  *
1442  * It only adds '~' as necessary to the beginning and ends of fragments.
1443  *
1444  * ngapA[0]       - number of gaps/missing to add before first posn
1445  * ngapA[apos]    - number of gaps/missing to add before posn apos
1446  * ngapA[src_len] - number of gaps/missing to add after  final posn
1447  *
1448  * ret_str is allocated here.
1449  *
1450  * Returns eslOK on success.
1451  *         eslEMEM on memory error.
1452  *         eslEINVAL if sequence (src_str) has an internal missing
1453  *           data symbol that violates HMMER3 convention.
1454  */
1455 int
inflate_seq_with_gaps(const ESL_ALPHABET * abc,char * src_str,int64_t src_len,int64_t dst_len,int * ngapA,char gapchar,char ** ret_dst_str)1456 inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str)
1457 {
1458   int status;
1459   int src_apos = 0;
1460   int dst_apos  = 0;
1461   int src_flpos = 0; /* position of rightmost '~' in contiguous string that begins at position 0 */
1462   int src_frpos = 0; /* position of leftmost  '~' in contiguous string that ends   at position src_len-1 */
1463   int i;
1464   char *dst_str;
1465   char char2add;
1466   int  i_am_fragment = FALSE;
1467 
1468   ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1));
1469   dst_str[dst_len] = '\0';
1470 
1471   /* determine if sequence is a fragment, and determine src_flpos and src_frpos */
1472   src_flpos = 0;
1473   while(src_flpos < src_len && esl_abc_CIsMissing(abc, src_str[src_flpos])) { src_flpos++; }
1474   src_flpos--; /* we overshot by 1 */
1475   /* src_flpos is now rightmost '~' in stretch of '~' that begin at position 0, so
1476    * it's -1 if sequence is not a fragment (there are no leading '~' in src_str)
1477    */
1478 
1479   src_frpos = src_len-1;
1480   while(src_frpos > -1 && esl_abc_CIsMissing(abc, src_str[src_frpos])) { src_frpos--; }
1481   src_frpos++; /* we overshot by 1 */
1482   /* src_frpos is now leftmost '~' in stretch of '~' that end at position src_len-1, so
1483    * it's src_len if sequence is not a fragment (there are no trailing '~' in src_str)
1484    */
1485   i_am_fragment = (src_flpos != -1 || src_frpos != src_len) ? TRUE : FALSE;
1486 
1487   /* Now verify that we don't have any internal '~' characters */
1488   for(src_apos = src_flpos+1; src_apos <= src_frpos-1; src_apos++) {
1489     if(esl_abc_CIsMissing(abc, src_str[src_apos])) return eslEINVAL;
1490   }
1491 
1492   /* add gaps before first position */
1493   src_apos = 0;
1494   if(i_am_fragment) {
1495     char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar;
1496   }
1497   else {
1498     char2add = gapchar;
1499   }
1500   if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = char2add;
1501 
1502   /* add gaps or missing character after every position */
1503   for(src_apos = 0; src_apos < src_len; src_apos++) {
1504     dst_str[dst_apos++] = src_str[src_apos];
1505     if(i_am_fragment) {
1506       char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar;
1507     }
1508     else {
1509       char2add = gapchar;
1510     }
1511     for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = char2add;
1512   }
1513 
1514   *ret_dst_str = dst_str;
1515   return eslOK;
1516 
1517  ERROR:
1518   return eslEMEM;
1519 }
1520 
1521 
1522 /* validate_no_nongaps_in_rf_gaps
1523  *
1524  * Given an RF string with gaps defined as by alphabet <abc>
1525  * and another string of same length. Make sure none of the
1526  * positions that are gaps in the RF string are non-gaps in the
1527  * other string. Return TRUE if none are. Return FALSE if at
1528  * least one is.
1529  *
1530  * Returns TRUE if 0 characters in <other_str> in same position
1531  *         as a gap in <rf_str> are non-gaps. FALSE otherwise.
1532  */
1533 int
validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET * abc,char * rf_str,char * other_str,int64_t len)1534 validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len)
1535 {
1536   int64_t i;
1537   for(i = 0; i < len; i++) {
1538     if((! rfchar_is_nongap_nonmissing(abc, rf_str[i])) && (rfchar_is_nongap_nonmissing(abc, other_str[i]))) return FALSE;
1539   }
1540   return TRUE;
1541 }
1542 
1543 /* determine_gap_columns_to_add
1544  * (stolen and slightly modified from Infernal 1.1rc1's cmalign.c)
1545  *
1546  * Given <maxgap> and <maxmis>, two arrays of the number of gap RF
1547  * positions and '~' RF (missing data) after each non-gap RF
1548  * (consensus) position in the eventual final merged alignment,
1549  * calculate how many inserts and missing data inserts
1550  * we need to add at each position of <msa> to expand it out to the
1551  * appropriate size of the eventual merged alignment.
1552  *
1553  * max{gap,mis}[0]      is number of gaps/missing before 1st cpos in merged aln
1554  * max{gap,mis}[cpos]   is number of gaps/missing after cpos in merged aln
1555  *                             for cpos = 1..clen
1556  * clen is the number of non-gap RF positions in msa (and in eventual merged msa).
1557  *
1558  * We allocate fill and return ret_ngapA[0..msa->alen], ret_nmisA[0..msa->alen],
1559  * and ret_neitherA[0..msa->alen] here.
1560  *
1561  * ret_n{gap,mis}gapA[0]      is number of gaps/missing to add before 1st position of msa
1562  * ret_n{gap,mis}gapA[apos]   is number of gaps/missing to add after alignment position apos
1563  *                             for apos = 1..msa->alen
1564  *
1565  * ret_neitherA[apos] = ngapA[apos] + nmisA[apos]
1566  *
1567  * This is similar to the esl_msa.c helper function of the same name,
1568  * but that function does not bother with missing data '~'.
1569  *
1570  * Returns eslOK on success.
1571  *         eslEMEM on memory alloaction error
1572  *         eslEINVAL if gaps occur before missing data b/t any two nongap RF posns
1573  *         eslERANGE if a value exceeds what we expected (based on earlier
1574  *                   checks before this function was entered).
1575  *         if !eslOK, errbuf if filled.
1576  */
1577 int
determine_gap_columns_to_add(ESL_MSA * msa,int * maxgap,int * maxmis,int clen,int ** ret_ngapA,int ** ret_nmisA,int ** ret_neitherA,char * errbuf)1578 determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf)
1579 {
1580   int status;
1581   int apos;
1582   int prv_cpos = 0;  /* alignment position corresponding to consensus position cpos-1 */
1583   int cpos = 0;
1584   int ngap = 0;
1585   int nmis = 0;
1586   int *ngapA    = NULL;
1587   int *nmisA    = NULL;
1588   int *neitherA = NULL;
1589 
1590   /* contract check */
1591   if(maxmis[0] != 0) ESL_FAIL(eslEINVAL, errbuf, "missing characters exist prior to first cpos, this shouldn't happen.\n");
1592 
1593   ESL_ALLOC(ngapA,    sizeof(int) * (msa->alen+1));
1594   ESL_ALLOC(nmisA,    sizeof(int) * (msa->alen+1));
1595   ESL_ALLOC(neitherA, sizeof(int) * (msa->alen+1));
1596   esl_vec_ISet(ngapA,    (msa->alen+1), 0);
1597   esl_vec_ISet(nmisA,    (msa->alen+1), 0);
1598   esl_vec_ISet(neitherA, (msa->alen+1), 0);
1599 
1600   for(apos = 0; apos < msa->alen; apos++) {
1601     if(esl_abc_CIsMissing(msa->abc, msa->rf[apos])) {
1602       nmis++;
1603       if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap);
1604     }
1605     else if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) {
1606       ngap++;
1607     }
1608     else { /* a consensus position */
1609       /* a few sanity checks */
1610       if(ngap  > maxgap[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d inserts before cpos %d greater than max expected (%d).\n", ngap, cpos, maxgap[cpos]);
1611       if(nmis  > maxmis[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d EL inserts before cpos %d greater than max expected (%d).\n", nmis, cpos, maxmis[cpos]);
1612 
1613       if (cpos == 0) {
1614 	/* gaps and/or missing data before first consensus position: flush right */
1615 	if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */
1616 	  ngapA[prv_cpos]  = maxgap[cpos] - ngap; /* flush right gaps (no missing) */
1617 	}
1618 	else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1619 	  nmisA[prv_cpos]  = maxmis[cpos] - nmis; /* flush right missing (no gaps) */
1620 	}
1621 	else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) {
1622 	  /* missing data (ELs) is always 5' of gaps */
1623 	  nmisA[prv_cpos]      = maxmis[cpos] - nmis; /* flush right */
1624 	  ngapA[prv_cpos+nmis] = maxgap[cpos] - ngap; /* flush right, after missing */
1625 	}
1626       }
1627       else {
1628 	/* gaps and/missing data after interior consensus position (i.e. not before 1st cpos or after last)
1629 	 * Determine where to place inserts and/or missing data.
1630 	 * Handle each of 4 possibilities separately, note that if
1631 	 * maxgap[cpos] == 0 then ngap == 0, and if maxmis[cpos] == 0 then nmis == 0 (see sanity check above).
1632 	 */
1633 	if(maxgap[cpos] >  0 && maxmis[cpos] == 0) { /* most common case */
1634 	  ngapA[prv_cpos + 1 + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */
1635 	}
1636 	else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1637 	  nmisA[prv_cpos + 1 + (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */
1638 	}
1639 	else if(maxgap[cpos] >  0 && maxmis[cpos] > 0) {
1640 	  /* missing data is always 5' of gaps */
1641 	  nmisA[prv_cpos + 1 +        (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */
1642 	  ngapA[prv_cpos + 1 + nmis + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */
1643 	}
1644 	/* final case is if (maxgap[cpos] == 0 && maxmis[cpos] == 0)
1645 	 * in this case we do nothing.
1646 	 */
1647       }
1648       cpos++;
1649       prv_cpos = apos;
1650       ngap = 0;
1651       nmis = 0;
1652     }
1653   }
1654   /* deal with gaps and missing data after final consensus position */
1655 
1656   /* first, validate that clen is what it should be */
1657   if(cpos != clen) {
1658     if(ngapA != NULL) free(ngapA);
1659     if(nmisA != NULL) free(nmisA);
1660     if(neitherA != NULL) free(neitherA);
1661     ESL_FAIL(eslEINCONCEIVABLE, errbuf, "consensus length (%d) is not the expected length (%d).", cpos, clen);
1662   }
1663 
1664   if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */
1665     ngapA[prv_cpos + 1 + ngap] = maxgap[cpos] - ngap; /* flush left gaps (no missing) */
1666   }
1667   else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1668     nmisA[prv_cpos + 1 + nmis] = maxmis[cpos] - nmis; /* flush left missing (no gaps) */
1669   }
1670   else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) {
1671     /* missing data (ELs) is always 5' of gaps */
1672     nmisA[prv_cpos + 1 + nmis]        = maxmis[cpos] - nmis; /* flush left */
1673     ngapA[prv_cpos + 1 + nmis + ngap] = maxgap[cpos] - ngap; /* flush left, after missing */
1674   }
1675 
1676   /* determine neitherA[], the number of gaps due to either inserts or missing data after each apos */
1677   for(apos = 0; apos <= msa->alen; apos++) {
1678     neitherA[apos] = ngapA[apos] + nmisA[apos];
1679   }
1680 
1681   *ret_ngapA  = ngapA;
1682   *ret_nmisA = nmisA;
1683   *ret_neitherA = neitherA;
1684 
1685   return eslOK;
1686 
1687  ERROR:
1688   if(ngapA    != NULL) free(ngapA);
1689   if(nmisA    != NULL) free(nmisA);
1690   if(neitherA != NULL) free(neitherA);
1691   ESL_FAIL(status, errbuf, "Memory allocation error.");
1692   return status; /*NEVERREACHED*/
1693 }
1694 
1695 /* write_pfam_msa_top
1696  *
1697  * Highly specialized function for printing out the 'top' of a
1698  * Stockholm alignment file, i.e. the Stockholm header line, comments
1699  * and GF annotation to an msa file. This function is necessary when
1700  * printing the merged alignment file in small memory mode (when
1701  * --small enabled). <msa> is an alignment with no sequence
1702  * information (no aseq, ax, GS, or GR data).
1703  */
1704 void
write_pfam_msa_top(FILE * fp,ESL_MSA * msa)1705 write_pfam_msa_top(FILE *fp, ESL_MSA *msa)
1706 {
1707   int i, maxgf;
1708 
1709   /* rest of code in this function was stolen verbatim from actually_write_stockholm in esl_msa.c */
1710 
1711   /* Magic Stockholm header
1712    */
1713   fprintf(fp, "# STOCKHOLM 1.0\n");
1714 
1715   /* Free text comments
1716    */
1717   for (i = 0;  i < msa->ncomment; i++)
1718     fprintf(fp, "# %s\n", msa->comment[i]);
1719   if (msa->ncomment > 0) fprintf(fp, "\n");
1720 
1721   maxgf = maxwidth(msa->gf_tag, msa->ngf);
1722   if (maxgf < 2) maxgf = 2;
1723 
1724   /* GF section: per-file annotation
1725    */
1726   if (msa->name != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "ID", msa->name);
1727   if (msa->acc  != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AC", msa->acc);
1728   if (msa->desc != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "DE", msa->desc);
1729   if (msa->au   != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AU", msa->au);
1730 
1731   /* Thresholds are hacky. Pfam has two. Rfam has one.
1732    */
1733   if      (msa->cutset[eslMSA_GA1] && msa->cutset[eslMSA_GA2])
1734     fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1735 	    maxgf, "GA", msa->cutoff[eslMSA_GA1], msa->cutoff[eslMSA_GA2]);
1736   else if (msa->cutset[eslMSA_GA1])
1737     fprintf(fp, "#=GF %-*s %.1f\n",
1738 	    maxgf, "GA", msa->cutoff[eslMSA_GA1]);
1739 
1740   if      (msa->cutset[eslMSA_NC1] && msa->cutset[eslMSA_NC2])
1741     fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1742 	    maxgf, "NC", msa->cutoff[eslMSA_NC1], msa->cutoff[eslMSA_NC2]);
1743   else if (msa->cutset[eslMSA_NC1])
1744     fprintf(fp, "#=GF %-*s %.1f\n",
1745 	    maxgf, "NC", msa->cutoff[eslMSA_NC1]);
1746 
1747   if      (msa->cutset[eslMSA_TC1] && msa->cutset[eslMSA_TC2])
1748     fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1749 	    maxgf, "TC", msa->cutoff[eslMSA_TC1], msa->cutoff[eslMSA_TC2]);
1750   else if (msa->cutset[eslMSA_TC1])
1751     fprintf(fp, "#=GF %-*s %.1f\n",
1752 	    maxgf, "TC", msa->cutoff[eslMSA_TC1]);
1753 
1754   for (i = 0; i < msa->ngf; i++)
1755     fprintf(fp, "#=GF %-*s %s\n", maxgf, msa->gf_tag[i], msa->gf[i]);
1756   fprintf(fp, "\n");
1757 
1758   return;
1759 }
1760 
1761 /* write_pfam_msa_gc
1762  *
1763  * Highly specialized function for printing out the GC annotation to a
1764  * Pfam Stockholm alignment file (1 line/seq). This function is
1765  * necessary when printing the merged alignment file in small memory
1766  * mode (when --small enabled). <msa> is an alignment with no sequence
1767  * information (no aseq, ax, GS, nor GR data).
1768  */
1769 void
write_pfam_msa_gc(FILE * fp,ESL_MSA * msa,int margin)1770 write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int margin)
1771 {
1772   int i;
1773   if (msa->ss_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SS_cons", msa->ss_cons); }
1774   if (msa->sa_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SA_cons", msa->sa_cons); }
1775   if (msa->pp_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "PP_cons", msa->pp_cons); }
1776   if (msa->rf != NULL)      { fprintf(fp, "#=GC %-*s %s\n", margin-6, "RF", msa->rf); }
1777   for (i = 0; i < msa->ngc; i++) { fprintf(fp, "#=GC %-*s %s\n", margin-6, msa->gc_tag[i], msa->gc[i]); }
1778   fprintf(fp, "//\n");
1779 }
1780 
1781 /* maxwidth()
1782  * Return the length of the longest string in
1783  * an array of strings.
1784  */
1785 static int64_t
maxwidth(char ** s,int n)1786 maxwidth(char **s, int n)
1787 {
1788   int64_t max,len;
1789   int     i;
1790 
1791   max = 0;
1792   for (i = 0; i < n; i++)
1793     if (s[i] != NULL)
1794       {
1795 	len = strlen(s[i]);
1796 	if (len > max) max = len;
1797       }
1798   return max;
1799 }
1800 
1801 /* rfchar_is_nongap_nonmissing()
1802  * Return FALSE if a character from RF annotation is
1803  * a gap or a missing character, else return TRUE.
1804  */
1805 static int
rfchar_is_nongap_nonmissing(const ESL_ALPHABET * abc,char rfchar)1806 rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar) {
1807   if(esl_abc_CIsGap    (abc, rfchar)) return FALSE;
1808   if(esl_abc_CIsMissing(abc, rfchar)) return FALSE;
1809   return TRUE;
1810 }
1811