1 /* Merge alignments into a single alignment based on their reference (RF) annotation.
2 *
3 * Special care is taken to be consistent with missing data '~'
4 * annotation in two different contexts.
5 *
6 * First, from Infernal 1.1rc2 and later cmalign and cmsearch -A
7 * output alignments: '~' can occur in GC RF and SS_cons
8 * annotation. The '~' after a given nongap/nonmissing RF position x
9 * (and before nongap/nonmissing RF position x+1) must all occur 5' of
10 * all inserted positions (gap: '.') that occur between x and x+1.
11 *
12 * Second, from HMMER3, hmmbuild reads and writes '~' in sequences to
13 * mark the ends of fragments. Any fragment must begin and end with a
14 * contiguous string of 1 or more '~'. '~' characters are not allowed
15 * within sequences anywhere except within these leading and trailing
16 * contiguous stretches.
17 *
18 * esl-alimerge will accept as input and produce as output alignments
19 * that are consistent with both of these conventions.
20 *
21 * EPN, Fri Nov 20 16:28:59 2009
22 */
23 #include "esl_config.h"
24
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <string.h>
28
29 #include "easel.h"
30 #include "esl_alphabet.h"
31 #include "esl_distance.h"
32 #include "esl_fileparser.h"
33 #include "esl_getopts.h"
34 #include "esl_msa.h"
35 #include "esl_msafile.h"
36 #include "esl_msafile2.h"
37 #include "esl_sqio.h"
38 #include "esl_stopwatch.h"
39 #include "esl_vectorops.h"
40
41 static char banner[] = "merge alignments based on their reference (RF) annotation";
42 static char usage1[] = "[options] <alignment file 1> <alignment file 2>";
43 static char usage2[] = "[options] --list <file listing n > 1 ali files to merge>\n\
44 \n\
45 Input alignments must be in Stockholm or Pfam format.\n\
46 Ouput format choices\n\
47 --------------------\n\
48 stockholm [default]\n\
49 pfam\n\
50 a2m\n\
51 psiblast\n\
52 afa";
53
54 static void read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile);
55 static int update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis);
56 static int validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int clen, int nmsa, int alen_merged, int *maxgap, int *maxmis, char *errbuf);
57 static int add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged);
58 static int inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str);
59 static int inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str);
60 static int validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len);
61 static int determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf);
62 static void write_pfam_msa_top(FILE *fp, ESL_MSA *msa);
63 static void write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int maxwidth);
64 static int64_t maxwidth(char **s, int n);
65 static int rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar);
66
67 static ESL_OPTIONS options[] = {
68 /* name type default env range togs reqs incomp help docgroup */
69 { "--list", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "command-line argument is a file that lists ali files to merge", 99 },
70 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "help; show brief info on version and usage", 1 },
71 { "-o", eslARG_OUTFILE, NULL, NULL, NULL, NULL,NULL, NULL, "output the final alignment to file <f>, not stdout", 1 },
72 { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL,"-o", NULL, "print info on merge to stdout; requires -o", 1 },
73 { "--small", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "use minimal RAM (RAM usage will be independent of aln sizes)", 1 },
74 { "--rfonly", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "remove all columns that are gaps in GC RF annotation", 1 },
75 { "--informat", eslARG_STRING, FALSE, NULL, NULL, NULL,NULL, NULL, "NOT YET DISPLAYED", 99 },
76 { "--outformat",eslARG_STRING, FALSE, NULL, NULL, NULL,NULL, NULL, "specify that output aln be format <s> (see choices above)", 1 },
77 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--amino,--rna", "alignments to merge are RNA alignments", 1 },
78 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--amino,--rna", "alignments to merge are DNA alignments", 1 },
79 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--dna,--rna", "alignments to merge are protein alignments", 1 },
80 { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "arrest after start: for debugging under gdb", 99 },
81 { 0,0,0,0,0,0,0,0,0,0 },
82 };
83
84 int
main(int argc,char ** argv)85 main(int argc, char **argv)
86 {
87 int status; /* easel return code */
88 ESL_GETOPTS *go = NULL; /* application configuration */
89 ESL_ALPHABET *abc = NULL; /* biological alphabet */
90 char *msafile1 = NULL; /* msa file 1 (stays NULL if --list) */
91 char *msafile2 = NULL; /* msa file 2 (stays NULL if --list) */
92 char *listfile = NULL; /* list file name (stays NULL unless --list) */
93 int infmt = eslMSAFILE_UNKNOWN; /* format code for input alifiles */
94 int outfmt = eslMSAFILE_UNKNOWN; /* format code for output ali */
95 ESL_MSAFILE *afp = NULL; /* open alignment file, normal interface */
96 ESL_MSAFILE2 *afp2 = NULL; /* open alignment file, small-mem interface */
97 FILE *ofp; /* output file (default is stdout) */
98 char **alifile_list = NULL; /* list of alignment files to merge */
99 int nalifile; /* size of alifile_list */
100 int do_stall; /* used to stall when debugging */
101 int fi; /* counter over alignment files */
102 int ai, ai2; /* counters over alignments */
103 int nali_tot; /* number of alignments in all files */
104 int *nali_per_file = NULL; /* [0..nalifile-1] number of alignments per file */
105 int nseq_tot; /* number of sequences in all alignments */
106 int nseq_cur; /* number of sequences in current alignment */
107 int64_t alen_cur; /* length of current alignment */
108 int64_t *alenA = NULL; /* [0..nali_tot-1] alignment length of input msas (even after
109 * potentially removingeinserts (--rfonly)) */
110 ESL_MSA **msaA = NULL; /* [0..nali_tot-1] all msas read from all files */
111 int *maxgap = NULL; /* [0..cpos..cm->clen+1] max number of gap columns
112 * before each consensus position in all alignments */
113 int *maxmis = NULL; /* [0..cpos..cm->clen+1] max number of missing data columns
114 * before each consensus position in all alignments */
115 int nalloc = 0; /* current size of msaA */
116 int chunksize = 10; /* size to increase nalloc by when realloc'ing */
117 void *tmp; /* for ESL_RALLOC() */
118 int clen; /* consensus length (non-gap #=GC RF length) of all alignments */
119 int cur_clen; /* consensus length (non-gap #=GC RF length) of current alignments */
120 int apos; /* alignment position */
121 ESL_MSA *mmsa = NULL; /* the merged alignment created by merging all alignments in msaA */
122 int alen_mmsa; /* number of columns in merged MSA */
123 char errbuf[eslERRBUFSIZE]; /* buffer for error messages */
124 char *tmpstr; /* used if -v, for printing file names */
125 int **usemeA = NULL; /* [0..nali_tot-1][0..alen] used only if --rfonly enabled, for removing gap RF columns */
126 ESL_STOPWATCH *w = NULL; /* for timing the merge, only used if -o enabled */
127 int do_small; /* TRUE if --small, operate in special small memory mode, aln seq data is not stored */
128 int do_rfonly; /* TRUE if --rfonly, output only non-gap RF columns (remove all insert columns) */
129 int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */
130 int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
131 int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
132
133 /* output formatting, only relevant if -v */
134 char *namedashes = NULL; /* string of dashes, an underline */
135 int ni; /* counter */
136 int namewidth; /* max width of file name */
137 int tmp_len; /* current width of merged aln */
138
139 /* variables only used in small mode (--small) */
140 int ngs_cur; /* number of GS lines in current alignment (only used if do_small) */
141 int gs_exists = FALSE; /* set to TRUE if do_small and any input aln has >= 1 GS line */
142 int maxname, maxgf, maxgc, maxgr; /* max length of seqname, GF tag, GC tag, GR tag in all input alignments */
143 int maxname_cur, maxgf_cur, maxgc_cur, maxgr_cur; /* max length of seqname, GF tag, GC tag, GR tag in current input alignment */
144 int margin; /* total margin length for output msa */
145
146 /***********************************************
147 * Parse command line
148 ***********************************************/
149
150 go = esl_getopts_Create(options);
151 if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
152 esl_opt_VerifyConfig(go) != eslOK)
153 {
154 printf("Failed to parse command line: %s\n", go->errbuf);
155 esl_usage(stdout, argv[0], usage1);
156 esl_usage(stdout, argv[0], usage2);
157 printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
158 exit(1);
159 }
160
161 if (esl_opt_GetBoolean(go, "-h") )
162 {
163 esl_banner(stdout, argv[0], banner);
164 esl_usage (stdout, argv[0], usage1);
165 esl_usage (stdout, argv[0], usage2);
166 puts("\n where options are:");
167 esl_opt_DisplayHelp(stdout, go, 1, 2, 80);
168 exit(0);
169 }
170
171 if(((! esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 2)) ||
172 (( esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 1)))
173 {
174 printf("Incorrect number of command line arguments.\n");
175 esl_usage(stdout, argv[0], usage1);
176 esl_usage(stdout, argv[0], usage2);
177 printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
178 exit(1);
179 }
180
181 if(esl_opt_GetBoolean(go, "--list")) {
182 listfile = esl_opt_GetArg(go, 1);
183 }
184 else {
185 msafile1 = esl_opt_GetArg(go, 1);
186 msafile2 = esl_opt_GetArg(go, 2);
187 }
188
189 do_small = (esl_opt_IsOn(go, "--small")) ? TRUE : FALSE;
190 do_rfonly = (esl_opt_IsOn(go, "--rfonly")) ? TRUE : FALSE;
191
192 /* open output file */
193 if (esl_opt_GetString(go, "-o") != NULL) {
194 if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL)
195 esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o"));
196 } else ofp = stdout;
197
198 if (esl_opt_IsOn(go, "--informat")) {
199 infmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--informat"));
200 if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
201 if (do_small && infmt != eslMSAFILE_PFAM) esl_fatal("small memory mode requires Pfam formatted alignments");
202 }
203 if (esl_opt_IsOn(go, "--outformat")) {
204 outfmt = esl_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat"));
205 if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --outformat", esl_opt_GetString(go, "--outformat"));
206 if (do_small && outfmt != eslMSAFILE_PFAM) esl_fatal("we can only output Pfam formatted alignments in small memory mode");
207 }
208 else outfmt = eslMSAFILE_STOCKHOLM;
209
210 if (do_small) {
211 infmt = eslMSAFILE_PFAM; /* this must be true, else we can't do small memory mode */
212 outfmt = eslMSAFILE_PFAM;
213 }
214
215 do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */
216 while (do_stall);
217
218 /* determine file names to merge */
219 if(listfile != NULL) { /* read list file */
220 read_list_file(listfile, &alifile_list, &nalifile);
221 if(nalifile == 0) esl_fatal("Failed to read a single alignment file name from %s\n", listfile);
222 }
223 else { /* we're merging two alignment files from command-line */
224 nalifile = 2;
225 ESL_ALLOC(alifile_list, sizeof(char *) * nalifile);
226 if((status = esl_strdup(msafile1, -1, &(alifile_list[0]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile1, status);
227 if((status = esl_strdup(msafile2, -1, &(alifile_list[1]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile2, status);
228 }
229
230 /* create and start stopwatch */
231 if(ofp != stdout) {
232 w = esl_stopwatch_Create();
233 esl_stopwatch_Start(w);
234 }
235
236 if(esl_opt_GetBoolean(go, "-v")) {
237 /* determine the longest file name in alifile_list */
238 namewidth = 9; /* length of 'file name' */
239 for(fi = 0; fi < nalifile; fi++) {
240 if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error.");
241 namewidth = ESL_MAX(namewidth, strlen(tmpstr));
242 free(tmpstr);
243 }
244
245 ESL_ALLOC(namedashes, sizeof(char) * (namewidth+1));
246 namedashes[namewidth] = '\0';
247 for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-';
248 fprintf(stdout, "# Reading %d alignment files...\n", nalifile);
249 fprintf(stdout, "#\n");
250 fprintf(stdout, "# %7s %-*s %7s %9s %9s %13s %8s\n", "", namewidth,"", "", "", "", "", "ncols");
251 fprintf(stdout, "# %7s %-*s %7s %9s %9s %13s %8s\n", "file #", namewidth,"file name", "ali #", "#seq/ali", "ncols/ali", "# seq total", "required");
252 fprintf(stdout, "# %7s %*s %7s %9s %9s %13s %8s\n", "-------", namewidth, namedashes, "-------", "---------", "---------", "-------------", "--------");
253 }
254
255 /* Allocate and initialize */
256 nalloc = chunksize;
257 ESL_ALLOC(msaA, sizeof(ESL_MSA *) * nalloc);
258 ESL_ALLOC(alenA, sizeof(int64_t) * nalloc);
259 if(do_rfonly) ESL_ALLOC(usemeA, sizeof(int *) * nalloc);
260
261 /* abc handling is weird. We only use alphabet to define gap characters in this miniapp.
262 * msa's are actually read in text mode. Thus eslRNA suffices for anything.
263 */
264 if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO);
265 else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA);
266 else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA);
267 else if (do_small) abc = esl_alphabet_Create(eslRNA); /* alphabet is only used to define gap characters, so (in this miniapp) we're okay specifying RNA for any alignment (even non-RNA ones) */
268 else abc = esl_alphabet_Create(eslRNA); /* ditto */
269
270 /****************************************************************
271 * Read alignments one at a time, storing them all, separately *
272 ****************************************************************/
273
274 ai = 0;
275 nseq_tot = 0;
276 maxname = maxgf = maxgc = maxgr = 0;
277 ESL_ALLOC(nali_per_file, sizeof(int) * nalifile);
278 esl_vec_ISet(nali_per_file, nalifile, 0);
279
280 for (fi = 0; fi < nalifile; fi++) {
281
282 if (do_small)
283 {
284 status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2);
285 if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
286 else if (status != eslOK) esl_fatal("Alignment file %s open failed with error %d\n", alifile_list[fi], status);
287 }
288 else
289 {
290 status = esl_msafile_Open(NULL, alifile_list[fi], NULL, infmt, NULL, &afp);
291 if (status != eslOK) esl_msafile_OpenFailure(afp, status);
292 }
293
294
295 /* while loop: while we have an alignment in current alignment file, (statement looks weird b/c we use a different function if --small) */
296 while((status = (do_small) ?
297 esl_msafile2_ReadInfoPfam(afp2, NULL, NULL, -1, NULL,NULL, &(msaA[ai]), &nseq_cur, &alen_cur, &ngs_cur, &maxname_cur, &maxgf_cur, &maxgc_cur, &maxgr_cur, NULL, NULL, NULL, NULL, NULL) :
298 esl_msafile_Read (afp, &(msaA[ai]))) == eslOK) {
299
300 if(msaA[ai]->rf == NULL) esl_fatal("Error, all alignments must have #=GC RF annotation; alignment %d of file %d does not (%s)\n", nali_per_file[fi], (fi+1), alifile_list[fi]);
301 msaA[ai]->abc = abc; /* msa's are read in text mode, so this is (currently) only used to define gap characters, it doesn't even have to be the correct alphabet. if --small, this is set as RNA regardless of input */
302
303 if (do_small) {
304 maxname = ESL_MAX(maxname, maxname_cur);
305 maxgf = ESL_MAX(maxgf, maxgf_cur);
306 maxgc = ESL_MAX(maxgc, maxgc_cur);
307 maxgr = ESL_MAX(maxgr, maxgr_cur);
308 msaA[ai]->alen = alen_cur;
309 if(ngs_cur > 0) gs_exists = TRUE;
310 }
311 else {
312 nseq_cur = msaA[ai]->nseq;
313 }
314 alenA[ai] = msaA[ai]->alen; /* impt if --small and --rfonly, to remember total width of aln to expect in second pass */
315 nali_per_file[fi]++;
316 nseq_tot += nseq_cur;
317
318 /* reallocate per-alignment data, if nec */
319 if((ai+1) == nalloc) {
320 nalloc += chunksize;
321 ESL_RALLOC(msaA, tmp, sizeof(ESL_MSA *) * nalloc);
322 ESL_RALLOC(alenA, tmp, sizeof(int64_t) * nalloc);
323 for(ai2 = ai+1; ai2 < nalloc; ai2++) { msaA[ai2] = NULL; }
324 if(do_rfonly) {
325 ESL_RALLOC(usemeA, tmp, sizeof(int *) * nalloc);
326 for(ai2 = ai+1; ai2 < nalloc; ai2++) { usemeA[ai2] = NULL; }
327 }
328 }
329
330 /* either store consensus (non-gap RF) length (if first aln), or verify it is what we expect */
331 cur_clen = 0;
332 for(apos = 0; apos < (int) msaA[ai]->alen; apos++) {
333 if(rfchar_is_nongap_nonmissing(msaA[ai]->abc, msaA[ai]->rf[apos])) cur_clen++;
334 }
335 if(ai == 0) { /* first alignment, store clen, allocate maxgap, maxmis */
336 clen = cur_clen;
337 ESL_ALLOC(maxgap, sizeof(int) * (clen+1));
338 esl_vec_ISet(maxgap, (clen+1), 0);
339 ESL_ALLOC(maxmis, sizeof(int) * (clen+1));
340 esl_vec_ISet(maxmis, (clen+1), 0); /* these will all stay 0 unless we see '~' in the alignments */
341 }
342 else if(cur_clen != clen) {
343 esl_fatal("Error, all alignments must have identical non-gap #=GC RF lengths; expected (RF length of first ali read): %d,\nalignment %d of file %d length is %d (%s))\n", clen, nali_per_file[fi], (fi+1), cur_clen, alifile_list[fi]);
344 }
345
346
347 if(do_rfonly) {
348 /* Remove all columns that are gaps/missing data in the RF annotation, we keep an array of usemes,
349 * one per aln, in case of --small, so we know useme upon second pass of alignment files */
350 ESL_ALLOC(usemeA[ai], sizeof(int) * (msaA[ai]->alen));
351 for(apos = 0; apos < msaA[ai]->alen; apos++) { usemeA[ai][apos] = (rfchar_is_nongap_nonmissing(abc, msaA[ai]->rf[apos])) ? TRUE : FALSE; }
352 if((status = esl_msa_ColumnSubset(msaA[ai], errbuf, usemeA[ai])) != eslOK) {
353 esl_fatal("status code: %d removing gap RF columns for msa %d from file %s:\n%s", status, (ai+1), alifile_list[fi], errbuf);
354 }
355 }
356 else { /* --rfonly not enabled, determine max number inserts between each position */
357 /* update_maxgap_and_maxmis checks to make sure the msa follows our rule for missing data and gaps:
358 * for all positions x..y b/t any two adjacent nongap RF positions (x-1 and y+1 are nongap RF positions)
359 * all missing data columns '~' must come before all gap positions ('.', '-', or '_')
360 */
361 if((status = update_maxgap_and_maxmis(msaA[ai], errbuf, clen, msaA[ai]->alen, maxgap, maxmis)) != eslOK) esl_fatal(errbuf);
362 }
363 if(esl_opt_GetBoolean(go, "-v")) {
364 if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error.");
365 tmp_len = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1);
366 fprintf(stdout, " %7d %-*s %7d %9d %9" PRId64 " %13d %8d\n", (fi+1), namewidth, tmpstr, (ai+1), nseq_cur, msaA[ai]->alen, nseq_tot, tmp_len);
367 free(tmpstr);
368 }
369 ai++;
370 } /* end of while esl_msafile_Read() loop */
371
372 if (do_small)
373 {
374 if (status == eslEFORMAT) esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf);
375 else if (status == eslEINVAL) esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf);
376 else if (status != eslEOF) esl_fatal("Alignment file %s, read failed with error code %d\n", alifile_list[fi], status);
377 }
378 else if (status != eslEOF) esl_msafile_ReadFailure(afp, status);
379
380 if(nali_per_file[fi] == 0) esl_fatal("Failed to read any alignments from file %s\n", alifile_list[fi]);
381
382 if (do_small) esl_msafile2_Close(afp2);
383 else esl_msafile_Close(afp);
384 } /* end of for (fi=0; fi < nalifile; fi++) */
385 nali_tot = ai;
386
387 /*********************************************
388 * Merge all alignments into the merged MSA *
389 *********************************************/
390
391 /* We allocate space for all sequences, but leave sequences as NULL (nseq = -1).
392 * If (do_small) we didn't store the sequences on the first pass through the
393 * alignment files, and we'll never allocate space for the sequences in mmsa,
394 * we'll just output them as we reread them on another pass through the
395 * individual alignments. If we read >= 1 GS line in any of the input alignments,
396 * we need to do an additional pass through the files, outputting only GS
397 * data. Then, in a final (3rd) pass we'll output aligned data.
398 *
399 * if (!do_small), we have the sequences in memory, we'll copy these
400 * to the merged alignment, freeing them in the orignal msaA alignments
401 * as we go so we never need to allocate the full mmsa while we still have
402 * the individual msas (in msaA[]) in memory.
403 */
404 mmsa = esl_msa_Create(nseq_tot, -1);
405 alen_mmsa = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1);
406
407 /* Determine what annotation from the input alignments
408 * we will include in the merged MSA.
409 * See comments in header of validate_and_copy_msa_annotation()
410 * for rules on what we include.
411 */
412 if((status = validate_and_copy_msa_annotation(go, outfmt, mmsa, msaA, nali_tot, clen, alen_mmsa, maxgap, maxmis, errbuf)) != eslOK)
413 esl_fatal("Error while checking and copying individual MSA annotation to merged MSA:%s\n", errbuf);
414
415 if (do_small) {
416 /* Small memory mode, do up to 2 more passes through the input alignments:
417 * pass 2 will output only GS data at top of output alignment file (only performed if >= 1 GS line read in input files
418 * pass 3 will output all aligned data at to output alignment file (always performed)
419 */
420
421 /* output header, comments, and #=GF data */
422 write_pfam_msa_top(ofp, mmsa);
423
424 if(ofp != stdout) {
425 if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n"); }
426 fprintf(stdout, "# Outputting merged alignment to file %s ... ", esl_opt_GetString(go, "-o"));
427 fflush(stdout);
428 }
429
430 /* if there was any GS annotation in any of the individual alignments,
431 * do second pass through alignment files, outputting GS annotation as we go. */
432 if(gs_exists) {
433 ai = 0;
434 for(fi = 0; fi < nalifile; fi++) {
435 /* we're in small memory mode... */
436 status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */
437 if (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
438 else if (status == eslEFORMAT) esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]);
439 else if (status != eslOK) esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status);
440
441 for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) {
442 status = esl_msafile2_RegurgitatePfam(afp2, ofp,
443 maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag (irrelevant here) */
444 FALSE, /* regurgitate stockholm header ? */
445 FALSE, /* regurgitate // trailer ? */
446 FALSE, /* regurgitate blank lines */
447 FALSE, /* regurgitate comments */
448 FALSE, /* regurgitate GF ? */
449 TRUE, /* regurgitate GS ? */
450 FALSE, /* regurgitate GC ? */
451 FALSE, /* regurgitate GR ? */
452 FALSE, /* regurgitate aseq ? */
453 NULL, /* regurgitate all seqs, not a subset */
454 NULL, /* regurgitate all seqs, not a subset */
455 NULL, /* we're not keeping a subset of columns */
456 NULL, /* we're not adding all gap columns */
457 alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */
458 '.', /* gap char, irrelevant */
459 NULL, /* don't return num seqs read */
460 NULL); /* don't return num seqs regurgitated */
461 if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]);
462 if(status != eslOK) esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf);
463 ai++;
464 fflush(ofp);
465 }
466 esl_msafile2_Close(afp2);
467 }
468 fprintf(ofp, "\n"); /* a single blank line to separate GS annotation from aligned data */
469 }
470
471 /* do another (either second or third) pass through alignment files, outputting aligned sequence data (and GR) as we go */
472 ai = 0;
473 for(fi = 0; fi < nalifile; fi++) {
474 status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */
475 if (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]);
476 else if (status == eslEFORMAT) esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]);
477 else if (status != eslOK) esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status);
478
479 for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) {
480 /* determine how many all gap columns to insert after each alignment position
481 * of the child msa when copying it to the merged msa */
482 if(! do_rfonly) {
483 if((status = determine_gap_columns_to_add(msaA[ai], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
484 esl_fatal("error determining number of all gap columns to add to alignment %d of file %s", ai2, alifile_list[fi]);
485 }
486 status = esl_msafile2_RegurgitatePfam(afp2, ofp,
487 maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag */
488 FALSE, /* regurgitate stockholm header ? */
489 FALSE, /* regurgitate // trailer ? */
490 FALSE, /* regurgitate blank lines */
491 FALSE, /* regurgitate comments */
492 FALSE, /* regurgitate GF ? */
493 FALSE, /* regurgitate GS ? */
494 FALSE, /* regurgitate GC ? */
495 TRUE, /* regurgitate GR ? */
496 TRUE, /* regurgitate aseq ? */
497 NULL, /* regurgitate all seqs, not a subset */
498 NULL, /* regurgitate all seqs, not a subset */
499 (do_rfonly) ? usemeA[ai] : NULL,
500 (do_rfonly) ? NULL : neitherA,
501 alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */
502 '.',
503 NULL, /* don't return num seqs read */
504 NULL); /* don't return num seqs regurgitated */
505
506 if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]);
507 if(status != eslOK) esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf);
508 free(ngapA);
509 free(nmisA);
510 free(neitherA);
511 esl_msa_Destroy(msaA[ai]);
512 msaA[ai] = NULL;
513 ai++;
514 fflush(ofp);
515 }
516 esl_msafile2_Close(afp2);
517 }
518 /* finally, write GC annotation and '//' line */
519 margin = maxname + 1;
520 if (maxgc > 0 && maxgc+6 > margin) margin = maxgc+6;
521 if (maxgr > 0 && maxname+maxgr+7 > margin) margin = maxname+maxgr+7;
522 write_pfam_msa_gc(ofp, mmsa, margin);
523 } /* end of if(do_small) */
524
525 else { /* ! do_small: for each input alignment in msaA[], add all aligned data to mmsa, then free it */
526 if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n# Merging alignments ... "); fflush(stdout); }
527 for(ai = 0; ai < nali_tot; ai++) {
528 if((status = add_msa(mmsa, errbuf, msaA[ai], maxgap, maxmis, clen, alen_mmsa)) != eslOK)
529 esl_fatal("Error, merging alignment %d of %d:\n%s.", (ai+1), nali_tot, errbuf);
530 esl_msa_Destroy(msaA[ai]); /* note: the aligned sequences will have already been freed in add_msa() */
531 msaA[ai] = NULL;
532 }
533 if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "done.\n#\n"); fflush(stdout); }
534 mmsa->alen = alen_mmsa; /* it was -1, b/c we filled in each seq as we marched through each msaA[] alignment */
535 if(ofp != stdout) { fprintf(stdout, "# Saving alignment to file %s ... ", esl_opt_GetString(go, "-o")); }
536 status = esl_msafile_Write(ofp, mmsa, outfmt);
537 if(status != eslOK) esl_fatal("Error, during alignment output; status code: %d\n", status);
538 }
539 if(ofp != stdout) {
540 fflush(stdout);
541 fclose(ofp);
542 fprintf(stdout, "done\n#\n");
543 fflush(stdout);
544 esl_stopwatch_Stop(w);
545 esl_stopwatch_Display(stdout, w, "# CPU time: ");
546 }
547
548 /* clean up and exit */
549 if(alifile_list != NULL) {
550 for(fi = 0; fi < nalifile; fi++) {
551 if(alifile_list[fi] != NULL) free(alifile_list[fi]);
552 }
553 free(alifile_list);
554 }
555 if(usemeA != NULL) {
556 for(ai = 0; ai < nali_tot; ai++) {
557 free(usemeA[ai]);
558 }
559 free(usemeA);
560 }
561
562 if(nali_per_file != NULL) free(nali_per_file);
563 if(alenA != NULL) free(alenA);
564 if(namedashes != NULL) free(namedashes);
565 if(msaA != NULL) free(msaA);
566 if(maxgap != NULL) free(maxgap);
567 if(maxmis != NULL) free(maxmis);
568 if(mmsa != NULL) esl_msa_Destroy(mmsa);
569 if(abc != NULL) esl_alphabet_Destroy(abc);
570 if(w != NULL) esl_stopwatch_Destroy(w);
571 esl_getopts_Destroy(go);
572 return 0;
573
574 ERROR:
575 esl_fatal("Out of memory. Reformat to Pfam with esl-reformat and try esl-alimerge --small.");
576 return eslEMEM; /*NEVERREACHED*/
577 }
578
579 /* Function: read_list_file
580 * Date: EPN, Fri Nov 20 16:41:32 2009
581 *
582 * Read a file listing alignment files to merge.
583 * Store file names in *ret_alifile_list and return it,
584 * return number of files in ret_nalifile and return it.
585 * Each white-space delimited token is considered a
586 * different alignment name.
587 *
588 * Dies if we encounter an error.
589 *
590 * Returns: void.
591 */
592 void
read_list_file(char * listfile,char *** ret_alifile_list,int * ret_nalifile)593 read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile)
594 {
595 int status;
596 ESL_FILEPARSER *efp;
597 char *tok;
598 int nalloc = 10;
599 int chunksize = 10;
600 char **alifile_list = NULL;
601 int n = 0;
602 void *tmp;
603
604 ESL_ALLOC(alifile_list, sizeof(char *) * nalloc);
605 status = esl_fileparser_Open(listfile, NULL, &efp);
606 if (status == eslENOTFOUND) esl_fatal("List file %s does not exist or is not readable\n", listfile);
607 else if(status == eslEMEM) esl_fatal("Ran out of memory when opening list file %s\n", listfile);
608 else if(status != eslOK) esl_fatal("Error opening list file %s\n", listfile);
609
610 esl_fileparser_SetCommentChar(efp, '#');
611 while((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslEOF) {
612 if(n == nalloc) { nalloc += chunksize; ESL_RALLOC(alifile_list, tmp, sizeof(char *) * nalloc); }
613 if((status = esl_strdup(tok, -1, &(alifile_list[n++]))) != eslOK) {
614 esl_fatal("Error storing alignment file name while reading list file %s, error status: %d\n", listfile, status);
615 }
616 }
617 esl_fileparser_Close(efp);
618 *ret_alifile_list = alifile_list;
619 *ret_nalifile = n;
620
621 return;
622
623 ERROR:
624 esl_fatal("Out of memory.");
625 return; /*NOTREACHED*/
626 }
627
628
629 /* Function: update_maxgap_and_maxmis
630 * Date: EPN, Sun Nov 22 09:40:48 2009
631 * (stolen from Infernal's cmalign.c)
632 *
633 * Update maxgap[] and maxmis[], arrays that keeps track of the max
634 * number of gap columns and missing data columns ('~' gap #=GC RF)
635 * columns before each cpos (consensus (non-gap #=GC RF) column)
636 *
637 * We require that all missing columns between cpos x and x+1 occur before
638 * all gap columns between cpos x and x+1.
639 *
640 * Consensus columns are index [0..cpos..clen].
641 *
642 * max{gap,mis}[0] is number of gap/missing columns before 1st cpos.
643 * max{gap,mis}[clen-1] is number of gap/missing columns before final cpos.
644 * max{gap,mis}[clen] is number of gap/missing columns after final cpos.
645 *
646 * Returns: eslOK on success.
647 * eslEINVAL if msa->rf is NULL, nongap RF length is not clen of
648 * for any two cpos x and x+1, a gap column precedes a
649 * missing data column.
650 *
651 */
652 int
update_maxgap_and_maxmis(ESL_MSA * msa,char * errbuf,int clen,int64_t alen,int * maxgap,int * maxmis)653 update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis)
654 {
655 int apos;
656 int cpos = 0;
657 int ngap = 0;
658 int nmis = 0;
659
660 for(apos = 0; apos < alen; apos++) {
661 if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) {
662 ngap++;
663 }
664 else if (esl_abc_CIsMissing(msa->abc, msa->rf[apos])) {
665 nmis++;
666 if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap);
667 }
668 else {
669 maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap);
670 maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis);
671 cpos++;
672 ngap = 0;
673 nmis = 0;
674 }
675 }
676
677 /* update final value, max{ins,el}[clen+1], the number of inserts
678 * after the final consensus position */
679 maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap);
680 maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis);
681 if(cpos != clen) ESL_FAIL(eslEINVAL, errbuf, "second pass expected clen (%d) not equal to actual clen (%d).\n", clen, cpos);
682
683 return eslOK;
684 }
685
686 /* Function: validate_and_copy_msa_annotation
687 * Date: EPN, Tue Nov 24 05:35:50 2009
688 *
689 * Decide what individual MSA annotation from
690 * the input alignments in msaA[], if any, will be
691 * included in the merged alignment (mmsa) and
692 * add that info to it.
693 *
694 * Rules for what to include in mmsa:
695 *
696 * We include name,desc,acc,author annotation in merged alignment
697 * if it is identical in all msaA[] input alignments.
698 *
699 * We include comments and per-file (GF) annotation if they
700 * are present and identical in all input msaA[] alignments.
701 *
702 * We include per-column (GC) annotation if it is present and
703 * identical *with-respect-to* #=GC RF annotation AND all
704 * the annotation in gap #=GC RF columns in all msaA[] are gaps
705 * ('.'). This also pertains to the following parsed per-column
706 * annotation: ss_cons, sa_cons, pp_cons, and rf. With rf,
707 * de-gapped rf annotation must be identical in all input
708 * alignments period, if it is not we'll die with an error message.
709 *
710 * Per-sequence information and per-residue information is always
711 * included in merged alignment. This is done by add_msa() function.
712 *
713 * Returns: eslOK on success.
714 * eslEINCONCEIVABLE if input/msa is corrupt in some way (example: ngf>0 but gf_tag[0] == NULL)
715 * eslEMEM on memory error
716 * if !eslOK, errbuf is filled before return
717 */
718 int
validate_and_copy_msa_annotation(const ESL_GETOPTS * go,int outfmt,ESL_MSA * mmsa,ESL_MSA ** msaA,int nmsa,int clen,int alen_merged,int * maxgap,int * maxmis,char * errbuf)719 validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int nmsa, int clen, int alen_merged, int *maxgap, int *maxmis, char *errbuf)
720 {
721 int status;
722 int j; /* counter over alignment annotations */
723 int j2; /* counter over alignment annotations */
724 int ai; /* counter over alignments */
725 char *dealigned = NULL; /* a temporary, dealigned string */
726 char *dealigned2 = NULL; /* another temporary, dealigned string */
727 char *gapped_out = NULL; /* a temporary string with gaps added to fit into merged aln */
728 int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */
729 int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
730 int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
731 int do_add;
732 int found_tag;
733 int be_verbose = FALSE;
734
735 /* we only print info about annotation if -v AND we'll actually
736 * output it (as stockholm or pfam)
737 * (we actually don't even need to be in this function if we're
738 * not output in stockholm or pfam...)
739 */
740 if((esl_opt_GetBoolean(go, "-v")) &&
741 (outfmt == eslMSAFILE_STOCKHOLM || outfmt == eslMSAFILE_PFAM))
742 { be_verbose = TRUE; }
743
744 if(be_verbose) fprintf(stdout, "#\n");
745
746 if(nmsa == 0) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "in validate_and_copy_msa_annotation(): zero child alignments.");
747
748 /* First, determine how many all gap columns to insert after each alignment position
749 * of the first child msa, so we can (possibly) gap out GC,SS_cons,SA_cons,PP_cons annotation
750 * to appropriate length when adding it to the merged MSA. */
751 if((status = determine_gap_columns_to_add(msaA[0], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
752 return status;
753
754 /* Note: esl_strcmp() can handle NULL strings (they are not identical to non-NULL strings) */
755
756 /*********************************************************************/
757 /* Check if name annotation is identical in all alignments */
758 do_add = TRUE; /* until proven otherwise */
759 if(msaA[0]->name != NULL) {
760 for(ai = 1; ai < nmsa; ai++) {
761 if(esl_strcmp(msaA[0]->name, msaA[ai]->name) != 0) { do_add = FALSE; break; }
762 }
763 if(do_add) {
764 if(be_verbose) fprintf(stdout, "# Identical name annotation from all alignments transferred to merged alignment.\n");
765 if((status = esl_strdup(msaA[0]->name, -1, &(mmsa->name))) != eslOK) goto ERROR;
766 }
767 else if(be_verbose) fprintf(stdout, "# Name annotation is not identical in all alignments; not included in merged alignment.\n");
768 }
769 else if(be_verbose) fprintf(stdout, "# Name annotation absent from (at least) first alignment; not included in merged alignment.\n");
770
771 /*********************************************************************/
772 /* Check if description annotation is identical in all alignments */
773 do_add = TRUE; /* until proven otherwise */
774 if(msaA[0]->desc != NULL) {
775 for(ai = 1; ai < nmsa; ai++) {
776 if(esl_strcmp(msaA[0]->desc, msaA[ai]->desc) != 0) { do_add = FALSE; break; }
777 }
778 if(do_add) {
779 if(be_verbose) fprintf(stdout, "# Identical description annotation from all alignments transferred to merged alignment.\n");
780 if((status = esl_strdup(msaA[0]->desc, -1, &(mmsa->desc))) != eslOK) goto ERROR;
781 }
782 else if(be_verbose) fprintf(stdout, "# Description annotation is not identical in all alignments; not included in merged alignment.\n");
783 }
784 else if(be_verbose) fprintf(stdout, "# Description annotation absent from (at least) first alignment; not included in merged alignment.\n");
785
786 /*********************************************************************/
787 /* Check if accession annotation is identical in all alignments */
788 do_add = TRUE; /* until proven otherwise */
789 if(msaA[0]->acc != NULL) {
790 for(ai = 1; ai < nmsa; ai++) {
791 if(esl_strcmp(msaA[0]->acc, msaA[ai]->acc) != 0) { do_add = FALSE; break; }
792 }
793 if(do_add) {
794 if(be_verbose) fprintf(stdout, "# Identical accession annotation from all alignments transferred to merged alignment.\n");
795 if((status = esl_strdup(msaA[0]->acc, -1, &(mmsa->acc))) != eslOK) goto ERROR;
796 }
797 else if(be_verbose) fprintf(stdout, "# Accession annotation is not identical in all alignments; not included in merged alignment.\n");
798 }
799 else if(be_verbose) fprintf(stdout, "# Accession annotation absent from (at least) first alignment; not included in merged alignment.\n");
800
801 /*********************************************************************/
802 /* Check if author annotation is identical in all alignments */
803 do_add = TRUE; /* until proven otherwise */
804 if(msaA[0]->au != NULL) {
805 for(ai = 1; ai < nmsa; ai++) {
806 if(esl_strcmp(msaA[0]->au, msaA[ai]->au) != 0) { do_add = FALSE; break; }
807 }
808 if(do_add) {
809 if(be_verbose) fprintf(stdout, "# Identical author annotation from all alignments transferred to merged alignment.\n");
810 if((status = esl_strdup(msaA[0]->au, -1, &(mmsa->au))) != eslOK) goto ERROR;
811 }
812 else if(be_verbose) fprintf(stdout, "# Author annotation is not identical in all alignments; not included in merged alignment.\n");
813 }
814 else if(be_verbose) fprintf(stdout, "# Author annotation absent from (at least) first alignment; not included in merged alignment.\n");
815
816 /*********************************************************************/
817 /* Check per-file (GF) annotation, must be present and identical in all msaA[] alignments to be included */
818 if(msaA[0]->ngf > 0) {
819 for(j = 0; j < msaA[0]->ngf; j++) {
820 do_add = TRUE; /* until proven otherwise */
821 /* verify that what we think is true is true */
822 if(msaA[0]->gf_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf);
823 if(msaA[0]->gf[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf);
824 for(ai = 1; ai < nmsa; ai++) {
825 found_tag = FALSE;
826 for(j2 = 0; j2 < msaA[ai]->ngf; j2++) {
827 if(esl_strcmp(msaA[0]->gf_tag[j], msaA[ai]->gf_tag[j2]) == 0) {
828 found_tag = TRUE;
829 if(esl_strcmp(msaA[0]->gf[j], msaA[ai]->gf_tag[j2]) != 0) {
830 do_add = FALSE;
831 }
832 break; /* if we found a match, do_add remains TRUE */
833 }
834 }
835 if(found_tag && do_add) {
836 if(be_verbose) fprintf(stdout, "# Identical GF tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gf_tag[j]);
837 if((status = esl_msa_AddGF(mmsa, msaA[0]->gf_tag[j], -1, msaA[0]->gf[j], -1)) != eslOK) goto ERROR;
838 }
839 else {
840 if(be_verbose) fprintf(stdout, "# GF tag %s annotation from first alignment absent from >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gf_tag[j]);
841 }
842 }
843 }
844 }
845 else if(be_verbose) fprintf(stdout, "# Unparsed GF annotation absent from (at least) first alignment; not included in merged alignment.\n");
846
847 /*********************************************************************/
848 /* Check comments, all must be identically ordered and identical in all msaA[] aligments to include them */
849 if(msaA[0]->ncomment > 0) {
850 do_add = TRUE; /* until proven otherwise */
851 /* make sure all alignments have same number of comments */
852 for(ai = 1; ai < nmsa; ai++) {
853 if(msaA[ai]->ncomment != msaA[0]->ncomment) {
854 do_add = FALSE;
855 break;
856 }
857 }
858 if(do_add) {
859 /* make sure all alignments have identical comments */
860 for(j = 0; j < msaA[0]->ncomment; j++) {
861 /* verify that what we think is true is true */
862 if(msaA[0]->comment[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, comment %d of msaA[0] is NULL, but msaA[0]->ncomment is %d.\n", j, msaA[0]->ncomment);
863 for(ai = 1; ai < nmsa; ai++) {
864 if(esl_strcmp(msaA[0]->comment[j], msaA[ai]->comment[j]) != 0) { /* comment doesn't match */
865 do_add = FALSE;
866 break;
867 }
868 }
869 }
870 }
871 if(do_add) {
872 for(j = 0; j < msaA[0]->ncomment; j++) {
873 if((status = esl_msa_AddComment(mmsa, msaA[0]->comment[j], -1))!= eslOK) goto ERROR;
874 }
875 if(be_verbose) fprintf(stdout, "# All alignments have identical comments in the same order. These were transferred to merged alignment.\n");
876 }
877 else {
878 if(be_verbose) fprintf(stdout, "# Comments are not identical in all alignments; not included in merged alignment.\n");
879 }
880 }
881 else if(be_verbose) fprintf(stdout, "# No comments in (at least) first alignment; not included in merged alignment.\n");
882
883 /*********************************************************************/
884 /* Check unparsed per-column (GC) annotation, it must include all gaps in gap RF columns and
885 * be identical once gap RF columns are removed in all msaA[] alignments to be included. */
886
887 if(msaA[0]->ngc > 0) {
888 for(j = 0; j < msaA[0]->ngc; j++) {
889 do_add = TRUE; /* until proven otherwise */
890 /* verify that what we think is true is true */
891 if(msaA[0]->gc_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc);
892 if(msaA[0]->gc[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc);
893
894 /* ensure it does not have non-gaps in gap RF columns */
895 if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->gc[j], msaA[0]->alen)) { /* returns TRUE if gc[j] has 0 non-gap characters in gap columns of RF annotation */
896 /* dealign gc line */
897 if((status = esl_strdup(msaA[0]->gc[j], msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
898 if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[0]", msaA[0]->gc_tag[j]);
899
900 for(ai = 1; ai < nmsa; ai++) {
901 found_tag = FALSE;
902 for(j2 = 0; j2 < msaA[ai]->ngc; j2++) {
903 if(esl_strcmp(msaA[0]->gc_tag[j], msaA[ai]->gc_tag[j2]) == 0) {
904 found_tag = TRUE;
905 /* ensure it does not have non-gaps in gap RF columns */
906 if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->gc[j2], msaA[ai]->alen)) { /* returns TRUE if gc[j2] has 0 non-gap characters in gap columns of RF annotation */
907 /* dealign */
908 if((status = esl_strdup(msaA[ai]->gc[j2], msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
909 if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[%d]", msaA[ai]->gc_tag[j2], ai);
910 /* check identity */
911 if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
912 free(dealigned2);
913 dealigned2 = NULL;
914 break; /* if we matched, do_add remains TRUE */
915 }
916 }
917 }
918 } /* end of (for(ai = 1...)) */
919 if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
920 if(found_tag && do_add) {
921 /* gap out the the GC annotation to fit in merged alignment */
922 if((status = inflate_string_with_gaps_and_missing(msaA[0]->gc[j], msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create GC tag %s annotation for merged alignment.", msaA[0]->gc_tag[j]);
923 if((status = esl_msa_AppendGC(mmsa, msaA[0]->gc_tag[j], gapped_out)) != eslOK) goto ERROR;
924 free(gapped_out);
925 gapped_out = NULL;
926 if(be_verbose) fprintf(stdout, "# Identical GC tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gc_tag[j]);
927 }
928 else {
929 if(be_verbose) fprintf(stdout, "# GC tag %s annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gc_tag[j]);
930 }
931 }
932 } /* end of for(j = 0 j < msaA[0]->ngc... */
933 } /* end of if(msaA[0]->ngc > 0) */
934 else if(be_verbose) fprintf(stdout, "# Unparsed GC annotation absent from (at least) first alignment; not included in merged alignment.\n");
935
936 /*********************************************************************/
937 /* Check ss_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
938 * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
939 if(msaA[0]->ss_cons != NULL) {
940 do_add = TRUE; /* until proven otherwise */
941 /* ensure it does not have non-gaps in gap RF columns */
942 if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->ss_cons, msaA[0]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */
943 /* dealign ss_cons */
944 if((status = esl_strdup(msaA[0]->ss_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
945 if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[0]");
946 for(ai = 1; ai < nmsa; ai++) {
947 if(msaA[ai]->ss_cons == NULL) {
948 do_add = FALSE;
949 break;
950 }
951 /* ss_cons != NULL, ensure it does not have non-gaps in gap RF columns */
952 if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->ss_cons, msaA[ai]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */
953 /* dealign */
954 if((status = esl_strdup(msaA[ai]->ss_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
955 if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[%d]", ai);
956 /* check identity */
957 if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
958 free(dealigned2);
959 dealigned2 = NULL;
960 break; /* if we matched, do_add remains TRUE */
961 }
962 } /* end of (for(ai = 1...)) */
963 if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
964 if(do_add) {
965 /* gap out the the ss_cons to fit in merged alignment,
966 * as a special case, we use '~' in SS_cons as we add missing data and gaps
967 * like we do in RF (for all other annotation we only add gaps) */
968 if((status = inflate_string_with_gaps_and_missing(msaA[0]->ss_cons, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SS_cons annotation for merged alignment.");
969 if(mmsa->ss_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SS_cons to merged alignment, it is already non-NULL.");
970 if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->ss_cons))) != eslOK) goto ERROR;
971 free(gapped_out);
972 gapped_out = NULL;
973 if(be_verbose) fprintf(stdout, "# Identical SS_cons annotation from all alignments transferred to merged alignment.\n");
974 }
975 else {
976 if(be_verbose) fprintf(stdout, "# SS_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
977 }
978 }
979 } /* end of if(msaA[0]->ss_cons != NULL) */
980 else if(be_verbose) fprintf(stdout, "# SS_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
981
982 /*********************************************************************/
983 /* Check sa_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
984 * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
985 if(msaA[0]->sa_cons != NULL) {
986 do_add = TRUE; /* until proven otherwise */
987 /* ensure it does not have non-gaps in gap RF columns */
988 if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->sa_cons, msaA[0]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */
989 /* dealign sa_cons */
990 if((status = esl_strdup(msaA[0]->sa_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
991 if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[0]");
992 for(ai = 1; ai < nmsa; ai++) {
993 if(msaA[ai]->sa_cons == NULL) {
994 do_add = FALSE;
995 break;
996 }
997 /* sa_cons != NULL, ensure it does not have non-gaps in gap RF columns */
998 if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->sa_cons, msaA[ai]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */
999 /* dealign */
1000 if((status = esl_strdup(msaA[ai]->sa_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1001 if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[%d]", ai);
1002 /* check identity */
1003 if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
1004 free(dealigned2);
1005 dealigned2 = NULL;
1006 break; /* if we matched, do_add remains TRUE */
1007 }
1008 } /* end of (for(ai = 1...)) */
1009 if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
1010 if(do_add) {
1011 /* gap out the the sa_cons to fit in merged alignment */
1012 if((status = inflate_string_with_gaps_and_missing(msaA[0]->sa_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SA_cons annotation for merged alignment.");
1013 if(mmsa->sa_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SA_cons to merged alignment, it is already non-NULL.");
1014 if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->sa_cons))) != eslOK) goto ERROR;
1015 free(gapped_out);
1016 gapped_out = NULL;
1017 if(be_verbose) fprintf(stdout, "# Identical SA_cons annotation from all alignments transferred to merged alignment.\n");
1018 }
1019 else {
1020 if(be_verbose) fprintf(stdout, "# SA_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
1021 }
1022 }
1023 } /* end of if(msaA[0]->sa_cons != NULL) */
1024 else if(be_verbose) fprintf(stdout, "# SA_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
1025
1026 /*********************************************************************/
1027 /* Check pp_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all
1028 * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */
1029 if(msaA[0]->pp_cons != NULL) {
1030 do_add = TRUE; /* until proven otherwise */
1031 /* ensure it does not have non-gaps in gap RF columns */
1032 if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->pp_cons, msaA[0]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */
1033 /* dealign pp_cons */
1034 if((status = esl_strdup(msaA[0]->pp_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
1035 if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[0]");
1036 for(ai = 1; ai < nmsa; ai++) {
1037 if(msaA[ai]->pp_cons == NULL) {
1038 do_add = FALSE;
1039 break;
1040 }
1041 /* pp_cons != NULL, ensure it does not have non-gaps in gap RF columns */
1042 if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->pp_cons, msaA[ai]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */
1043 /* dealign */
1044 if((status = esl_strdup(msaA[ai]->pp_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1045 if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[%d]", ai);
1046 /* check identity */
1047 if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; }
1048 free(dealigned2);
1049 dealigned2 = NULL;
1050 break; /* if we matched, do_add remains TRUE */
1051 }
1052 } /* end of (for(ai = 1...)) */
1053 if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
1054 if(do_add) {
1055 /* gap out the the pp_cons to fit in merged alignment */
1056 if((status = inflate_string_with_gaps_and_missing(msaA[0]->pp_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create PP_cons annotation for merged alignment.");
1057 if(mmsa->pp_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding PP_cons to merged alignment, it is already non-NULL.");
1058 if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->pp_cons))) != eslOK) goto ERROR;
1059 free(gapped_out);
1060 gapped_out = NULL;
1061 if(be_verbose) fprintf(stdout, "# Identical PP_cons annotation from all alignments transferred to merged alignment.\n");
1062 }
1063 else {
1064 if(be_verbose) fprintf(stdout, "# PP_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n");
1065 }
1066 }
1067 } /* end of if(msaA[0]->pp_cons != NULL) */
1068 else if(be_verbose) fprintf(stdout, "# PP_cons annotation absent from (at least) first alignment; not included in merged alignment.\n");
1069
1070 /*********************************************************************/
1071 /* Finally, validate that RF annotation is identical in all alignments after removing gaps. */
1072
1073 if(msaA[0]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation.");
1074 /* dealign rf */
1075 if((status = esl_strdup(msaA[0]->rf, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR;
1076 if((status = esl_strdealign(dealigned, dealigned, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[0]");
1077 for(ai = 1; ai < nmsa; ai++) {
1078 if(msaA[ai]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation.");
1079 /* dealign */
1080 if((status = esl_strdup(msaA[ai]->rf, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR;
1081 if((status = esl_strdealign(dealigned2, dealigned2, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[%d]", ai);
1082 /* check identity */
1083 if(esl_strcmp(dealigned, dealigned2) != 0) {
1084 printf("%s\n%s\n", dealigned, dealigned2);
1085 ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have identical #=GC RF annotation, once gaps (\".\",\"-\",\"_\") are removed.\nAlignment %d de-gapped RF annotation differs from that of alignment 1.\n%s\n%s", ai+1, dealigned, dealigned2);
1086 }
1087 if(dealigned2 != NULL) { free(dealigned2); dealigned2 = NULL; }
1088 }
1089 if(dealigned != NULL) { free(dealigned); dealigned = NULL; }
1090 /* gap out the the RF to fit in merged alignment */
1091 if((status = inflate_string_with_gaps_and_missing(msaA[0]->rf, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create RF annotation for merged alignment.");
1092 if(mmsa->rf != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding RF to merged alignment, it is already non-NULL.");
1093 if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->rf))) != eslOK) goto ERROR;
1094 free(gapped_out);
1095 gapped_out = NULL;
1096 if(be_verbose) fprintf(stdout, "# Identical RF annotation from all alignments transferred to merged alignment.\n");
1097
1098 if(dealigned != NULL) free(dealigned);
1099 if(dealigned2 != NULL) free(dealigned2);
1100 if(gapped_out != NULL) free(gapped_out);
1101 if(ngapA != NULL) free(ngapA);
1102 if(nmisA != NULL) free(nmisA);
1103 if(neitherA != NULL) free(neitherA);
1104 return eslOK;
1105
1106 ERROR:
1107 if(dealigned != NULL) free(dealigned);
1108 if(dealigned2 != NULL) free(dealigned2);
1109 if(gapped_out != NULL) free(gapped_out);
1110 if(ngapA != NULL) free(ngapA);
1111 if(nmisA != NULL) free(nmisA);
1112 if(neitherA != NULL) free(neitherA);
1113 return status;
1114 }
1115
1116 /* Function: add_msa
1117 * Date: EPN, Mon Nov 23 05:54:37 2009
1118 *
1119 * Add a "child" MSA we read from a file to the merged
1120 * MSA - the merged alignment that we'll eventually
1121 * output. We free each string in the child as soon
1122 * as we've added it to the merged, to save memory.
1123 *
1124 * We add all sequence data (aseq), and per sequence
1125 * annotation, including sqname, sqdesc, sqacc, pp, ss,
1126 * sa, as well as non-parsed GS and GR annotation.
1127 *
1128 * <maxgap>[0..clen] is an array specifying the
1129 * number of inserted columns necessary between
1130 * each consensus position.
1131 *
1132 * max{gap,mis}[0] is number of gaps/missing columns before 1st cpos.
1133 * max{gap,mis}[clen-1] is number of gaps/missing columns before final cpos.
1134 * max{gap,mis}[clen] is number of gaps/missing columns after final cpos.
1135 *
1136 * <alen_merged> is the number of columns in the merged
1137 * alignment. This is the non-gap RF length plus the
1138 * sum of the maxgap vector.
1139 *
1140 * Returns: eslOK on success.
1141 * eslEMEM on memory allocation failure.
1142 */
1143 int
add_msa(ESL_MSA * mmsa,char * errbuf,ESL_MSA * msa_to_add,int * maxgap,int * maxmis,int clen,int alen_merged)1144 add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged)
1145 {
1146 int status;
1147 int i; /* counter over sequences in msa_to_add */
1148 int j; /* counter over alignment annotations */
1149 int mi; /* counter over sequences in mmsa */
1150 void *tmp; /* for reallocations */
1151 char *tmpstr; /* used for copying GR annotation */
1152 int nseq_existing; /* number of sequences already added to mmsa, by previous calls of this function */
1153 int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */
1154 int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */
1155 int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */
1156
1157 nseq_existing = mmsa->nseq;
1158
1159 /* determine how many all gap columns to insert after each alignment position
1160 * of the child msa when copying it to the merged msa */
1161 if((status = determine_gap_columns_to_add(msa_to_add, maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK)
1162 return status;
1163
1164 /* Append msa_to_add's sequence data and per-sequence annotation to mmsa after adding necessary gaps */
1165 /* sequence names and aligned sequence data (digitized) */
1166 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1167 esl_strdup(msa_to_add->sqname[i], -1, &(mmsa->sqname[mi]));
1168
1169 status = inflate_seq_with_gaps(msa_to_add->abc, msa_to_add->aseq[i], msa_to_add->alen, alen_merged, neitherA, '.', &(mmsa->aseq[mi]));
1170 if (status == eslEMEM) ESL_XFAIL(status, errbuf, "Out of memory adding sequence %s", mmsa->sqname[mi]);
1171 else if(status != eslOK) ESL_XFAIL(status, errbuf, "Found internal missing data symbols in seq: %s", mmsa->sqname[mi]);
1172 free(msa_to_add->aseq[i]); /* free immediately */
1173 msa_to_add->aseq[i] = NULL;
1174 }
1175
1176 /* parsed annotation that is optional */
1177 /* sqacc */
1178 if(msa_to_add->sqacc != NULL) {
1179 if(mmsa->sqacc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1180 ESL_ALLOC(mmsa->sqacc, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1181 for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqacc[mi] = NULL; }
1182 }
1183 else { /* reallocate; to add space for new seqs */
1184 ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1185 }
1186 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1187 if(msa_to_add->sqacc[i] != NULL) {
1188 if((status = esl_strdup(msa_to_add->sqacc[i], -1, &(mmsa->sqacc[mi]))) != eslOK)
1189 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence accession number %d.\n", i+1);
1190 free(msa_to_add->sqacc[i]); /* free immediately */
1191 msa_to_add->sqacc[i] = NULL;
1192 }
1193 else {
1194 mmsa->sqacc[mi] = NULL;
1195 }
1196 }
1197 } /* end of if(msa_to_add->sqacc != NULL */
1198 else if(mmsa->sqacc != NULL) {
1199 /* msa_to_add->sqacc == NULL, but mmsa->sqacc != NULL, reallocate mmsa->sqacc, and set new ones to NULL */
1200 ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1201 for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1202 mmsa->sqacc[mi] = NULL;
1203 }
1204 }
1205
1206 /* sqdesc */
1207 if(msa_to_add->sqdesc != NULL) {
1208 if(mmsa->sqdesc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1209 ESL_ALLOC(mmsa->sqdesc, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1210 for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqdesc[mi] = NULL; }
1211 }
1212 else { /* reallocate; to add space for new seqs */
1213 ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1214 }
1215 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1216 if(msa_to_add->sqdesc[i] != NULL) {
1217 if((status = esl_strdup(msa_to_add->sqdesc[i], -1, &(mmsa->sqdesc[mi]))) != eslOK)
1218 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence description number %d.\n", i+1);
1219 free(msa_to_add->sqdesc[i]); /* free immediately */
1220 msa_to_add->sqdesc[i] = NULL;
1221 }
1222 else {
1223 mmsa->sqdesc[mi] = NULL;
1224 }
1225 }
1226 } /* end of if(msa_to_add->sqdesc != NULL */
1227 else if(mmsa->sqdesc != NULL) {
1228 /* msa_to_add->sqdesc == NULL, but mmsa->sqdesc != NULL, reallocate mmsa->sqdesc, and set new ones to NULL */
1229 ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1230 for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1231 mmsa->sqdesc[mi] = NULL;
1232 }
1233 }
1234
1235 /* per-seq posterior probabilities */
1236 if(msa_to_add->pp != NULL) {
1237 if(mmsa->pp == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1238 ESL_ALLOC(mmsa->pp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1239 for(mi = 0; mi < nseq_existing; mi++) { mmsa->pp[mi] = NULL; }
1240 }
1241 else { /* reallocate; to add space for new seqs */
1242 ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1243 }
1244 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1245 if(msa_to_add->pp[i] != NULL) {
1246 if((status = inflate_string_with_gaps_and_missing(msa_to_add->pp[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->pp[mi]))) != eslOK)
1247 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d posterior probabilities.\n", i+1);
1248 free(msa_to_add->pp[i]); /* free immediately */
1249 msa_to_add->pp[i] = NULL;
1250 }
1251 else {
1252 mmsa->pp[mi] = NULL;
1253 }
1254 }
1255 } /* end of if(msa_to_add->pp != NULL */
1256 else if(mmsa->pp != NULL) {
1257 /* msa_to_add->pp == NULL, but mmsa->pp != NULL, reallocate mmsa->pp, and set new ones to NULL */
1258 ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1259 for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1260 mmsa->pp[mi] = NULL;
1261 }
1262 }
1263
1264 /* per-seq secondary structure */
1265 if(msa_to_add->ss != NULL) {
1266 if(mmsa->ss == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1267 ESL_ALLOC(mmsa->ss, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1268 for(mi = 0; mi < nseq_existing; mi++) { mmsa->ss[mi] = NULL; }
1269 }
1270 else { /* reallocate; to add space for new seqs */
1271 ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1272 }
1273 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1274 if(msa_to_add->ss[i] != NULL) {
1275 if((status = inflate_string_with_gaps_and_missing(msa_to_add->ss[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->ss[mi]))) != eslOK)
1276 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d secondary structure.\n", i+1);
1277 free(msa_to_add->ss[i]); /* free immediately */
1278 msa_to_add->ss[i] = NULL;
1279 }
1280 else {
1281 mmsa->ss[mi] = NULL;
1282 }
1283 }
1284 } /* end of if(msa_to_add->ss != NULL */
1285 else if(mmsa->ss != NULL) {
1286 /* msa_to_add->ss == NULL, but mmsa->ss != NULL, reallocate mmsa->ss, and set new ones to NULL */
1287 ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1288 for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1289 mmsa->ss[mi] = NULL;
1290 }
1291 }
1292
1293 /* per-seq surface accessibility */
1294 if(msa_to_add->sa != NULL) {
1295 if(mmsa->sa == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */
1296 ESL_ALLOC(mmsa->sa, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1297 for(mi = 0; mi < nseq_existing; mi++) { mmsa->sa[mi] = NULL; }
1298 }
1299 else { /* reallocate; to add space for new seqs */
1300 ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1301 }
1302 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1303 if(msa_to_add->sa[i] != NULL) {
1304 if((status = inflate_string_with_gaps_and_missing(msa_to_add->sa[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->sa[mi]))) != eslOK)
1305 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d surface accessibility.\n", i+1);
1306 free(msa_to_add->sa[i]); /* free immediately */
1307 msa_to_add->sa[i] = NULL;
1308 }
1309 else {
1310 mmsa->sa[mi] = NULL;
1311 }
1312 }
1313 } /* end of if(msa_to_add->sa != NULL */
1314 else if(mmsa->sa != NULL) {
1315 /* msa_to_add->sa == NULL, but mmsa->sa != NULL, reallocate mmsa->sa, and set new ones to NULL */
1316 ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq));
1317 for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) {
1318 mmsa->sa[mi] = NULL;
1319 }
1320 }
1321
1322 /* Unparsed per-sequence (GS) annotation */
1323 if(msa_to_add->ngs > 0) {
1324 for(j = 0; j < msa_to_add->ngs; j++) {
1325 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1326 if(msa_to_add->gs[j][i] != NULL)
1327 if((status =esl_msa_AddGS(mmsa, msa_to_add->gs_tag[j], -1, mi, msa_to_add->gs[j][i], -1)) != eslOK)
1328 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GS annotation.\n", i+1);
1329 }
1330 free(msa_to_add->gs[j][i]); /* free immediately */
1331 msa_to_add->gs[j][i] = NULL;
1332 }
1333 }
1334 /* caller will free the rest of gs via esl_msa_Destroy() */
1335
1336 /* unparsed per-residue (GR) annotation */
1337 if(msa_to_add->gr != NULL) {
1338 for(j = 0; j < msa_to_add->ngr; j++) {
1339 for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) {
1340 if(msa_to_add->gr[j][i] != NULL) {
1341 if((status = inflate_string_with_gaps_and_missing(msa_to_add->gr[j][i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(tmpstr))) != eslOK)
1342 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1);
1343 if((status = esl_msa_AppendGR(mmsa, msa_to_add->gr_tag[j], mi, tmpstr)) != eslOK)
1344 ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1);
1345 free(tmpstr);
1346 free(msa_to_add->gr[j][i]); /* free immediately */
1347 msa_to_add->gr[j][i] = NULL;
1348 }
1349 }
1350 }
1351 }
1352 /* caller will free the rest of gr in esl_msa_Destroy() */
1353
1354 /* msa_to_add should be destroyed by caller */
1355
1356 /* update nseq in mmsa */
1357 mmsa->nseq += msa_to_add->nseq;
1358
1359 if(ngapA != NULL) free(ngapA);
1360 if(nmisA != NULL) free(nmisA);
1361 if(neitherA != NULL) free(neitherA);
1362 return eslOK;
1363
1364 ERROR:
1365 if(ngapA != NULL) free(ngapA);
1366 if(nmisA != NULL) free(nmisA);
1367 if(neitherA != NULL) free(neitherA);
1368 return status;
1369 }
1370
1371 /* inflate_string_with_gaps_and_missing
1372 *
1373 * Given a string, create a new one that is a copy of it,
1374 * but with missing data and gaps added before each position (apos)
1375 * as specified by n{gap,mis}A[0..apos..len]. <gapchar> and <mischar>
1376 * specify the gap character and missing data character. Either
1377 * ngapA or nmisA can be either be NULL, if so they're treated as if
1378 * they're all zeroes (no gaps/missing data will be added anywhere).
1379 *
1380 * n{gap,mis}A[0] - number of gaps/missing to add before first posn
1381 * n{gap,mis}A[apos] - number of gaps/missing to add before posn apos
1382 * n{gap,mis}A[src_len] - number of gaps/missing to add after final posn
1383 *
1384 * By convention, missing data symbols are always put before gap symbols
1385 * when more than 1 of each are to be placed after the some nongap RF position.
1386 *
1387 * ret_str is allocated here.
1388 *
1389 * Returns eslOK on success.
1390 * eslEMEM on memory error.
1391 */
1392 int
inflate_string_with_gaps_and_missing(char * src_str,int64_t src_len,int64_t dst_len,int * ngapA,char gapchar,int * nmisA,char mischar,char ** ret_dst_str)1393 inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str)
1394 {
1395 int status;
1396 int src_apos = 0;
1397 int dst_apos = 0;
1398 int i;
1399 char *dst_str;
1400
1401 ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1));
1402 dst_str[dst_len] = '\0';
1403
1404 /* add gaps before first position */
1405 if(nmisA) for(i = 0; i < nmisA[0]; i++) dst_str[dst_apos++] = mischar;
1406 if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = gapchar;
1407
1408 /* add gaps after every position */
1409 for(src_apos = 0; src_apos < src_len; src_apos++) {
1410 dst_str[dst_apos++] = src_str[src_apos];
1411 if(nmisA) for(i = 0; i < nmisA[(src_apos+1)]; i++) dst_str[dst_apos++] = mischar;
1412 if(ngapA) for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = gapchar;
1413 }
1414
1415 *ret_dst_str = dst_str;
1416 return eslOK;
1417
1418 ERROR:
1419 return eslEMEM;
1420 }
1421
1422
1423 /* inflate_seq_with_gaps()
1424 *
1425 * Given an aligned sequence, create a new one that is a copy of it,
1426 * but with gaps added before each position (apos) as specified by
1427 * ngapA[0..apos..len]. <gapchar> specifies the gap character.
1428 *
1429 * This function follows the HMMER3 convention of using '~' to mark
1430 * fragment sequences. If a sequence is a fragment, the first and final
1431 * string of gaps (contiguous gaps) will be marked as '~'. That is,
1432 * the 1st column will necessarily be a '~' and the last will be a '~'.
1433 * This function takes care to add '~' as gap characters in fragments,
1434 * where appropriate, to follow HMMER3 convention.
1435 *
1436 * This function is importantly different from inflate_string_with_gaps_and_missing()
1437 * in that it does not allow internal '~' missing data characters to
1438 * be added (which are dictated by nmisA[] in inflate_string_with_gaps_and_missing())
1439 * Those '~' are possible in Infernal output alignments only in
1440 * #=GC RF and #=GC SS_cons alignment.
1441 *
1442 * It only adds '~' as necessary to the beginning and ends of fragments.
1443 *
1444 * ngapA[0] - number of gaps/missing to add before first posn
1445 * ngapA[apos] - number of gaps/missing to add before posn apos
1446 * ngapA[src_len] - number of gaps/missing to add after final posn
1447 *
1448 * ret_str is allocated here.
1449 *
1450 * Returns eslOK on success.
1451 * eslEMEM on memory error.
1452 * eslEINVAL if sequence (src_str) has an internal missing
1453 * data symbol that violates HMMER3 convention.
1454 */
1455 int
inflate_seq_with_gaps(const ESL_ALPHABET * abc,char * src_str,int64_t src_len,int64_t dst_len,int * ngapA,char gapchar,char ** ret_dst_str)1456 inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str)
1457 {
1458 int status;
1459 int src_apos = 0;
1460 int dst_apos = 0;
1461 int src_flpos = 0; /* position of rightmost '~' in contiguous string that begins at position 0 */
1462 int src_frpos = 0; /* position of leftmost '~' in contiguous string that ends at position src_len-1 */
1463 int i;
1464 char *dst_str;
1465 char char2add;
1466 int i_am_fragment = FALSE;
1467
1468 ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1));
1469 dst_str[dst_len] = '\0';
1470
1471 /* determine if sequence is a fragment, and determine src_flpos and src_frpos */
1472 src_flpos = 0;
1473 while(src_flpos < src_len && esl_abc_CIsMissing(abc, src_str[src_flpos])) { src_flpos++; }
1474 src_flpos--; /* we overshot by 1 */
1475 /* src_flpos is now rightmost '~' in stretch of '~' that begin at position 0, so
1476 * it's -1 if sequence is not a fragment (there are no leading '~' in src_str)
1477 */
1478
1479 src_frpos = src_len-1;
1480 while(src_frpos > -1 && esl_abc_CIsMissing(abc, src_str[src_frpos])) { src_frpos--; }
1481 src_frpos++; /* we overshot by 1 */
1482 /* src_frpos is now leftmost '~' in stretch of '~' that end at position src_len-1, so
1483 * it's src_len if sequence is not a fragment (there are no trailing '~' in src_str)
1484 */
1485 i_am_fragment = (src_flpos != -1 || src_frpos != src_len) ? TRUE : FALSE;
1486
1487 /* Now verify that we don't have any internal '~' characters */
1488 for(src_apos = src_flpos+1; src_apos <= src_frpos-1; src_apos++) {
1489 if(esl_abc_CIsMissing(abc, src_str[src_apos])) return eslEINVAL;
1490 }
1491
1492 /* add gaps before first position */
1493 src_apos = 0;
1494 if(i_am_fragment) {
1495 char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar;
1496 }
1497 else {
1498 char2add = gapchar;
1499 }
1500 if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = char2add;
1501
1502 /* add gaps or missing character after every position */
1503 for(src_apos = 0; src_apos < src_len; src_apos++) {
1504 dst_str[dst_apos++] = src_str[src_apos];
1505 if(i_am_fragment) {
1506 char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar;
1507 }
1508 else {
1509 char2add = gapchar;
1510 }
1511 for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = char2add;
1512 }
1513
1514 *ret_dst_str = dst_str;
1515 return eslOK;
1516
1517 ERROR:
1518 return eslEMEM;
1519 }
1520
1521
1522 /* validate_no_nongaps_in_rf_gaps
1523 *
1524 * Given an RF string with gaps defined as by alphabet <abc>
1525 * and another string of same length. Make sure none of the
1526 * positions that are gaps in the RF string are non-gaps in the
1527 * other string. Return TRUE if none are. Return FALSE if at
1528 * least one is.
1529 *
1530 * Returns TRUE if 0 characters in <other_str> in same position
1531 * as a gap in <rf_str> are non-gaps. FALSE otherwise.
1532 */
1533 int
validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET * abc,char * rf_str,char * other_str,int64_t len)1534 validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len)
1535 {
1536 int64_t i;
1537 for(i = 0; i < len; i++) {
1538 if((! rfchar_is_nongap_nonmissing(abc, rf_str[i])) && (rfchar_is_nongap_nonmissing(abc, other_str[i]))) return FALSE;
1539 }
1540 return TRUE;
1541 }
1542
1543 /* determine_gap_columns_to_add
1544 * (stolen and slightly modified from Infernal 1.1rc1's cmalign.c)
1545 *
1546 * Given <maxgap> and <maxmis>, two arrays of the number of gap RF
1547 * positions and '~' RF (missing data) after each non-gap RF
1548 * (consensus) position in the eventual final merged alignment,
1549 * calculate how many inserts and missing data inserts
1550 * we need to add at each position of <msa> to expand it out to the
1551 * appropriate size of the eventual merged alignment.
1552 *
1553 * max{gap,mis}[0] is number of gaps/missing before 1st cpos in merged aln
1554 * max{gap,mis}[cpos] is number of gaps/missing after cpos in merged aln
1555 * for cpos = 1..clen
1556 * clen is the number of non-gap RF positions in msa (and in eventual merged msa).
1557 *
1558 * We allocate fill and return ret_ngapA[0..msa->alen], ret_nmisA[0..msa->alen],
1559 * and ret_neitherA[0..msa->alen] here.
1560 *
1561 * ret_n{gap,mis}gapA[0] is number of gaps/missing to add before 1st position of msa
1562 * ret_n{gap,mis}gapA[apos] is number of gaps/missing to add after alignment position apos
1563 * for apos = 1..msa->alen
1564 *
1565 * ret_neitherA[apos] = ngapA[apos] + nmisA[apos]
1566 *
1567 * This is similar to the esl_msa.c helper function of the same name,
1568 * but that function does not bother with missing data '~'.
1569 *
1570 * Returns eslOK on success.
1571 * eslEMEM on memory alloaction error
1572 * eslEINVAL if gaps occur before missing data b/t any two nongap RF posns
1573 * eslERANGE if a value exceeds what we expected (based on earlier
1574 * checks before this function was entered).
1575 * if !eslOK, errbuf if filled.
1576 */
1577 int
determine_gap_columns_to_add(ESL_MSA * msa,int * maxgap,int * maxmis,int clen,int ** ret_ngapA,int ** ret_nmisA,int ** ret_neitherA,char * errbuf)1578 determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf)
1579 {
1580 int status;
1581 int apos;
1582 int prv_cpos = 0; /* alignment position corresponding to consensus position cpos-1 */
1583 int cpos = 0;
1584 int ngap = 0;
1585 int nmis = 0;
1586 int *ngapA = NULL;
1587 int *nmisA = NULL;
1588 int *neitherA = NULL;
1589
1590 /* contract check */
1591 if(maxmis[0] != 0) ESL_FAIL(eslEINVAL, errbuf, "missing characters exist prior to first cpos, this shouldn't happen.\n");
1592
1593 ESL_ALLOC(ngapA, sizeof(int) * (msa->alen+1));
1594 ESL_ALLOC(nmisA, sizeof(int) * (msa->alen+1));
1595 ESL_ALLOC(neitherA, sizeof(int) * (msa->alen+1));
1596 esl_vec_ISet(ngapA, (msa->alen+1), 0);
1597 esl_vec_ISet(nmisA, (msa->alen+1), 0);
1598 esl_vec_ISet(neitherA, (msa->alen+1), 0);
1599
1600 for(apos = 0; apos < msa->alen; apos++) {
1601 if(esl_abc_CIsMissing(msa->abc, msa->rf[apos])) {
1602 nmis++;
1603 if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap);
1604 }
1605 else if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) {
1606 ngap++;
1607 }
1608 else { /* a consensus position */
1609 /* a few sanity checks */
1610 if(ngap > maxgap[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d inserts before cpos %d greater than max expected (%d).\n", ngap, cpos, maxgap[cpos]);
1611 if(nmis > maxmis[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d EL inserts before cpos %d greater than max expected (%d).\n", nmis, cpos, maxmis[cpos]);
1612
1613 if (cpos == 0) {
1614 /* gaps and/or missing data before first consensus position: flush right */
1615 if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */
1616 ngapA[prv_cpos] = maxgap[cpos] - ngap; /* flush right gaps (no missing) */
1617 }
1618 else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1619 nmisA[prv_cpos] = maxmis[cpos] - nmis; /* flush right missing (no gaps) */
1620 }
1621 else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) {
1622 /* missing data (ELs) is always 5' of gaps */
1623 nmisA[prv_cpos] = maxmis[cpos] - nmis; /* flush right */
1624 ngapA[prv_cpos+nmis] = maxgap[cpos] - ngap; /* flush right, after missing */
1625 }
1626 }
1627 else {
1628 /* gaps and/missing data after interior consensus position (i.e. not before 1st cpos or after last)
1629 * Determine where to place inserts and/or missing data.
1630 * Handle each of 4 possibilities separately, note that if
1631 * maxgap[cpos] == 0 then ngap == 0, and if maxmis[cpos] == 0 then nmis == 0 (see sanity check above).
1632 */
1633 if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */
1634 ngapA[prv_cpos + 1 + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */
1635 }
1636 else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1637 nmisA[prv_cpos + 1 + (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */
1638 }
1639 else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) {
1640 /* missing data is always 5' of gaps */
1641 nmisA[prv_cpos + 1 + (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */
1642 ngapA[prv_cpos + 1 + nmis + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */
1643 }
1644 /* final case is if (maxgap[cpos] == 0 && maxmis[cpos] == 0)
1645 * in this case we do nothing.
1646 */
1647 }
1648 cpos++;
1649 prv_cpos = apos;
1650 ngap = 0;
1651 nmis = 0;
1652 }
1653 }
1654 /* deal with gaps and missing data after final consensus position */
1655
1656 /* first, validate that clen is what it should be */
1657 if(cpos != clen) {
1658 if(ngapA != NULL) free(ngapA);
1659 if(nmisA != NULL) free(nmisA);
1660 if(neitherA != NULL) free(neitherA);
1661 ESL_FAIL(eslEINCONCEIVABLE, errbuf, "consensus length (%d) is not the expected length (%d).", cpos, clen);
1662 }
1663
1664 if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */
1665 ngapA[prv_cpos + 1 + ngap] = maxgap[cpos] - ngap; /* flush left gaps (no missing) */
1666 }
1667 else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) {
1668 nmisA[prv_cpos + 1 + nmis] = maxmis[cpos] - nmis; /* flush left missing (no gaps) */
1669 }
1670 else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) {
1671 /* missing data (ELs) is always 5' of gaps */
1672 nmisA[prv_cpos + 1 + nmis] = maxmis[cpos] - nmis; /* flush left */
1673 ngapA[prv_cpos + 1 + nmis + ngap] = maxgap[cpos] - ngap; /* flush left, after missing */
1674 }
1675
1676 /* determine neitherA[], the number of gaps due to either inserts or missing data after each apos */
1677 for(apos = 0; apos <= msa->alen; apos++) {
1678 neitherA[apos] = ngapA[apos] + nmisA[apos];
1679 }
1680
1681 *ret_ngapA = ngapA;
1682 *ret_nmisA = nmisA;
1683 *ret_neitherA = neitherA;
1684
1685 return eslOK;
1686
1687 ERROR:
1688 if(ngapA != NULL) free(ngapA);
1689 if(nmisA != NULL) free(nmisA);
1690 if(neitherA != NULL) free(neitherA);
1691 ESL_FAIL(status, errbuf, "Memory allocation error.");
1692 return status; /*NEVERREACHED*/
1693 }
1694
1695 /* write_pfam_msa_top
1696 *
1697 * Highly specialized function for printing out the 'top' of a
1698 * Stockholm alignment file, i.e. the Stockholm header line, comments
1699 * and GF annotation to an msa file. This function is necessary when
1700 * printing the merged alignment file in small memory mode (when
1701 * --small enabled). <msa> is an alignment with no sequence
1702 * information (no aseq, ax, GS, or GR data).
1703 */
1704 void
write_pfam_msa_top(FILE * fp,ESL_MSA * msa)1705 write_pfam_msa_top(FILE *fp, ESL_MSA *msa)
1706 {
1707 int i, maxgf;
1708
1709 /* rest of code in this function was stolen verbatim from actually_write_stockholm in esl_msa.c */
1710
1711 /* Magic Stockholm header
1712 */
1713 fprintf(fp, "# STOCKHOLM 1.0\n");
1714
1715 /* Free text comments
1716 */
1717 for (i = 0; i < msa->ncomment; i++)
1718 fprintf(fp, "# %s\n", msa->comment[i]);
1719 if (msa->ncomment > 0) fprintf(fp, "\n");
1720
1721 maxgf = maxwidth(msa->gf_tag, msa->ngf);
1722 if (maxgf < 2) maxgf = 2;
1723
1724 /* GF section: per-file annotation
1725 */
1726 if (msa->name != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "ID", msa->name);
1727 if (msa->acc != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AC", msa->acc);
1728 if (msa->desc != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "DE", msa->desc);
1729 if (msa->au != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AU", msa->au);
1730
1731 /* Thresholds are hacky. Pfam has two. Rfam has one.
1732 */
1733 if (msa->cutset[eslMSA_GA1] && msa->cutset[eslMSA_GA2])
1734 fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1735 maxgf, "GA", msa->cutoff[eslMSA_GA1], msa->cutoff[eslMSA_GA2]);
1736 else if (msa->cutset[eslMSA_GA1])
1737 fprintf(fp, "#=GF %-*s %.1f\n",
1738 maxgf, "GA", msa->cutoff[eslMSA_GA1]);
1739
1740 if (msa->cutset[eslMSA_NC1] && msa->cutset[eslMSA_NC2])
1741 fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1742 maxgf, "NC", msa->cutoff[eslMSA_NC1], msa->cutoff[eslMSA_NC2]);
1743 else if (msa->cutset[eslMSA_NC1])
1744 fprintf(fp, "#=GF %-*s %.1f\n",
1745 maxgf, "NC", msa->cutoff[eslMSA_NC1]);
1746
1747 if (msa->cutset[eslMSA_TC1] && msa->cutset[eslMSA_TC2])
1748 fprintf(fp, "#=GF %-*s %.1f %.1f\n",
1749 maxgf, "TC", msa->cutoff[eslMSA_TC1], msa->cutoff[eslMSA_TC2]);
1750 else if (msa->cutset[eslMSA_TC1])
1751 fprintf(fp, "#=GF %-*s %.1f\n",
1752 maxgf, "TC", msa->cutoff[eslMSA_TC1]);
1753
1754 for (i = 0; i < msa->ngf; i++)
1755 fprintf(fp, "#=GF %-*s %s\n", maxgf, msa->gf_tag[i], msa->gf[i]);
1756 fprintf(fp, "\n");
1757
1758 return;
1759 }
1760
1761 /* write_pfam_msa_gc
1762 *
1763 * Highly specialized function for printing out the GC annotation to a
1764 * Pfam Stockholm alignment file (1 line/seq). This function is
1765 * necessary when printing the merged alignment file in small memory
1766 * mode (when --small enabled). <msa> is an alignment with no sequence
1767 * information (no aseq, ax, GS, nor GR data).
1768 */
1769 void
write_pfam_msa_gc(FILE * fp,ESL_MSA * msa,int margin)1770 write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int margin)
1771 {
1772 int i;
1773 if (msa->ss_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SS_cons", msa->ss_cons); }
1774 if (msa->sa_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SA_cons", msa->sa_cons); }
1775 if (msa->pp_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "PP_cons", msa->pp_cons); }
1776 if (msa->rf != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "RF", msa->rf); }
1777 for (i = 0; i < msa->ngc; i++) { fprintf(fp, "#=GC %-*s %s\n", margin-6, msa->gc_tag[i], msa->gc[i]); }
1778 fprintf(fp, "//\n");
1779 }
1780
1781 /* maxwidth()
1782 * Return the length of the longest string in
1783 * an array of strings.
1784 */
1785 static int64_t
maxwidth(char ** s,int n)1786 maxwidth(char **s, int n)
1787 {
1788 int64_t max,len;
1789 int i;
1790
1791 max = 0;
1792 for (i = 0; i < n; i++)
1793 if (s[i] != NULL)
1794 {
1795 len = strlen(s[i]);
1796 if (len > max) max = len;
1797 }
1798 return max;
1799 }
1800
1801 /* rfchar_is_nongap_nonmissing()
1802 * Return FALSE if a character from RF annotation is
1803 * a gap or a missing character, else return TRUE.
1804 */
1805 static int
rfchar_is_nongap_nonmissing(const ESL_ALPHABET * abc,char rfchar)1806 rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar) {
1807 if(esl_abc_CIsGap (abc, rfchar)) return FALSE;
1808 if(esl_abc_CIsMissing(abc, rfchar)) return FALSE;
1809 return TRUE;
1810 }
1811