1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /* msa.c
11 * SRE, Mon May 17 10:48:47 1999
12 *
13 * SQUID's interface for multiple sequence alignment
14 * manipulation: access to the MSA object.
15 *
16 * RCS $Id: msa.c 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp)
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "squid.h"
23 #include "msa.h" /* multiple sequence alignment object support */
24 #include "gki.h" /* string indexing hashtable code */
25 #include "ssi.h" /* SSI sequence file indexing code */
26
27 /* Function: MSAAlloc()
28 * Date: SRE, Tue May 18 10:45:47 1999 [St. Louis]
29 *
30 * Purpose: Allocate an MSA structure, return a pointer
31 * to it.
32 *
33 * Designed to be used in three ways:
34 * 1) We know exactly the dimensions of the alignment:
35 * both nseq and alen.
36 * msa = MSAAlloc(nseq, alen);
37 *
38 * 2) We know the number of sequences but not alen.
39 * (We add sequences later.)
40 * msa = MSAAlloc(nseq, 0);
41 *
42 * 3) We even don't know the number of sequences, so
43 * we'll have to dynamically expand allocations.
44 * We provide a blocksize for the allocation expansion,
45 * and expand when needed.
46 * msa = MSAAlloc(10, 0);
47 * if (msa->nseq == msa->nseqalloc) MSAExpand(msa);
48 *
49 * Args: nseq - number of sequences, or nseq allocation blocksize
50 * alen - length of alignment in columns, or 0
51 *
52 * Returns: pointer to new MSA object, w/ all values initialized.
53 * Note that msa->nseq is initialized to 0, though space
54 * is allocated.
55 *
56 * Diagnostics: "always works". Die()'s on memory allocation failure.
57 *
58 */
59 MSA *
MSAAlloc(int nseq,int alen)60 MSAAlloc(int nseq, int alen)
61 {
62 MSA *msa;
63 int i;
64
65 msa = MallocOrDie(sizeof(MSA));
66 msa->aseq = MallocOrDie(sizeof(char *) * nseq);
67 msa->sqname = MallocOrDie(sizeof(char *) * nseq);
68 msa->sqlen = MallocOrDie(sizeof(int) * nseq);
69 msa->wgt = MallocOrDie(sizeof(float) * nseq);
70
71 for (i = 0; i < nseq; i++)
72 {
73 msa->sqname[i] = NULL;
74 msa->sqlen[i] = 0;
75 msa->wgt[i] = -1.0;
76
77 if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1));
78 else msa->aseq[i] = NULL;
79 }
80
81 msa->alen = alen;
82 msa->nseq = 0;
83 msa->nseqalloc = nseq;
84 msa->nseqlump = nseq;
85
86 msa->flags = 0;
87 msa->type = kOtherSeq;
88 msa->name = NULL;
89 msa->desc = NULL;
90 msa->acc = NULL;
91 msa->au = NULL;
92 msa->ss_cons = NULL;
93 msa->sa_cons = NULL;
94 msa->rf = NULL;
95 msa->sqacc = NULL;
96 msa->sqdesc = NULL;
97 msa->ss = NULL;
98 msa->sslen = NULL;
99 msa->sa = NULL;
100 msa->salen = NULL;
101 msa->co = NULL;
102 msa->colen = NULL;
103 msa->index = GKIInit();
104 msa->lastidx = 0;
105
106 for (i = 0; i < MSA_MAXCUTOFFS; i++) {
107 msa->cutoff[i] = 0.;
108 msa->cutoff_is_set[i] = FALSE;
109 }
110
111 /* Initialize unparsed optional markup
112 */
113 msa->comment = NULL;
114 msa->ncomment = 0;
115 msa->alloc_ncomment = 0;
116
117 msa->gf_tag = NULL;
118 msa->gf = NULL;
119 msa->ngf = 0;
120
121 msa->gs_tag = NULL;
122 msa->gs = NULL;
123 msa->gs_idx = NULL;
124 msa->ngs = 0;
125
126 msa->gc_tag = NULL;
127 msa->gc = NULL;
128 msa->gc_idx = NULL;
129 msa->ngc = 0;
130
131 msa->gr_tag = NULL;
132 msa->gr = NULL;
133 msa->gr_idx = NULL;
134 msa->ngr = 0;
135
136 /* Done. Return the alloced, initialized structure
137 */
138 return msa;
139 }
140
141 /* Function: MSAExpand()
142 * Date: SRE, Tue May 18 11:06:53 1999 [St. Louis]
143 *
144 * Purpose: Increase the sequence allocation in an MSA
145 * by msa->nseqlump. (Typically used when we're reading
146 * in an alignment sequentially from a file,
147 * so we don't know nseq until we're done.)
148 *
149 * Args: msa - the MSA object
150 *
151 * Returns: (void)
152 *
153 */
154 void
MSAExpand(MSA * msa)155 MSAExpand(MSA *msa)
156 {
157 int i,j;
158
159 msa->nseqalloc += msa->nseqlump;
160
161 msa->aseq = ReallocOrDie(msa->aseq, sizeof(char *) * msa->nseqalloc);
162 msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc);
163 msa->sqlen = ReallocOrDie(msa->sqlen, sizeof(char *) * msa->nseqalloc);
164 msa->wgt = ReallocOrDie(msa->wgt, sizeof(float) * msa->nseqalloc);
165
166 if (msa->ss != NULL) {
167 msa->ss = ReallocOrDie(msa->ss, sizeof(char *) * msa->nseqalloc);
168 msa->sslen = ReallocOrDie(msa->sslen, sizeof(int) * msa->nseqalloc);
169 }
170 if (msa->sa != NULL) {
171 msa->sa = ReallocOrDie(msa->sa, sizeof(char *) * msa->nseqalloc);
172 msa->salen = ReallocOrDie(msa->salen, sizeof(int) * msa->nseqalloc);
173 }
174 if (msa->sqacc != NULL)
175 msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc);
176 if (msa->sqdesc != NULL)
177 msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc);
178
179 for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++)
180 {
181 msa->sqname[i] = NULL;
182 msa->wgt[i] = -1.0;
183
184 if (msa->sqacc != NULL) msa->sqacc[i] = NULL;
185 if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL;
186
187 if (msa->alen != 0)
188 msa->aseq[i] = ReallocOrDie(msa->aseq[i], sizeof(char) * (msa->alen+1));
189 else msa->aseq[i] = NULL;
190 msa->sqlen[i] = 0;
191
192 if (msa->ss != NULL) {
193 if (msa->alen != 0)
194 msa->ss[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
195 else msa->ss[i] = NULL;
196 msa->sslen[i] = 0;
197 }
198 if (msa->sa != NULL) {
199 if (msa->alen != 0)
200 msa->sa[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
201 else
202 msa->sa[i] = NULL;
203 msa->salen[i] = 0;
204 }
205 }
206
207 /* Reallocate and re-init for unparsed #=GS tags, if we have some.
208 * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
209 * set of pointers.
210 */
211 if (msa->gs != NULL)
212 for (i = 0; i < msa->ngs; i++)
213 {
214 if (msa->gs[i] != NULL)
215 {
216 msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc);
217 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
218 msa->gs[i][j] = NULL;
219 }
220 }
221
222 /* Reallocate and re-init for unparsed #=GR tags, if we have some.
223 * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
224 * set of pointers.
225 */
226 if (msa->gr != NULL)
227 for (i = 0; i < msa->ngr; i++)
228 {
229 if (msa->gr[i] != NULL)
230 {
231 msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc);
232 for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
233 msa->gr[i][j] = NULL;
234 }
235 }
236
237 return;
238 }
239
240 /* Function: MSAFree()
241 * Date: SRE, Tue May 18 11:20:16 1999 [St. Louis]
242 *
243 * Purpose: Free a multiple sequence alignment structure.
244 *
245 * Args: msa - the alignment
246 *
247 * Returns: (void)
248 */
249 void
MSAFree(MSA * msa)250 MSAFree(MSA *msa)
251 {
252 Free2DArray((void **) msa->aseq, msa->nseq);
253 Free2DArray((void **) msa->sqname, msa->nseq);
254 Free2DArray((void **) msa->sqacc, msa->nseq);
255 Free2DArray((void **) msa->sqdesc, msa->nseq);
256 Free2DArray((void **) msa->ss, msa->nseq);
257 Free2DArray((void **) msa->sa, msa->nseq);
258 Free2DArray((void **) msa->co, msa->nseq);
259
260 if (msa->sqlen != NULL) free(msa->sqlen);
261 if (msa->wgt != NULL) free(msa->wgt);
262
263 if (msa->name != NULL) free(msa->name);
264 if (msa->desc != NULL) free(msa->desc);
265 if (msa->acc != NULL) free(msa->acc);
266 if (msa->au != NULL) free(msa->au);
267 if (msa->ss_cons != NULL) free(msa->ss_cons);
268 if (msa->sa_cons != NULL) free(msa->sa_cons);
269 if (msa->rf != NULL) free(msa->rf);
270 if (msa->sslen != NULL) free(msa->sslen);
271 if (msa->salen != NULL) free(msa->salen);
272
273 Free2DArray((void **) msa->comment, msa->ncomment);
274 Free2DArray((void **) msa->gf_tag, msa->ngf);
275 Free2DArray((void **) msa->gf, msa->ngf);
276 Free2DArray((void **) msa->gs_tag, msa->ngs);
277 Free3DArray((void ***)msa->gs, msa->ngs, msa->nseq);
278 Free2DArray((void **) msa->gc_tag, msa->ngc);
279 Free2DArray((void **) msa->gc, msa->ngc);
280 Free2DArray((void **) msa->gr_tag, msa->ngr);
281 Free3DArray((void ***)msa->gr, msa->ngr, msa->nseq);
282
283 GKIFree(msa->index);
284 GKIFree(msa->gs_idx);
285 GKIFree(msa->gc_idx);
286 GKIFree(msa->gr_idx);
287
288 free(msa);
289 }
290
291
292 /* Function: MSASetSeqAccession()
293 * Date: SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre]
294 *
295 * Purpose: Set a sequence accession in an MSA structure.
296 * Handles some necessary allocation/initialization.
297 *
298 * Args: msa - multiple alignment to add accession to
299 * seqidx - index of sequence to attach accession to
300 * acc - accession
301 *
302 * Returns: void
303 */
304 void
MSASetSeqAccession(MSA * msa,int seqidx,char * acc)305 MSASetSeqAccession(MSA *msa, int seqidx, char *acc)
306 {
307 int x;
308
309 if (msa->sqacc == NULL) {
310 msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
311 for (x = 0; x < msa->nseqalloc; x++)
312 msa->sqacc[x] = NULL;
313 }
314 msa->sqacc[seqidx] = sre_strdup(acc, -1);
315 }
316
317 /* Function: MSASetSeqDescription()
318 * Date: SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre]
319 *
320 * Purpose: Set a sequence description in an MSA structure.
321 * Handles some necessary allocation/initialization.
322 *
323 * Args: msa - multiple alignment to add accession to
324 * seqidx - index of sequence to attach accession to
325 * desc - description
326 *
327 * Returns: void
328 */
329 void
MSASetSeqDescription(MSA * msa,int seqidx,char * desc)330 MSASetSeqDescription(MSA *msa, int seqidx, char *desc)
331 {
332 int x;
333
334 if (msa->sqdesc == NULL) {
335 msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
336 for (x = 0; x < msa->nseqalloc; x++)
337 msa->sqdesc[x] = NULL;
338 }
339 msa->sqdesc[seqidx] = sre_strdup(desc, -1);
340 }
341
342
343 /* Function: MSAAddComment()
344 * Date: SRE, Tue Jun 1 17:37:21 1999 [St. Louis]
345 *
346 * Purpose: Add an (unparsed) comment line to the MSA structure,
347 * allocating as necessary.
348 *
349 * Args: msa - a multiple alignment
350 * s - comment line to add
351 *
352 * Returns: (void)
353 */
354 void
MSAAddComment(MSA * msa,char * s)355 MSAAddComment(MSA *msa, char *s)
356 {
357 /* If this is our first recorded comment, we need to malloc();
358 * and if we've filled available space, we need to realloc().
359 * Note the arbitrary lumpsize of 10 lines per allocation...
360 */
361 if (msa->comment == NULL) {
362 msa->comment = MallocOrDie (sizeof(char *) * 10);
363 msa->alloc_ncomment = 10;
364 }
365 if (msa->ncomment == msa->alloc_ncomment) {
366 msa->alloc_ncomment += 10;
367 msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment);
368 }
369
370 msa->comment[msa->ncomment] = sre_strdup(s, -1);
371 msa->ncomment++;
372 return;
373 }
374
375 /* Function: MSAAddGF()
376 * Date: SRE, Wed Jun 2 06:53:54 1999 [bus to Madison]
377 *
378 * Purpose: Add an unparsed #=GF markup line to the MSA
379 * structure, allocating as necessary.
380 *
381 * Args: msa - a multiple alignment
382 * tag - markup tag (e.g. "AU")
383 * value - free text markup (e.g. "Alex Bateman")
384 *
385 * Returns: (void)
386 */
387 void
MSAAddGF(MSA * msa,char * tag,char * value)388 MSAAddGF(MSA *msa, char *tag, char *value)
389 {
390 /* If this is our first recorded unparsed #=GF line, we need to malloc();
391 * if we've filled availabl space If we already have a hash index, and the GF
392 * Note the arbitrary lumpsize of 10 lines per allocation...
393 */
394 if (msa->gf_tag == NULL) {
395 msa->gf_tag = MallocOrDie (sizeof(char *) * 10);
396 msa->gf = MallocOrDie (sizeof(char *) * 10);
397 msa->alloc_ngf = 10;
398 }
399 if (msa->ngf == msa->alloc_ngf) {
400 msa->alloc_ngf += 10;
401 msa->gf_tag = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf);
402 msa->gf = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf);
403 }
404
405 msa->gf_tag[msa->ngf] = sre_strdup(tag, -1);
406 msa->gf[msa->ngf] = sre_strdup(value, -1);
407 msa->ngf++;
408
409 return;
410 }
411
412
413 /* Function: MSAAddGS()
414 * Date: SRE, Wed Jun 2 06:57:03 1999 [St. Louis]
415 *
416 * Purpose: Add an unparsed #=GS markup line to the MSA
417 * structure, allocating as necessary.
418 *
419 * It's possible that we could get more than one
420 * of the same type of GS tag per sequence; for
421 * example, "DR PDB;" structure links in Pfam.
422 * Hack: handle these by appending to the string,
423 * in a \n separated fashion.
424 *
425 * Args: msa - multiple alignment structure
426 * tag - markup tag (e.g. "AC")
427 * sqidx - index of sequence to assoc markup with (0..nseq-1)
428 * value - markup (e.g. "P00666")
429 *
430 * Returns: 0 on success
431 */
432 void
MSAAddGS(MSA * msa,char * tag,int sqidx,char * value)433 MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
434 {
435 int tagidx;
436 int i;
437
438 /* Is this an unparsed tag name that we recognize?
439 * If not, handle adding it to index, and reallocating
440 * as needed.
441 */
442 if (msa->gs_tag == NULL) /* first tag? init w/ malloc */
443 {
444 msa->gs_idx = GKIInit();
445 tagidx = GKIStoreKey(msa->gs_idx, tag);
446 SQD_DASSERT1((tagidx == 0));
447 msa->gs_tag = MallocOrDie(sizeof(char *));
448 msa->gs = MallocOrDie(sizeof(char **));
449 msa->gs[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
450 for (i = 0; i < msa->nseqalloc; i++)
451 msa->gs[0][i] = NULL;
452 }
453 else
454 {
455 /* new tag? */
456 tagidx = GKIKeyIndex(msa->gs_idx, tag);
457 if (tagidx < 0) { /* it's a new tag name; realloc */
458 tagidx = GKIStoreKey(msa->gs_idx, tag);
459 /* since we alloc in blocks of 1,
460 we always realloc upon seeing
461 a new tag. */
462 SQD_DASSERT1((tagidx == msa->ngs));
463 msa->gs_tag = ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
464 msa->gs = ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **));
465 msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
466 for (i = 0; i < msa->nseqalloc; i++)
467 msa->gs[msa->ngs][i] = NULL;
468 }
469 }
470
471 if (tagidx == msa->ngs) {
472 msa->gs_tag[tagidx] = sre_strdup(tag, -1);
473 msa->ngs++;
474 }
475
476 if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag? */
477 msa->gs[tagidx][sqidx] = sre_strdup(value, -1);
478 else {
479 /* >1 annotation of this seq with this tag; append */
480 int len;
481 if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0)
482 Die("failed to sre_strcat()");
483 if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0)
484 Die("failed to sre_strcat()");
485 }
486 return;
487 }
488
489 /* Function: MSAAppendGC()
490 * Date: SRE, Thu Jun 3 06:25:14 1999 [Madison]
491 *
492 * Purpose: Add an unparsed #=GC markup line to the MSA
493 * structure, allocating as necessary.
494 *
495 * When called multiple times for the same tag,
496 * appends value strings together -- used when
497 * parsing multiblock alignment files, for
498 * example.
499 *
500 * Args: msa - multiple alignment structure
501 * tag - markup tag (e.g. "CS")
502 * value - markup, one char per aligned column
503 *
504 * Returns: (void)
505 */
506 void
MSAAppendGC(MSA * msa,char * tag,char * value)507 MSAAppendGC(MSA *msa, char *tag, char *value)
508 {
509 int tagidx;
510
511 /* Is this an unparsed tag name that we recognize?
512 * If not, handle adding it to index, and reallocating
513 * as needed.
514 */
515 if (msa->gc_tag == NULL) /* first tag? init w/ malloc */
516 {
517 msa->gc_tag = MallocOrDie(sizeof(char *));
518 msa->gc = MallocOrDie(sizeof(char *));
519 msa->gc_idx = GKIInit();
520 tagidx = GKIStoreKey(msa->gc_idx, tag);
521 SQD_DASSERT1((tagidx == 0));
522 msa->gc[0] = NULL;
523 }
524 else
525 { /* new tag? */
526 tagidx = GKIKeyIndex(msa->gc_idx, tag);
527 if (tagidx < 0) { /* it's a new tag name; realloc */
528 tagidx = GKIStoreKey(msa->gc_idx, tag);
529 /* since we alloc in blocks of 1,
530 we always realloc upon seeing
531 a new tag. */
532 SQD_DASSERT1((tagidx == msa->ngc));
533 msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char **));
534 msa->gc = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char **));
535 msa->gc[tagidx] = NULL;
536 }
537 }
538
539 if (tagidx == msa->ngc) {
540 msa->gc_tag[tagidx] = sre_strdup(tag, -1);
541 msa->ngc++;
542 }
543 sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
544 return;
545 }
546
547 /* Function: MSAGetGC()
548 * Date: SRE, Fri Aug 13 13:25:57 1999 [St. Louis]
549 *
550 * Purpose: Given a tagname for a miscellaneous #=GC column
551 * annotation, return a pointer to the annotation
552 * string.
553 *
554 * Args: msa - alignment and its annotation
555 * tag - name of the annotation
556 *
557 * Returns: ptr to the annotation string. Caller does *not*
558 * free; is managed by msa object still.
559 */
560 char *
MSAGetGC(MSA * msa,char * tag)561 MSAGetGC(MSA *msa, char *tag)
562 {
563 int tagidx;
564
565 if (msa->gc_idx == NULL) return NULL;
566 if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL;
567 return msa->gc[tagidx];
568 }
569
570
571 /* Function: MSAAppendGR()
572 * Date: SRE, Thu Jun 3 06:34:38 1999 [Madison]
573 *
574 * Purpose: Add an unparsed #=GR markup line to the
575 * MSA structure, allocating as necessary.
576 *
577 * When called multiple times for the same tag,
578 * appends value strings together -- used when
579 * parsing multiblock alignment files, for
580 * example.
581 *
582 * Args: msa - multiple alignment structure
583 * tag - markup tag (e.g. "SS")
584 * sqidx - index of seq to assoc markup with (0..nseq-1)
585 * value - markup, one char per aligned column
586 *
587 * Returns: (void)
588 */
589 void
MSAAppendGR(MSA * msa,char * tag,int sqidx,char * value)590 MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
591 {
592 int tagidx;
593 int i;
594
595 /* Is this an unparsed tag name that we recognize?
596 * If not, handle adding it to index, and reallocating
597 * as needed.
598 */
599 if (msa->gr_tag == NULL) /* first tag? init w/ malloc */
600 {
601 msa->gr_tag = MallocOrDie(sizeof(char *));
602 msa->gr = MallocOrDie(sizeof(char **));
603 msa->gr[0] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
604 for (i = 0; i < msa->nseqalloc; i++)
605 msa->gr[0][i] = NULL;
606 msa->gr_idx = GKIInit();
607 tagidx = GKIStoreKey(msa->gr_idx, tag);
608 SQD_DASSERT1((tagidx == 0));
609 }
610 else
611 {
612 /* new tag? */
613 tagidx = GKIKeyIndex(msa->gr_idx, tag);
614 if (tagidx < 0) { /* it's a new tag name; realloc */
615 tagidx = GKIStoreKey(msa->gr_idx, tag);
616 /* since we alloc in blocks of 1,
617 we always realloc upon seeing
618 a new tag. */
619 SQD_DASSERT1((tagidx == msa->ngr));
620 msa->gr_tag = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
621 msa->gr = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **));
622 msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
623 for (i = 0; i < msa->nseqalloc; i++)
624 msa->gr[msa->ngr][i] = NULL;
625 }
626 }
627
628 if (tagidx == msa->ngr) {
629 msa->gr_tag[tagidx] = sre_strdup(tag, -1);
630 msa->ngr++;
631 }
632 sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
633 return;
634 }
635
636
637 /* Function: MSAVerifyParse()
638 * Date: SRE, Sat Jun 5 14:24:24 1999 [Madison, 1999 worm mtg]
639 *
640 * Purpose: Last function called after a multiple alignment is
641 * parsed. Checks that parse was successful; makes sure
642 * required information is present; makes sure required
643 * information is consistent. Some fields that are
644 * only use during parsing may be freed (sqlen, for
645 * example).
646 *
647 * Some fields in msa may be modified (msa->alen is set,
648 * for example).
649 *
650 * Args: msa - the multiple alignment
651 * sqname, aseq must be set
652 * nseq must be correct
653 * alen need not be set; will be set here.
654 * wgt will be set here if not already set
655 *
656 * Returns: (void)
657 * Will Die() here with diagnostics on error.
658 *
659 * Example:
660 */
661 void
MSAVerifyParse(MSA * msa)662 MSAVerifyParse(MSA *msa)
663 {
664 int idx;
665
666 if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
667 msa->name != NULL ? msa->name : "");
668
669 msa->alen = msa->sqlen[0];
670
671 /* We can rely on msa->sqname[] being valid for any index,
672 * because of the way the line parsers always store any name
673 * they add to the index.
674 */
675 for (idx = 0; idx < msa->nseq; idx++)
676 {
677 /* aseq is required. */
678 if (msa->aseq[idx] == NULL)
679 Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
680 msa->name != NULL ? msa->name : "");
681 /* either all weights must be set, or none of them */
682 if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
683 Die("Parse error: some weights are set, but %s doesn't have one in alignment %s",
684 msa->sqname[idx],
685 msa->name != NULL ? msa->name : "");
686 /* all aseq must be same length. */
687 if (msa->sqlen[idx] != msa->alen)
688 Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
689 msa->sqname[idx], msa->sqlen[idx], msa->alen,
690 msa->name != NULL ? msa->name : "");
691 /* if SS is present, must have length right */
692 if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen)
693 Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
694 msa->sqname[idx], msa->sslen[idx], msa->alen,
695 msa->name != NULL ? msa->name : "");
696 /* if SA is present, must have length right */
697 if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen)
698 Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
699 msa->sqname[idx], msa->salen[idx], msa->alen,
700 msa->name != NULL ? msa->name : "");
701 }
702
703 /* if cons SS is present, must have length right */
704 if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen) /* FIXME */
705 Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
706 strlen(msa->ss_cons), msa->alen,
707 msa->name != NULL ? msa->name : "");
708
709 /* if cons SA is present, must have length right */
710 if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen) /* FIXME */
711 Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
712 strlen(msa->sa_cons), msa->alen,
713 msa->name != NULL ? msa->name : "");
714
715 /* if RF is present, must have length right */
716 if (msa->rf != NULL && strlen(msa->rf) != msa->alen)
717 Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s",
718 strlen(msa->rf), msa->alen,
719 msa->name != NULL ? msa->name : "");
720
721 /* Check that all or no weights are set */
722 if (!(msa->flags & MSA_SET_WGT))
723 FSet(msa->wgt, msa->nseq, 1.0); /* default weights */
724
725 /* Clean up a little from the parser */
726 if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
727 if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
728 if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }
729
730 return;
731 }
732
733
734 /* @* MSAVerifyParseDub */
735 void
MSAVerifyParseDub(MSA * msa)736 MSAVerifyParseDub(MSA *msa)
737 {
738 int idx;
739
740 if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
741 msa->name != NULL ? msa->name : "");
742
743 msa->alen = msa->sqlen[0];
744
745 /* We can rely on msa->sqname[] being valid for any index,
746 * because of the way the line parsers always store any name
747 * they add to the index.
748 */
749 for (idx = 0; idx < msa->nseq; idx++)
750 {
751 /* aseq is required. */
752 if (msa->aseq[idx] == NULL)
753 Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
754 msa->name != NULL ? msa->name : "");
755 /* either all weights must be set, or none of them */
756 if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
757 Die("Parse error: some weights are set, but %s doesn't have one in alignment %s",
758 msa->sqname[idx],
759 msa->name != NULL ? msa->name : "");
760 /* this is Dublin format, aseq don't have to be same length. */
761 if (msa->sqlen[idx] > msa->alen) {
762 msa->alen = msa->sqlen[idx];
763 }
764 /* if SS is present, must have length right */
765 if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->sqlen[idx])
766 Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
767 msa->sqname[idx], msa->sslen[idx], msa->sqlen[idx],
768 msa->name != NULL ? msa->name : "");
769 /* if SA is present, must have length right */
770 if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->sqlen[idx])
771 Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
772 msa->sqname[idx], msa->salen[idx], msa->sqlen[idx],
773 msa->name != NULL ? msa->name : "");
774
775 /* if CO is present, must have length right */
776 if (msa->co != NULL && msa->co[idx] != NULL && msa->colen[idx] != msa->sqlen[idx])
777 Die("Parse error: #=GR CO annotation for %s: length %d, expected %d in alignment %s",
778 msa->sqname[idx], msa->colen[idx], msa->sqlen[idx],
779 msa->name != NULL ? msa->name : "");
780 }
781
782
783
784
785 /* Check that all or no weights are set */
786 if (!(msa->flags & MSA_SET_WGT))
787 FSet(msa->wgt, msa->nseq, 1.0); /* default weights */
788
789 /* Clean up a little from the parser */
790 if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
791 if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
792 if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }
793 if (msa->colen != NULL) { free(msa->colen); msa->colen = NULL; }
794
795 return;
796 } /* this is the end of MSAVerifyParseDub() */
797
798
799
800 /* Function: MSAFileOpen()
801 * Date: SRE, Tue May 18 13:22:01 1999 [St. Louis]
802 *
803 * Purpose: Open an alignment database file and prepare
804 * for reading one alignment, or sequentially
805 * in the (rare) case of multiple MSA databases
806 * (e.g. Stockholm format).
807 *
808 * Args: filename - name of file to open
809 * if "-", read stdin
810 * if it ends in ".gz", read from pipe to gunzip -dc
811 * format - format of file (e.g. MSAFILE_STOCKHOLM)
812 * env - environment variable for path (e.g. BLASTDB)
813 *
814 * Returns: opened MSAFILE * on success.
815 * NULL on failure:
816 * usually, because the file doesn't exist;
817 * for gzip'ed files, may also mean that gzip isn't in the path.
818 */
819 MSAFILE *
MSAFileOpen(char * filename,int format,char * env)820 MSAFileOpen(char *filename, int format, char *env)
821 {
822 MSAFILE *afp;
823
824 afp = MallocOrDie(sizeof(MSAFILE));
825 if (strcmp(filename, "-") == 0)
826 {
827 afp->f = stdin;
828 afp->do_stdin = TRUE;
829 afp->do_gzip = FALSE;
830 afp->fname = sre_strdup("[STDIN]", -1);
831 afp->ssi = NULL; /* can't index stdin because we can't seek*/
832 }
833 #ifndef SRE_STRICT_ANSI
834 /* popen(), pclose() aren't portable to non-POSIX systems; disable */
835 else if (Strparse("^.*\\.gz$", filename, 0))
836 {
837 char cmd[256];
838
839 /* Note that popen() will return "successfully"
840 * if file doesn't exist, because gzip works fine
841 * and prints an error! So we have to check for
842 * existence of file ourself.
843 */
844 if (! FileExists(filename))
845 Die("%s: file does not exist", filename);
846 if (strlen(filename) + strlen("gzip -dc ") >= 256)
847 Die("filename > 255 char in MSAFileOpen()");
848 sprintf(cmd, "gzip -dc %s", filename);
849 if ((afp->f = popen(cmd, "r")) == NULL)
850 return NULL;
851
852 afp->do_stdin = FALSE;
853 afp->do_gzip = TRUE;
854 afp->fname = sre_strdup(filename, -1);
855 /* we can't index a .gz file, because we can't seek in a pipe afaik */
856 afp->ssi = NULL;
857 }
858 #endif /*SRE_STRICT_ANSI*/
859 else
860 {
861 char *ssifile;
862 char *dir;
863
864 /* When we open a file, it may be either in the current
865 * directory, or in the directory indicated by the env
866 * argument - and we have to construct the SSI filename accordingly.
867 */
868 if ((afp->f = fopen(filename, "r")) != NULL)
869 {
870 ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
871 sprintf(ssifile, "%s.ssi", filename);
872 }
873 else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
874 {
875 char *full;
876 full = FileConcat(dir, filename);
877 ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename) + 5));
878 sprintf(ssifile, "%s.ssi", full);
879 free(dir);
880 }
881 else return NULL;
882
883 afp->do_stdin = FALSE;
884 afp->do_gzip = FALSE;
885 afp->fname = sre_strdup(filename, -1);
886 afp->ssi = NULL;
887
888 /* Open the SSI index file. If it doesn't exist, or
889 * it's corrupt, or some error happens, afp->ssi stays NULL.
890 */
891 SSIOpen(ssifile, &(afp->ssi));
892 free(ssifile);
893 }
894
895 /* Invoke autodetection if we haven't already been told what
896 * to expect.
897 */
898 if (format == MSAFILE_UNKNOWN)
899 {
900 if (afp->do_stdin == TRUE || afp->do_gzip)
901 Die("Can't autodetect alignment file format from a stdin or gzip pipe");
902 format = MSAFileFormat(afp);
903 if (format == MSAFILE_UNKNOWN)
904 Die("Can't determine format of multiple alignment file %s", afp->fname);
905 }
906
907 afp->format = format;
908 afp->linenumber = 0;
909 afp->buf = NULL;
910 afp->buflen = 0;
911
912 return afp;
913 }
914
915
916 /* Function: MSAFilePositionByKey()
917 * MSAFilePositionByIndex()
918 * MSAFileRewind()
919 *
920 * Date: SRE, Tue Nov 9 19:02:54 1999 [St. Louis]
921 *
922 * Purpose: Family of functions for repositioning in
923 * open MSA files; analogous to a similarly
924 * named function series in HMMER's hmmio.c.
925 *
926 * Args: afp - open alignment file
927 * offset - disk offset in bytes
928 * key - key to look up in SSI indices
929 * idx - index of alignment.
930 *
931 * Returns: 0 on failure.
932 * 1 on success.
933 * If called on a non-fseek()'able file (e.g. a gzip'ed
934 * or pipe'd alignment), returns 0 as a failure flag.
935 */
936 int
MSAFileRewind(MSAFILE * afp)937 MSAFileRewind(MSAFILE *afp)
938 {
939 if (afp->do_gzip || afp->do_stdin) return 0;
940 rewind(afp->f);
941 return 1;
942 }
943 int
MSAFilePositionByKey(MSAFILE * afp,char * key)944 MSAFilePositionByKey(MSAFILE *afp, char *key)
945 {
946 int fh; /* filehandle is ignored */
947 SSIOFFSET offset; /* offset of the key alignment */
948
949 if (afp->ssi == NULL) return 0;
950 if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0;
951 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
952 return 1;
953 }
954 int
MSAFilePositionByIndex(MSAFILE * afp,int idx)955 MSAFilePositionByIndex(MSAFILE *afp, int idx)
956 {
957 int fh; /* filehandled is passed but ignored */
958 SSIOFFSET offset; /* disk offset of desired alignment */
959
960 if (afp->ssi == NULL) return 0;
961 if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0;
962 if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
963 return 1;
964 }
965
966
967 /* Function: MSAFileRead()
968 * Date: SRE, Fri May 28 16:01:43 1999 [St. Louis]
969 *
970 * Purpose: Read the next msa from an open alignment file.
971 * This is a wrapper around format-specific calls.
972 *
973 * Args: afp - open alignment file
974 *
975 * Returns: next alignment, or NULL if out of alignments
976 */
977 MSA *
MSAFileRead(MSAFILE * afp)978 MSAFileRead(MSAFILE *afp)
979 {
980 MSA *msa = NULL;
981
982 switch (afp->format) {
983 case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break;
984 case MSAFILE_MSF: msa = ReadMSF(afp); break;
985 case MSAFILE_A2M: msa = ReadA2M(afp); break;
986 case MSAFILE_CLUSTAL: msa = ReadClustal(afp); break;
987 case MSAFILE_SELEX: msa = ReadSELEX(afp); break;
988 case MSAFILE_PHYLIP: msa = ReadPhylip(afp); break;
989 case MSAFILE_DUBLIN: msa = ReadDublin(afp); break; /* -> r296, FS */
990 default:
991 Die("MSAFILE corrupted: bad format index");
992 }
993 return msa;
994 }
995
996 /* Function: MSAFileClose()
997 * Date: SRE, Tue May 18 14:05:28 1999 [St. Louis]
998 *
999 * Purpose: Close an open MSAFILE.
1000 *
1001 * Args: afp - ptr to an open MSAFILE.
1002 *
1003 * Returns: void
1004 */
1005 void
MSAFileClose(MSAFILE * afp)1006 MSAFileClose(MSAFILE *afp)
1007 {
1008 #ifndef SRE_STRICT_ANSI /* gzip functionality only on POSIX systems */
1009 if (afp->do_gzip) pclose(afp->f);
1010 #endif
1011 if (! afp->do_stdin) fclose(afp->f);
1012 if (afp->buf != NULL) free(afp->buf);
1013 if (afp->ssi != NULL) SSIClose(afp->ssi);
1014 if (afp->fname != NULL) free(afp->fname);
1015 free(afp);
1016 }
1017
1018 char *
MSAFileGetLine(MSAFILE * afp)1019 MSAFileGetLine(MSAFILE *afp)
1020 {
1021 char *s;
1022 if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL)
1023 return NULL;
1024 afp->linenumber++;
1025 return afp->buf;
1026 }
1027
1028 void
MSAFileWrite(FILE * fp,MSA * msa,int outfmt,int do_oneline,int iWrap,int bResno,int iSeqType)1029 MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline, int iWrap, int bResno, int iSeqType)
1030 {
1031 switch (outfmt) {
1032 #ifdef CLUSTALO
1033 /*case MSAFILE_A2M: WriteA2M(stdout, msa, 0); break;*/
1034 /*case MSAFILE_VIENNA: WriteA2M(stdout, msa, 1); break;*/
1035 case MSAFILE_A2M: WriteA2M(stdout, msa, iWrap); break;
1036 #ifndef INT_MAX
1037 #define INT_MAX 2147483647
1038 #endif
1039 case MSAFILE_VIENNA: WriteA2M(stdout, msa, INT_MAX); break;
1040 case MSAFILE_CLUSTAL: WriteClustal(stdout, msa, iWrap, bResno, iSeqType); break;
1041 #else
1042 case MSAFILE_A2M: WriteA2M(stdout, msa); break;
1043 case MSAFILE_CLUSTAL: WriteClustal(stdout, msa); break;
1044 #endif
1045 case MSAFILE_MSF: WriteMSF(stdout, msa); break;
1046 case MSAFILE_PHYLIP: WritePhylip(stdout, msa); break;
1047 case MSAFILE_SELEX: WriteSELEX(stdout, msa); break;
1048 case MSAFILE_STOCKHOLM:
1049 if (do_oneline) WriteStockholmOneBlock(stdout, msa);
1050 else WriteStockholm(stdout, msa);
1051 break;
1052 default:
1053 Die("can't write. no such alignment format %d\n", outfmt);
1054 }
1055 }
1056
1057 /* Function: MSAGetSeqidx()
1058 * Date: SRE, Wed May 19 15:08:25 1999 [St. Louis]
1059 *
1060 * Purpose: From a sequence name, return seqidx appropriate
1061 * for an MSA structure.
1062 *
1063 * 1) try to guess the index. (pass -1 if you can't guess)
1064 * 2) Look up name in msa's hashtable.
1065 * 3) If it's a new name, store in msa's hashtable;
1066 * expand allocs as needed;
1067 * save sqname.
1068 *
1069 * Args: msa - alignment object
1070 * name - a sequence name
1071 * guess - a guess at the right index, or -1 if no guess.
1072 *
1073 * Returns: seqidx
1074 */
1075 int
MSAGetSeqidx(MSA * msa,char * name,int guess)1076 MSAGetSeqidx(MSA *msa, char *name, int guess)
1077 {
1078 int seqidx;
1079 /* can we guess? */
1080 if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0)
1081 return guess;
1082 /* else, a lookup in the index */
1083 if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0)
1084 return seqidx;
1085 /* else, it's a new name */
1086 seqidx = GKIStoreKey(msa->index, name);
1087 if (seqidx >= msa->nseqalloc) MSAExpand(msa);
1088
1089 msa->sqname[seqidx] = sre_strdup(name, -1);
1090 msa->nseq++;
1091 return seqidx;
1092 }
1093
1094
1095 /* Function: MSAFromAINFO()
1096 * Date: SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
1097 *
1098 * Purpose: Convert the old aseq/ainfo alignment structure
1099 * to new MSA structure. Enables more rapid conversion
1100 * of codebase to the new world order.
1101 *
1102 * Args: aseq - [0..nseq-1][0..alen-1] alignment
1103 * ainfo - old-style optional info
1104 *
1105 * Returns: MSA *
1106 */
1107 MSA *
MSAFromAINFO(char ** aseq,AINFO * ainfo)1108 MSAFromAINFO(char **aseq, AINFO *ainfo)
1109 {
1110 MSA *msa;
1111 int i, j;
1112
1113 msa = MSAAlloc(ainfo->nseq, ainfo->alen);
1114 for (i = 0; i < ainfo->nseq; i++)
1115 {
1116 strcpy(msa->aseq[i], aseq[i]);
1117 msa->wgt[i] = ainfo->wgt[i];
1118 msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1);
1119 msa->sqlen[i] = msa->alen;
1120 GKIStoreKey(msa->index, msa->sqname[i]);
1121
1122 if (ainfo->sqinfo[i].flags & SQINFO_ACC)
1123 MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc);
1124
1125 if (ainfo->sqinfo[i].flags & SQINFO_DESC)
1126 MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc);
1127
1128 if (ainfo->sqinfo[i].flags & SQINFO_SS) {
1129 if (msa->ss == NULL) {
1130 msa->ss = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1131 msa->sslen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1132 for (j = 0; j < msa->nseqalloc; j++) {
1133 msa->ss[j] = NULL;
1134 msa->sslen[j] = 0;
1135 }
1136 }
1137 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i]));
1138 msa->sslen[i] = msa->alen;
1139 }
1140
1141 if (ainfo->sqinfo[i].flags & SQINFO_SA) {
1142 if (msa->sa == NULL) {
1143 msa->sa = MallocOrDie(sizeof(char *) * msa->nseqalloc);
1144 msa->salen = MallocOrDie(sizeof(int) * msa->nseqalloc);
1145 for (j = 0; j < msa->nseqalloc; j++) {
1146 msa->sa[j] = NULL;
1147 msa->salen[j] = 0;
1148 }
1149 }
1150 MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i]));
1151 msa->salen[i] = msa->alen;
1152 }
1153 }
1154 /* note that sre_strdup() returns NULL when passed NULL */
1155 msa->name = sre_strdup(ainfo->name, -1);
1156 msa->desc = sre_strdup(ainfo->desc, -1);
1157 msa->acc = sre_strdup(ainfo->acc, -1);
1158 msa->au = sre_strdup(ainfo->au, -1);
1159 msa->ss_cons = sre_strdup(ainfo->cs, -1);
1160 msa->rf = sre_strdup(ainfo->rf, -1);
1161 if (ainfo->flags & AINFO_TC) {
1162 msa->cutoff[MSA_CUTOFF_TC1] = ainfo->tc1; msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
1163 msa->cutoff[MSA_CUTOFF_TC2] = ainfo->tc2; msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
1164 }
1165 if (ainfo->flags & AINFO_NC) {
1166 msa->cutoff[MSA_CUTOFF_NC1] = ainfo->nc1; msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
1167 msa->cutoff[MSA_CUTOFF_NC2] = ainfo->nc2; msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
1168 }
1169 if (ainfo->flags & AINFO_GA) {
1170 msa->cutoff[MSA_CUTOFF_GA1] = ainfo->ga1; msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
1171 msa->cutoff[MSA_CUTOFF_GA2] = ainfo->ga2; msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
1172 }
1173 msa->nseq = ainfo->nseq;
1174 msa->alen = ainfo->alen;
1175 return msa;
1176 }
1177
1178
1179
1180
1181 /* Function: MSAFileFormat()
1182 * Date: SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre]
1183 *
1184 * Purpose: (Attempt to) determine the format of an alignment file.
1185 * Since it rewinds the file pointer when it's done,
1186 * cannot be used on a pipe or gzip'ed file. Works by
1187 * calling SeqfileFormat() from sqio.c, then making sure
1188 * that the format is indeed an alignment. If the format
1189 * comes back as FASTA, it assumes that the format as A2M
1190 * (e.g. aligned FASTA).
1191 *
1192 * Args: fname - file to evaluate
1193 *
1194 * Returns: format code; e.g. MSAFILE_STOCKHOLM
1195 */
1196 int
MSAFileFormat(MSAFILE * afp)1197 MSAFileFormat(MSAFILE *afp)
1198 {
1199 int fmt;
1200
1201 fmt = SeqfileFormat(afp->f);
1202
1203 if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M;
1204
1205 if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt))
1206 Die("File %s does not appear to be an alignment file;\n\
1207 rather, it appears to be an unaligned file in %s format.\n\
1208 I'm expecting an alignment file in this context.\n",
1209 afp->fname,
1210 SeqfileFormat2String(fmt));
1211 return fmt;
1212 }
1213
1214
1215 /* Function: MSAMingap()
1216 * Date: SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court]
1217 *
1218 * Purpose: Remove all-gap columns from a multiple sequence alignment
1219 * and its associated per-residue data.
1220 *
1221 * Args: msa - the alignment
1222 *
1223 * Returns: (void)
1224 */
1225 void
MSAMingap(MSA * msa)1226 MSAMingap(MSA *msa)
1227 {
1228 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1229 int apos; /* position in original alignment */
1230 int idx; /* sequence index */
1231
1232 useme = MallocOrDie(sizeof(int) * msa->alen);
1233 for (apos = 0; apos < msa->alen; apos++)
1234 {
1235 for (idx = 0; idx < msa->nseq; idx++)
1236 if (! isgap(msa->aseq[idx][apos]))
1237 break;
1238 if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE;
1239 }
1240 MSAShorterAlignment(msa, useme);
1241 free(useme);
1242 return;
1243 }
1244
1245 /* Function: MSANogap()
1246 * Date: SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
1247 *
1248 * Purpose: Remove all columns from a multiple sequence alignment that
1249 * contain any gaps -- used for filtering before phylogenetic
1250 * analysis.
1251 *
1252 * Args: msa - the alignment
1253 *
1254 * Returns: (void). The alignment is modified, so if you want to keep
1255 * the original for something, make a copy.
1256 */
1257 void
MSANogap(MSA * msa)1258 MSANogap(MSA *msa)
1259 {
1260 int *useme; /* array of TRUE/FALSE flags for which columns to keep */
1261 int apos; /* position in original alignment */
1262 int idx; /* sequence index */
1263
1264 useme = MallocOrDie(sizeof(int) * msa->alen);
1265 for (apos = 0; apos < msa->alen; apos++)
1266 {
1267 for (idx = 0; idx < msa->nseq; idx++)
1268 if (isgap(msa->aseq[idx][apos]))
1269 break;
1270 if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE;
1271 }
1272 MSAShorterAlignment(msa, useme);
1273 free(useme);
1274 return;
1275 }
1276
1277
1278 /* Function: MSAShorterAlignment()
1279 * Date: SRE, Wed Nov 17 09:49:32 1999 [St. Louis]
1280 *
1281 * Purpose: Given an array "useme" (0..alen-1) of TRUE/FALSE flags,
1282 * where TRUE means "keep this column in the new alignment":
1283 * Remove all columns annotated as "FALSE" in the useme
1284 * array.
1285 *
1286 * Args: msa - the alignment. The alignment is changed, so
1287 * if you don't want the original screwed up, make
1288 * a copy of it first.
1289 * useme - TRUE/FALSE flags for columns to keep: 0..alen-1
1290 *
1291 * Returns: (void)
1292 */
1293 void
MSAShorterAlignment(MSA * msa,int * useme)1294 MSAShorterAlignment(MSA *msa, int *useme)
1295 {
1296 int apos; /* position in original alignment */
1297 int mpos; /* position in new alignment */
1298 int idx; /* sequence index */
1299 int i; /* markup index */
1300
1301 /* Since we're minimizing, we can overwrite, using already allocated
1302 * memory.
1303 */
1304 for (apos = 0, mpos = 0; apos < msa->alen; apos++)
1305 {
1306 if (useme[apos] == FALSE) continue;
1307
1308 /* shift alignment and associated per-column+per-residue markup */
1309 if (mpos != apos)
1310 {
1311 for (idx = 0; idx < msa->nseq; idx++)
1312 {
1313 msa->aseq[idx][mpos] = msa->aseq[idx][apos];
1314 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos];
1315 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos];
1316
1317 for (i = 0; i < msa->ngr; i++)
1318 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos];
1319 }
1320
1321 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos];
1322 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos];
1323 if (msa->rf != NULL) msa->rf[mpos] = msa->rf[apos];
1324
1325 for (i = 0; i < msa->ngc; i++)
1326 msa->gc[i][mpos] = msa->gc[i][apos];
1327 }
1328 mpos++;
1329 }
1330
1331 msa->alen = mpos; /* set new length */
1332 /* null terminate everything */
1333 for (idx = 0; idx < msa->nseq; idx++)
1334 {
1335 msa->aseq[idx][mpos] = '\0';
1336 if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0';
1337 if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0';
1338
1339 for (i = 0; i < msa->ngr; i++)
1340 if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0';
1341 }
1342
1343 if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0';
1344 if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0';
1345 if (msa->rf != NULL) msa->rf[mpos] = '\0';
1346
1347 for (i = 0; i < msa->ngc; i++)
1348 msa->gc[i][mpos] = '\0';
1349
1350 return;
1351 }
1352
1353
1354 /* Function: MSASmallerAlignment()
1355 * Date: SRE, Wed Jun 30 09:56:08 1999 [St. Louis]
1356 *
1357 * Purpose: Given an array "useme" of TRUE/FALSE flags for
1358 * each sequence in an alignment, construct
1359 * and return a new alignment containing only
1360 * those sequences that are flagged useme=TRUE.
1361 *
1362 * Used by routines such as MSAFilterAlignment()
1363 * and MSASampleAlignment().
1364 *
1365 * Limitations:
1366 * Does not copy unparsed Stockholm markup.
1367 *
1368 * Does not make assumptions about meaning of wgt;
1369 * if you want the new wgt vector renormalized, do
1370 * it yourself with FNorm(new->wgt, new->nseq).
1371 *
1372 * Args: msa -- the original (larger) alignment
1373 * useme -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include
1374 * this seq in new alignment
1375 * ret_new -- RETURN: new alignment
1376 *
1377 * Returns: void
1378 * ret_new is allocated here; free with MSAFree()
1379 */
1380 void
MSASmallerAlignment(MSA * msa,int * useme,MSA ** ret_new)1381 MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new)
1382 {
1383 MSA *new; /* RETURN: new alignment */
1384 int nnew; /* number of seqs in new msa (e.g. # of TRUEs) */
1385 int oidx, nidx; /* old, new indices */
1386 int i;
1387
1388 nnew = 0;
1389 for (oidx = 0; oidx < msa->nseq; oidx++)
1390 if (useme[oidx]) nnew++;
1391 if (nnew == 0) { *ret_new = NULL; return; }
1392
1393 new = MSAAlloc(nnew, 0);
1394 nidx = 0;
1395 for (oidx = 0; oidx < msa->nseq; oidx++)
1396 if (useme[oidx])
1397 {
1398 new->aseq[nidx] = sre_strdup(msa->aseq[oidx], msa->alen);
1399 new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen);
1400 GKIStoreKey(new->index, msa->sqname[oidx]);
1401 new->wgt[nidx] = msa->wgt[oidx];
1402 if (msa->sqacc != NULL)
1403 MSASetSeqAccession(new, nidx, msa->sqacc[oidx]);
1404 if (msa->sqdesc != NULL)
1405 MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]);
1406 if (msa->ss != NULL && msa->ss[oidx] != NULL)
1407 {
1408 if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq);
1409 new->ss[nidx] = sre_strdup(msa->ss[oidx], -1);
1410 }
1411 if (msa->sa != NULL && msa->sa[oidx] != NULL)
1412 {
1413 if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq);
1414 new->sa[nidx] = sre_strdup(msa->sa[oidx], -1);
1415 }
1416 nidx++;
1417 }
1418
1419 new->nseq = nnew;
1420 new->alen = msa->alen;
1421 new->flags = msa->flags;
1422 new->type = msa->type;
1423 new->name = sre_strdup(msa->name, -1);
1424 new->desc = sre_strdup(msa->desc, -1);
1425 new->acc = sre_strdup(msa->acc, -1);
1426 new->au = sre_strdup(msa->au, -1);
1427 new->ss_cons = sre_strdup(msa->ss_cons, -1);
1428 new->sa_cons = sre_strdup(msa->sa_cons, -1);
1429 new->rf = sre_strdup(msa->rf, -1);
1430 for (i = 0; i < MSA_MAXCUTOFFS; i++) {
1431 new->cutoff[i] = msa->cutoff[i];
1432 new->cutoff_is_set[i] = msa->cutoff_is_set[i];
1433 }
1434 free(new->sqlen);
1435
1436 MSAMingap(new);
1437 *ret_new = new;
1438 return;
1439 }
1440
1441
1442 /*****************************************************************
1443 * Retrieval routines
1444 *
1445 * Access to MSA structure data is possible through these routines.
1446 * I'm not doing this because of object oriented design, though
1447 * it might work in my favor someday.
1448 * I'm doing this because lots of MSA data is optional, and
1449 * checking through the chain of possible NULLs is a pain.
1450 *****************************************************************/
1451
1452 char *
MSAGetSeqAccession(MSA * msa,int idx)1453 MSAGetSeqAccession(MSA *msa, int idx)
1454 {
1455 if (msa->sqacc != NULL && msa->sqacc[idx] != NULL)
1456 return msa->sqacc[idx];
1457 else
1458 return NULL;
1459 }
1460 char *
MSAGetSeqDescription(MSA * msa,int idx)1461 MSAGetSeqDescription(MSA *msa, int idx)
1462 {
1463 if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL)
1464 return msa->sqdesc[idx];
1465 else
1466 return NULL;
1467 }
1468 char *
MSAGetSeqSS(MSA * msa,int idx)1469 MSAGetSeqSS(MSA *msa, int idx)
1470 {
1471 if (msa->ss != NULL && msa->ss[idx] != NULL)
1472 return msa->ss[idx];
1473 else
1474 return NULL;
1475 }
1476 char *
MSAGetSeqSA(MSA * msa,int idx)1477 MSAGetSeqSA(MSA *msa, int idx)
1478 {
1479 if (msa->sa != NULL && msa->sa[idx] != NULL)
1480 return msa->sa[idx];
1481 else
1482 return NULL;
1483 }
1484
1485
1486 /*****************************************************************
1487 * Information routines
1488 *
1489 * Access information about the MSA.
1490 *****************************************************************/
1491
1492 /* Function: MSAAverageSequenceLength()
1493 * Date: SRE, Sat Apr 6 09:41:34 2002 [St. Louis]
1494 *
1495 * Purpose: Return the average length of the (unaligned) sequences
1496 * in the MSA.
1497 *
1498 * Args: msa - the alignment
1499 *
1500 * Returns: average length
1501 */
1502 float
MSAAverageSequenceLength(MSA * msa)1503 MSAAverageSequenceLength(MSA *msa)
1504 {
1505 int i;
1506 float avg;
1507
1508 avg = 0.;
1509 for (i = 0; i < msa->nseq; i++)
1510 avg += (float) DealignedLength(msa->aseq[i]);
1511
1512 if (msa->nseq == 0) return 0.;
1513 else return (avg / msa->nseq);
1514 }
1515
1516
1517