1 /*****************************************************************
2 * SQUID - a library of functions for biological sequence analysis
3 * Copyright (C) 1992-2002 Washington University School of Medicine
4 *
5 * This source code is freely distributed under the terms of the
6 * GNU General Public License. See the files COPYRIGHT and LICENSE
7 * for details.
8 *****************************************************************/
9
10 /* File: sqio.c
11 * From: ureadseq.c in Don Gilbert's sequence i/o package
12 *
13 * Reads and writes nucleic/protein sequence in various
14 * formats. Data files may have multiple sequences.
15 *
16 * Heavily modified from READSEQ package
17 * Copyright (C) 1990 by D.G. Gilbert
18 * Biology Dept., Indiana University, Bloomington, IN 47405
19 * email: gilbertd@bio.indiana.edu
20 * Thanks Don!
21 *
22 * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992
23 * Packaged for squid, Thu Oct 1 10:07:11 1992
24 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
25 *
26 * CVS $Id: sqio.c,v 1.29 2002/08/26 23:10:52 eddy Exp)
27 *
28 *****************************************************************
29 * Basic API for single sequence reading:
30 *
31 * SQFILE *sqfp;
32 * char *seqfile;
33 * int format; - see squid.h for formats; example: SQFILE_FASTA
34 * char *seq;
35 * SQINFO sqinfo;
36 *
37 * if ((sqfp = SeqfileOpen(seqfile, format, "BLASTDB")) == NULL)
38 * Die("Failed to open sequence database file %s\n%s\n", seqfile, usage);
39 * while (ReadSeq(sqfp, sqfp->format, &seq, &sqinfo)) {
40 * do_stuff;
41 * FreeSequence(seq, &sqinfo);
42 * }
43 * SeqfileClose(sqfp);
44 *
45 *****************************************************************
46 */
47
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <ctype.h>
52
53 #ifndef SEEK_SET
54 #include <unistd.h>
55 #endif
56
57 #include "squid.h"
58 #include "msa.h"
59 #include "ssi.h"
60
61 static void SeqfileGetLine(SQFILE *V);
62
63 #define kStartLength 500
64
65 static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
66 static char *primenuc = "ACGTUN";
67 static char *protonly = "EFIPQZ";
68
69 static SQFILE *seqfile_open(char *filename, int format, char *env, int ssimode);
70
71 /* Function: SeqfileOpen()
72 *
73 * Purpose : Open a sequence database file and prepare for reading
74 * sequentially.
75 *
76 * Args: filename - name of file to open
77 * format - format of file
78 * env - environment variable for path (e.g. BLASTDB)
79 * ssimode - -1, SSI_OFFSET_I32, or SSI_OFFSET_I64
80 *
81 * Returns opened SQFILE ptr, or NULL on failure.
82 */
83 SQFILE *
SeqfileOpen(char * filename,int format,char * env)84 SeqfileOpen(char *filename, int format, char *env)
85 {
86 return seqfile_open(filename, format, env, -1);
87 }
88 SQFILE *
SeqfileOpenForIndexing(char * filename,int format,char * env,int ssimode)89 SeqfileOpenForIndexing(char *filename, int format, char *env, int ssimode)
90 {
91 return seqfile_open(filename, format, env, ssimode);
92 }
93 static SQFILE *
seqfile_open(char * filename,int format,char * env,int ssimode)94 seqfile_open(char *filename, int format, char *env, int ssimode)
95 {
96 SQFILE *dbfp;
97
98 dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
99
100 dbfp->ssimode = ssimode;
101 dbfp->rpl = -1; /* flag meaning "unset" */
102 dbfp->lastrpl = 0;
103 dbfp->maxrpl = 0;
104 dbfp->bpl = -1; /* flag meaning "unset" */
105 dbfp->lastbpl = 0;
106 dbfp->maxbpl = 0;
107
108 /* Open our file handle.
109 * Three possibilities:
110 * 1. normal file open
111 * 2. filename = "-"; read from stdin
112 * 3. filename = "*.gz"; read thru pipe from gzip
113 * If we're reading from stdin or a pipe, we can't reliably
114 * back up, so we can't do two-pass parsers like the interleaved alignment
115 * formats.
116 */
117 if (strcmp(filename, "-") == 0)
118 {
119 dbfp->f = stdin;
120 dbfp->do_stdin = TRUE;
121 dbfp->do_gzip = FALSE;
122 dbfp->fname = sre_strdup("[STDIN]", -1);
123 }
124 #ifndef SRE_STRICT_ANSI
125 /* popen(), pclose() aren't portable to non-POSIX systems; disable */
126 else if (Strparse("^.*\\.gz$", filename, 0))
127 {
128 char cmd[256];
129
130 /* Note that popen() will return "successfully"
131 * if file doesn't exist, because gzip works fine
132 * and prints an error! So we have to check for
133 * existence of file ourself.
134 */
135 if (! FileExists(filename))
136 Die("%s: file does not exist", filename);
137
138 if (strlen(filename) + strlen("gzip -dc ") >= 256)
139 Die("filename > 255 char in SeqfileOpen()");
140 sprintf(cmd, "gzip -dc %s", filename);
141 if ((dbfp->f = popen(cmd, "r")) == NULL)
142 return NULL;
143
144 dbfp->do_stdin = FALSE;
145 dbfp->do_gzip = TRUE;
146 dbfp->fname = sre_strdup(filename, -1);
147 }
148 #endif /*SRE_STRICT_ANSI*/
149 else
150 {
151 if ((dbfp->f = fopen(filename, "r")) == NULL &&
152 (dbfp->f = EnvFileOpen(filename, env, NULL)) == NULL)
153 return NULL;
154
155 dbfp->do_stdin = FALSE;
156 dbfp->do_gzip = FALSE;
157 dbfp->fname = sre_strdup(filename, -1);
158 }
159
160
161 /* Invoke autodetection if we haven't already been told what
162 * to expect.
163 */
164 if (format == SQFILE_UNKNOWN)
165 {
166 if (dbfp->do_stdin == TRUE || dbfp->do_gzip)
167 Die("Can't autodetect sequence file format from a stdin or gzip pipe");
168 format = SeqfileFormat(dbfp->f);
169 if (format == SQFILE_UNKNOWN)
170 Die("Can't determine format of sequence file %s", dbfp->fname);
171 }
172
173 /* The hack for sequential access of an interleaved alignment file:
174 * read the alignment in, we'll copy sequences out one at a time.
175 */
176 dbfp->msa = NULL;
177 dbfp->afp = NULL;
178 dbfp->format = format;
179 dbfp->linenumber = 0;
180 dbfp->buf = NULL;
181 dbfp->buflen = 0;
182 if (IsAlignmentFormat(format))
183 {
184 /* We'll be reading from the MSA interface. Copy our data
185 * to the MSA afp's structure.
186 */
187 dbfp->afp = MallocOrDie(sizeof(MSAFILE));
188 dbfp->afp->f = dbfp->f; /* just a ptr, don't close */
189 dbfp->afp->do_stdin = dbfp->do_stdin;
190 dbfp->afp->do_gzip = dbfp->do_gzip;
191 dbfp->afp->fname = dbfp->fname; /* just a ptr, don't free */
192 dbfp->afp->format = dbfp->format; /* e.g. format */
193 dbfp->afp->linenumber = dbfp->linenumber; /* e.g. 0 */
194 dbfp->afp->buf = NULL;
195 dbfp->afp->buflen = 0;
196
197 if ((dbfp->msa = MSAFileRead(dbfp->afp)) == NULL)
198 Die("Failed to read any alignment data from file %s", dbfp->fname);
199 /* hack: overload/reuse msa->lastidx; indicates
200 next seq to return upon a ReadSeq() call */
201 dbfp->msa->lastidx = 0;
202
203 return dbfp;
204 }
205
206 /* Load the first line.
207 */
208 SeqfileGetLine(dbfp);
209 return dbfp;
210 }
211
212 /* Function: SeqfilePosition()
213 *
214 * Purpose: Move to a particular offset in a seqfile.
215 * Will not work on alignment files.
216 */
217 void
SeqfilePosition(SQFILE * sqfp,SSIOFFSET * offset)218 SeqfilePosition(SQFILE *sqfp, SSIOFFSET *offset)
219 {
220 if (sqfp->do_stdin || sqfp->do_gzip || IsAlignmentFormat(sqfp->format))
221 Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
222
223 if (SSISetFilePosition(sqfp->f, offset) != 0)
224 Die("SSISetFilePosition failed, but that shouldn't happen.");
225 SeqfileGetLine(sqfp);
226 }
227
228
229 /* Function: SeqfileRewind()
230 *
231 * Purpose: Set a sequence file back to the first sequence.
232 *
233 * Won't work on alignment files. Although it would
234 * seem that it could (just set msa->lastidx back to 0),
235 * that'll fail on "multiple multiple" alignment file formats
236 * (e.g. Stockholm).
237 */
238 void
SeqfileRewind(SQFILE * sqfp)239 SeqfileRewind(SQFILE *sqfp)
240 {
241 if (sqfp->do_stdin || sqfp->do_gzip)
242 Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
243
244 rewind(sqfp->f);
245 SeqfileGetLine(sqfp);
246 }
247
248 /* Function: SeqfileLineParameters()
249 * Date: SRE, Thu Feb 15 17:00:41 2001 [St. Louis]
250 *
251 * Purpose: After all the sequences have been read from the file,
252 * but before closing it, retrieve overall bytes-per-line and
253 * residues-per-line info. If non-zero, these mean that
254 * the file contains homogeneous sequence line lengths (except
255 * the last line in each record).
256 *
257 * If either of bpl or rpl is determined to be inhomogeneous,
258 * both are returned as 0.
259 *
260 * Args: *sqfp - an open but fully read sequence file
261 * ret_bpl - RETURN: bytes per line, or 0 if inhomogeneous
262 * ret_rpl - RETURN: residues per line, or 0 if inhomogenous.
263 *
264 * Returns: void
265 */
266 void
SeqfileLineParameters(SQFILE * V,int * ret_bpl,int * ret_rpl)267 SeqfileLineParameters(SQFILE *V, int *ret_bpl, int *ret_rpl)
268 {
269 if (V->rpl > 0 && V->maxrpl == V->rpl &&
270 V->bpl > 0 && V->maxbpl == V->bpl) {
271 *ret_bpl = V->bpl;
272 *ret_rpl = V->rpl;
273 } else {
274 *ret_bpl = 0;
275 *ret_rpl = 0;
276 }
277 }
278
279
280 void
SeqfileClose(SQFILE * sqfp)281 SeqfileClose(SQFILE *sqfp)
282 {
283 /* note: don't test for sqfp->msa being NULL. Now that
284 * we're holding afp open and allowing access to multi-MSA
285 * databases (e.g. Stockholm format, Pfam), msa ends
286 * up being NULL when we run out of alignments.
287 */
288 if (sqfp->afp != NULL) {
289 if (sqfp->msa != NULL) MSAFree(sqfp->msa);
290 if (sqfp->afp->buf != NULL) free(sqfp->afp->buf);
291 free(sqfp->afp);
292 }
293 #ifndef SRE_STRICT_ANSI /* gunzip functionality only on POSIX systems */
294 if (sqfp->do_gzip) pclose(sqfp->f);
295 #endif
296 else if (! sqfp->do_stdin) fclose(sqfp->f);
297 if (sqfp->buf != NULL) free(sqfp->buf);
298 if (sqfp->fname != NULL) free(sqfp->fname);
299 free(sqfp);
300 }
301
302
303 /* Function: SeqfileGetLine()
304 * Date: SRE, Tue Jun 22 09:15:49 1999 [Sanger Centre]
305 *
306 * Purpose: read a line from a sequence file into V->buf
307 * If the fgets() is NULL, sets V->buf[0] to '\0'.
308 *
309 * Args: V
310 *
311 * Returns: void
312 */
313 static void
SeqfileGetLine(SQFILE * V)314 SeqfileGetLine(SQFILE *V)
315 {
316 if (V->ssimode >= 0)
317 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->ssioffset)))
318 Die("SSIGetFilePosition() failed");
319 if (sre_fgets(&(V->buf), &(V->buflen), V->f) == NULL)
320 *(V->buf) = '\0';
321 V->linenumber++;
322 }
323
324
325 void
FreeSequence(char * seq,SQINFO * sqinfo)326 FreeSequence(char *seq, SQINFO *sqinfo)
327 {
328 if (seq != NULL) free(seq); /* FS, r244, here is potential problem in profile/profile */
329 if (sqinfo->flags & SQINFO_SS){
330 if (NULL != sqinfo->ss){ /* FS, r244 -> r245 */
331 free(sqinfo->ss);
332 }
333 }
334 if (sqinfo->flags & SQINFO_SA){
335 if (NULL != sqinfo->sa){ /* FS, r244 -> r245 */
336 free(sqinfo->sa);
337 }
338 }
339 if (sqinfo->flags & SQINFO_CO){
340 if (NULL != sqinfo->co){ /* FS, r296 -> */
341 free(sqinfo->co);
342 }
343 }
344 } /* this is the end of FreeSequence() */
345
346 int
SetSeqinfoString(SQINFO * sqinfo,char * sptr,int flag)347 SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
348 {
349 int len;
350 int pos;
351
352 /* silently ignore NULL. */
353 if (sptr == NULL) return 1;
354
355 while (*sptr == ' ') sptr++; /* ignore leading whitespace */
356 for (pos = strlen(sptr)-1; pos >= 0; pos--)
357 if (! isspace((int) sptr[pos])) break;
358 sptr[pos+1] = '\0'; /* ignore trailing whitespace */
359
360 switch (flag) {
361 case SQINFO_NAME:
362 if (*sptr != '-')
363 {
364 strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
365 sqinfo->name[SQINFO_NAMELEN-1] = '\0';
366 sqinfo->flags |= SQINFO_NAME;
367 }
368 break;
369
370 case SQINFO_ID:
371 if (*sptr != '-')
372 {
373 strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
374 sqinfo->id[SQINFO_NAMELEN-1] = '\0';
375 sqinfo->flags |= SQINFO_ID;
376 }
377 break;
378
379 case SQINFO_ACC:
380 if (*sptr != '-')
381 {
382 strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
383 sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
384 sqinfo->flags |= SQINFO_ACC;
385 }
386 break;
387
388 case SQINFO_DESC:
389 if (*sptr != '-')
390 {
391 if (sqinfo->flags & SQINFO_DESC) /* append? */
392 {
393 len = strlen(sqinfo->desc);
394 if (len < SQINFO_DESCLEN-2) /* is there room? */
395 {
396 strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
397 strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
398 }
399 }
400 else /* else copy */
401 strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
402 sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
403 sqinfo->flags |= SQINFO_DESC;
404 }
405 break;
406
407 case SQINFO_START:
408 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
409 sqinfo->start = atoi(sptr);
410 if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
411 break;
412
413 case SQINFO_STOP:
414 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
415 sqinfo->stop = atoi(sptr);
416 if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
417 break;
418
419 case SQINFO_OLEN:
420 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
421 sqinfo->olen = atoi(sptr);
422 if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
423 break;
424
425 default:
426 Die("Invalid flag %d to SetSeqinfoString()", flag);
427 }
428 return 1;
429 }
430
431 void
SeqinfoCopy(SQINFO * sq1,SQINFO * sq2)432 SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
433 {
434 sq1->flags = sq2->flags;
435 if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name);
436 if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id);
437 if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc);
438 if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc);
439 if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len;
440 if (sq2->flags & SQINFO_START) sq1->start = sq2->start;
441 if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop;
442 if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen;
443 if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type;
444 if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss);
445 if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);
446 if (sq2->flags & SQINFO_CO) sq1->co = Strdup(sq2->co);
447 }
448
449 /* Function: ToDNA()
450 *
451 * Purpose: Convert a sequence to DNA.
452 * U --> T
453 */
454 void
ToDNA(char * seq)455 ToDNA(char *seq)
456 {
457 for (; *seq != '\0'; seq++)
458 {
459 if (*seq == 'U') *seq = 'T';
460 else if (*seq == 'u') *seq = 't';
461 }
462 }
463
464 /* Function: ToRNA()
465 *
466 * Purpose: Convert a sequence to RNA.
467 * T --> U
468 */
469 void
ToRNA(char * seq)470 ToRNA(char *seq)
471 {
472 for (; *seq != '\0'; seq++)
473 {
474 if (*seq == 'T') *seq = 'U';
475 else if (*seq == 't') *seq = 'u';
476 }
477 }
478
479
480 /* Function: ToIUPAC()
481 *
482 * Purpose: Convert X's, o's, other junk in a nucleic acid sequence to N's,
483 * to comply with IUPAC code. If is_aseq is TRUE, will allow gap
484 * characters though, so we can call ToIUPAC() on aligned seqs.
485 *
486 * NUCLEOTIDES is defined in squid.h as:
487 * "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
488 * gap chars allowed by isgap() are defined in squid.h as:
489 * " ._-~"
490 *
491 * WU-BLAST's pressdb will
492 * choke on X's, for instance, necessitating conversion
493 * of certain genome centers' data.
494 */
495 void
ToIUPAC(char * seq,int is_aseq)496 ToIUPAC(char *seq, int is_aseq)
497 {
498 if (is_aseq) {
499 for (; *seq != '\0'; seq++)
500 if (strchr(NUCLEOTIDES, *seq) == NULL && ! isgap(*seq)) *seq = 'N';
501 } else {
502 for (; *seq != '\0'; seq++)
503 if (strchr(NUCLEOTIDES, *seq) == NULL) *seq = 'N';
504 }
505 }
506
507
508 /* Function: addseq()
509 *
510 * Purpose: Add a line of sequence to the growing string in V.
511 *
512 * In the seven supported unaligned formats, all sequence
513 * lines may contain whitespace that must be filtered out;
514 * four formats (PIR, EMBL, Genbank, GCG) include coordinates
515 * that must be filtered out. Thus an (!isdigit && !isspace)
516 * test on each character before we accept it.
517 */
518 static void
addseq(char * s,struct ReadSeqVars * V)519 addseq(char *s, struct ReadSeqVars *V)
520 {
521 char *s0;
522 char *sq;
523 int rpl; /* valid residues per line */
524 int bpl; /* characters per line */
525
526 if (V->ssimode == -1)
527 { /* Normal mode: keeping the seq */
528 /* Make sure we have enough room. We know that s is <= buflen,
529 * so just make sure we've got room for a whole new buflen worth
530 * of sequence.
531 */
532 if (V->seqlen + V->buflen > V->maxseq) {
533 V->maxseq += MAX(V->buflen, kStartLength);
534 V->seq = ReallocOrDie (V->seq, V->maxseq+1);
535 }
536
537 sq = V->seq + V->seqlen;
538 while (*s != 0) {
539 #ifdef CLUSTALO
540 if (! isdigit((int) *s) && ! isspace((int) *s) && isprint((int) *s)) {
541 #else
542 if (! isdigit((int) *s) && ! isspace((int) *s)) {
543 #endif
544 *sq = *s;
545 sq++;
546 }
547 s++;
548 }
549 V->seqlen = sq - V->seq;
550 }
551 else /* else: indexing mode, discard the seq */
552 {
553 s0 = s;
554 rpl = 0;
555 while (*s != 0) {
556 if (! isdigit((int) *s) && ! isspace((int) *s)) {
557 rpl++;
558 }
559 s++;
560 }
561 V->seqlen += rpl;
562 bpl = s - s0;
563
564 /* Keep track of the global rpl, bpl for the file.
565 * This is overly complicated because we have to
566 * allow the last line of each record (e.g. the last addseq() call
567 * on each sequence) to have a different length - and sometimes
568 * we'll have one-line sequence records, too. Thus we only
569 * do something with the global V->rpl when we have *passed over*
570 * a line - we keep the last line's rpl in last_rpl. And because
571 * a file might consist entirely of single-line records, we keep
572 * a third guy, maxrpl, that tells us the maximum rpl of any line
573 * in the file. If we reach the end of file and rpl is still unset,
574 * we'll set it to maxrpl. If we reach eof and rpl is set, but is
575 * less than maxrpl, that's a weird case where a last line in some
576 * record is longer than every other line.
577 */
578 if (V->rpl != 0) { /* 0 means we already know rpl is invalid */
579 if (V->lastrpl > 0) { /* we're on something that's not the first line */
580 if (V->rpl > 0 && V->lastrpl != V->rpl) V->rpl = 0;
581 else if (V->rpl == -1) V->rpl = V->lastrpl;
582 }
583 V->lastrpl = rpl;
584 if (rpl > V->maxrpl) V->maxrpl = rpl; /* make sure we check max length of final lines */
585 }
586 if (V->bpl != 0) { /* 0 means we already know bpl is invalid */
587 if (V->lastbpl > 0) { /* we're on something that's not the first line */
588 if (V->bpl > 0 && V->lastbpl != V->bpl) V->bpl = 0;
589 else if (V->bpl == -1) V->bpl = V->lastbpl;
590 }
591 V->lastbpl = bpl;
592 if (bpl > V->maxbpl) V->maxbpl = bpl; /* make sure we check max length of final lines */
593 }
594 } /* end of indexing mode of addseq(). */
595
596 }
597
598 static void
599 readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
600 {
601 int addend = 0;
602 int done = 0;
603
604 V->seqlen = 0;
605 V->lastrpl = V->lastbpl = 0;
606 if (addfirst) {
607 if (V->ssimode >= 0) V->d_off = V->ssioffset;
608 addseq(V->buf, V);
609 } else if (V->ssimode >= 0)
610 if (0 != SSIGetFilePosition(V->f, V->ssimode, &(V->d_off)))
611 Die("SSIGetFilePosition() failed");
612
613 do {
614 SeqfileGetLine(V);
615 /* feof() alone is a bug; files not necessarily \n terminated */
616 if (*(V->buf) == '\0' && feof(V->f))
617 done = TRUE;
618 done |= (*endTest)(V->buf, &addend);
619 if (addend || !done)
620 addseq(V->buf, V);
621 } while (!done);
622 }
623
624
625 static int
626 endPIR(char *s, int *addend)
627 {
628 *addend = 0;
629 if ((strncmp(s, "///", 3) == 0) ||
630 (strncmp(s, "ENTRY", 5) == 0))
631 return 1;
632 else
633 return 0;
634 }
635
636 static void
637 readPIR(struct ReadSeqVars *V)
638 {
639 char *sptr;
640 /* load first line of entry */
641 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
642 SeqfileGetLine(V);
643 }
644 if (feof(V->f)) return;
645 if (V->ssimode >= 0) V->r_off = V->ssioffset;
646
647 if ((sptr = strtok(V->buf + 15, "\n\t ")) != NULL)
648 {
649 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
650 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
651 }
652 do {
653 SeqfileGetLine(V);
654 if (!feof(V->f) && strncmp(V->buf, "TITLE", 5) == 0)
655 SetSeqinfoString(V->sqinfo, V->buf+15, SQINFO_DESC);
656 else if (!feof(V->f) && strncmp(V->buf, "ACCESSION", 9) == 0)
657 {
658 if ((sptr = strtok(V->buf+15, " \t\n")) != NULL)
659 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
660 }
661 } while (! feof(V->f) && (strncmp(V->buf,"SEQUENCE", 8) != 0));
662 SeqfileGetLine(V); /* skip next line, coords */
663
664 readLoop(0, endPIR, V);
665
666 /* reading a real PIR-CODATA database file, we keep the source coords
667 */
668 V->sqinfo->start = 1;
669 V->sqinfo->stop = V->seqlen;
670 V->sqinfo->olen = V->seqlen;
671 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
672
673 /* get next line
674 */
675 while (!feof(V->f) && strncmp(V->buf, "ENTRY", 5) != 0) {
676 SeqfileGetLine(V);
677 }
678 }
679
680
681
682 static int
683 endIG(char *s, int *addend)
684 {
685 *addend = 1; /* 1 or 2 occur in line w/ bases */
686 return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
687 }
688
689 static void
690 readIG(struct ReadSeqVars *V)
691 {
692 char *nm;
693 /* position past ';' comments */
694 do {
695 SeqfileGetLine(V);
696 } while (! (feof(V->f) || ((*V->buf != 0) && (*V->buf != ';')) ));
697
698 if (!feof(V->f))
699 {
700 if ((nm = strtok(V->buf, "\n\t ")) != NULL)
701 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
702
703 readLoop(0, endIG, V);
704 }
705
706 while (!(feof(V->f) || ((*V->buf != '\0') && (*V->buf == ';'))))
707 SeqfileGetLine(V);
708 }
709
710 static int
711 endStrider(char *s, int *addend)
712 {
713 *addend = 0;
714 return (strstr( s, "//") != NULL);
715 }
716
717 static void
718 readStrider(struct ReadSeqVars *V)
719 {
720 char *nm;
721
722 while ((!feof(V->f)) && (*V->buf == ';'))
723 {
724 if (strncmp(V->buf,"; DNA sequence", 14) == 0)
725 {
726 if ((nm = strtok(V->buf+16, ",\n\t ")) != NULL)
727 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
728 }
729 SeqfileGetLine(V);
730 }
731
732 if (! feof(V->f))
733 readLoop(1, endStrider, V);
734
735 /* load next line
736 */
737 while ((!feof(V->f)) && (*V->buf != ';'))
738 SeqfileGetLine(V);
739 }
740
741
742 static int
743 endGB(char *s, int *addend)
744 {
745 *addend = 0;
746 return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
747 }
748
749 static void
750 readGenBank(struct ReadSeqVars *V)
751 {
752 char *sptr;
753 int in_definition;
754
755 /* We'll map three genbank identifiers onto names:
756 * LOCUS -> sqinfo.name
757 * ACCESSION -> sqinfo.acc [primary accession only]
758 * VERSION -> sqinfo.id
759 * We don't currently store the GI number, or secondary accessions.
760 */
761 while (strncmp(V->buf, "LOCUS", 5) != 0) {
762 SeqfileGetLine(V);
763 }
764 if (V->ssimode >= 0) V->r_off = V->ssioffset;
765
766 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
767 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
768
769 in_definition = FALSE;
770 while (! feof(V->f))
771 {
772 SeqfileGetLine(V);
773 if (! feof(V->f) && strstr(V->buf, "DEFINITION") == V->buf)
774 {
775 if ((sptr = strtok(V->buf+12, "\n")) != NULL)
776 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
777 in_definition = TRUE;
778 }
779 else if (! feof(V->f) && strstr(V->buf, "ACCESSION") == V->buf)
780 {
781 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
782 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
783 in_definition = FALSE;
784 }
785 else if (! feof(V->f) && strstr(V->buf, "VERSION") == V->buf)
786 {
787 if ((sptr = strtok(V->buf+12, "\n\t ")) != NULL)
788 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
789 in_definition = FALSE;
790 }
791 else if (strncmp(V->buf,"ORIGIN", 6) != 0)
792 {
793 if (in_definition)
794 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
795 }
796 else
797 break;
798 }
799
800 readLoop(0, endGB, V);
801
802 /* reading a real GenBank database file, we keep the source coords
803 */
804 V->sqinfo->start = 1;
805 V->sqinfo->stop = V->seqlen;
806 V->sqinfo->olen = V->seqlen;
807 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
808
809
810 while (!(feof(V->f) || ((*V->buf!=0) && (strstr(V->buf,"LOCUS") == V->buf))))
811 SeqfileGetLine(V);
812 /* SRE: V->s now holds "//", so sequential
813 reads are wedged: fixed Tue Jul 13 1993 */
814 while (!feof(V->f) && strstr(V->buf, "LOCUS ") != V->buf)
815 SeqfileGetLine(V);
816 }
817
818 static int
819 endGCGdata(char *s, int *addend)
820 {
821 *addend = 0;
822 return (*s == '>');
823 }
824
825 static void
826 readGCGdata(struct ReadSeqVars *V)
827 {
828 int binary = FALSE; /* whether data are binary or not */
829 int blen = 0; /* length of binary sequence */
830
831 /* first line contains ">>>>" followed by name */
832 if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->buf, 2))
833 {
834 binary = TRUE;
835 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
836 blen = atoi(sqd_parse[2]);
837 }
838 else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->buf, 1))
839 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
840 else
841 Die("bogus GCGdata format? %s", V->buf);
842
843 /* second line contains free text description */
844 SeqfileGetLine(V);
845 SetSeqinfoString(V->sqinfo, V->buf, SQINFO_DESC);
846
847 if (binary) {
848 /* allocate for blen characters +3... (allow for 3 bytes of slop) */
849 if (blen >= V->maxseq) {
850 V->maxseq = blen;
851 if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
852 Die("malloc failed");
853 }
854 /* read (blen+3)/4 bytes from file */
855 if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
856 Die("fread failed");
857 V->seqlen = blen;
858 /* convert binary code to seq */
859 GCGBinaryToSequence(V->seq, blen);
860 }
861 else readLoop(0, endGCGdata, V);
862
863 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>'))))
864 SeqfileGetLine(V);
865 }
866
867 static int
868 endPearson(char *s, int *addend)
869 {
870 *addend = 0;
871 return(*s == '>');
872 }
873
874 #ifdef CLUSTALO
875 static int
876 only_whitespace(const char *s) {
877 while (*s != '\0') {
878 if (!isspace((unsigned char)*s))
879 return 0;
880 s++;
881 }
882 return 1;
883 }
884 #endif
885
886 static void
887 readPearson(struct ReadSeqVars *V)
888 {
889 char *sptr;
890
891 if (V->ssimode >= 0) V->r_off = V->ssioffset;
892
893 #ifdef CLUSTALO
894 while (only_whitespace(V->buf)) {
895 SeqfileGetLine(V);
896 }
897
898 if (feof(V->f) || *V->buf != '>')
899 #else
900 if (*V->buf != '>')
901 #endif
902 #ifdef CLUSTALO
903 Die("\
904 File %s does not appear to be in FASTA format at line %d.\n\
905 You may want to specify the file format on the command line.\n\
906 Usually this is done with an option --infmt <fmt>.\n",
907 V->fname, V->linenumber);
908 #else
909 Die("\
910 File %s does not appear to be in FASTA format at line %d.\n\
911 You may want to specify the file format on the command line.\n\
912 Usually this is done with an option --informat <fmt>.\n",
913 V->fname, V->linenumber);
914 #endif
915 if ((sptr = strtok(V->buf+1, "\n\t ")) != NULL)
916 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
917 if ((sptr = strtok(NULL, "\n")) != NULL)
918 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
919
920 readLoop(0, endPearson, V);
921
922 while (!(feof(V->f) || ((*V->buf != 0) && (*V->buf == '>')))) {
923 SeqfileGetLine(V);
924 }
925 }
926
927
928 static int
929 endEMBL(char *s, int *addend)
930 {
931 *addend = 0;
932 /* Some people (Berlin 5S rRNA database, f'r instance) use
933 * an extended EMBL format that attaches extra data after
934 * the sequence -- watch out for that. We use the fact that
935 * real EMBL sequence lines begin with five spaces.
936 *
937 * We can use this as the sole end test because readEMBL() will
938 * advance to the next ID line before starting to read again.
939 */
940 return (strncmp(s," ",5) != 0);
941 /* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */
942 }
943
944 static void
945 readEMBL(struct ReadSeqVars *V)
946 {
947 char *sptr;
948
949 /* make sure we have first line */
950 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
951 SeqfileGetLine(V);
952 }
953 if (V->ssimode >= 0) V->r_off = V->ssioffset;
954
955 if ((sptr = strtok(V->buf+5, "\n\t ")) != NULL)
956 {
957 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
958 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
959 }
960
961 do {
962 SeqfileGetLine(V);
963 if (!feof(V->f) && strstr(V->buf, "AC ") == V->buf)
964 {
965 if ((sptr = strtok(V->buf+5, "; \t\n")) != NULL)
966 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
967 }
968 else if (!feof(V->f) && strstr(V->buf, "DE ") == V->buf)
969 {
970 if ((sptr = strtok(V->buf+5, "\n")) != NULL)
971 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
972 }
973 } while (! feof(V->f) && strncmp(V->buf,"SQ",2) != 0);
974
975 readLoop(0, endEMBL, V);
976
977 /* Hack for Staden experiment files: convert - to N
978 */
979 if (V->ssimode == -1) /* if we're in ssi mode, we're not keeping the seq */
980 for (sptr = V->seq; *sptr != '\0'; sptr++)
981 if (*sptr == '-') *sptr = 'N';
982
983 /* reading a real EMBL database file, we keep the source coords
984 */
985 V->sqinfo->start = 1;
986 V->sqinfo->stop = V->seqlen;
987 V->sqinfo->olen = V->seqlen;
988 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
989
990 /* load next record's ID line */
991 while (!feof(V->f) && strncmp(V->buf, "ID ", 4) != 0) {
992 SeqfileGetLine(V);
993 }
994
995 }
996
997
998 static int
999 endZuker(char *s, int *addend)
1000 {
1001 *addend = 0;
1002 return( *s == '(' );
1003 }
1004
1005 static void
1006 readZuker(struct ReadSeqVars *V)
1007 {
1008 char *sptr;
1009
1010 SeqfileGetLine(V); /*s == "seqLen seqid string..."*/
1011
1012 if ((sptr = strtok(V->buf+6, " \t\n")) != NULL)
1013 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
1014
1015 if ((sptr = strtok(NULL, "\n")) != NULL)
1016 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
1017
1018 readLoop(0, endZuker, V);
1019
1020 while (!(feof(V->f) | ((*V->buf != '\0') & (*V->buf == '('))))
1021 SeqfileGetLine(V);
1022 }
1023
1024 static void
1025 readUWGCG(struct ReadSeqVars *V)
1026 {
1027 char *si;
1028 char *sptr;
1029 int done;
1030
1031 V->seqlen = 0;
1032
1033 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
1034 /*drop above or ".." from id*/
1035 if ((si = strstr(V->buf," Length: ")) != NULL) *si = 0;
1036 else if ((si = strstr(V->buf,"..")) != NULL) *si = 0;
1037
1038 if ((sptr = strtok(V->buf, "\n\t ")) != NULL)
1039 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
1040
1041 do {
1042 done = feof(V->f);
1043 SeqfileGetLine(V);
1044 if (! done) addseq(V->buf, V);
1045 } while (!done);
1046 }
1047
1048
1049 /* Function: ReadSeq()
1050 *
1051 * Purpose: Read next sequence from an open database file.
1052 * Return the sequence and associated info.
1053 *
1054 * Args: fp - open sequence database file pointer
1055 * format - format of the file (previously determined
1056 * by call to SeqfileFormat()).
1057 * Currently unused, since we carry it in V.
1058 * ret_seq - RETURN: sequence
1059 * sqinfo - RETURN: filled in w/ other information
1060 *
1061 * Limitations: uses squid_errno, so it's not threadsafe.
1062 *
1063 * Return: 1 on success, 0 on failure.
1064 * ret_seq and some field of sqinfo are allocated here,
1065 * The preferred call mechanism to properly free the memory is:
1066 *
1067 * SQINFO sqinfo;
1068 * char *seq;
1069 *
1070 * ReadSeq(fp, format, &seq, &sqinfo);
1071 * ... do something...
1072 * FreeSequence(seq, &sqinfo);
1073 */
1074 int
1075 ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
1076 {
1077 int gotuw;
1078
1079 squid_errno = SQERR_OK;
1080
1081 /* Here's the hack for sequential access of sequences from
1082 * the multiple sequence alignment formats
1083 */
1084 if (IsAlignmentFormat(V->format))
1085 {
1086 if (V->msa->lastidx >= V->msa->nseq)
1087 { /* out of data. try to read another alignment */
1088 MSAFree(V->msa);
1089 if ((V->msa = MSAFileRead(V->afp)) == NULL)
1090 return 0;
1091 V->msa->lastidx = 0;
1092 }
1093 /* copy and dealign the appropriate aligned seq */
1094 /* AW: stopping squid from dealigning sequences and corresponding info */
1095 #ifdef CLUSTALO
1096 V->seq = sre_strdup(V->msa->aseq[V->msa->lastidx], V->msa->alen);
1097 #else
1098 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1099 V->msa->aseq[V->msa->lastidx], &(V->seq));
1100 #endif
1101 V->seqlen = strlen(V->seq);
1102
1103 /* Extract sqinfo stuff for this sequence from the msa.
1104 * Tedious; code that should be cleaned.
1105 */
1106 sqinfo->flags = 0;
1107 if (V->msa->sqname[V->msa->lastidx] != NULL)
1108 SetSeqinfoString(sqinfo, V->msa->sqname[V->msa->lastidx], SQINFO_NAME);
1109 if (V->msa->sqacc != NULL && V->msa->sqacc[V->msa->lastidx] != NULL)
1110 SetSeqinfoString(sqinfo, V->msa->sqacc[V->msa->lastidx], SQINFO_ACC);
1111 if (V->msa->sqdesc != NULL && V->msa->sqdesc[V->msa->lastidx] != NULL)
1112 SetSeqinfoString(sqinfo, V->msa->sqdesc[V->msa->lastidx], SQINFO_DESC);
1113 if (V->msa->ss != NULL && V->msa->ss[V->msa->lastidx] != NULL) {
1114 /* AW: stopping squid from dealigning sequences and corresponding info */
1115 #ifdef CLUSTALO
1116 sqinfo->ss = sre_strdup(V->msa->ss[V->msa->lastidx], V->msa->alen);
1117 #else
1118 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1119 V->msa->ss[V->msa->lastidx], &(sqinfo->ss));
1120 #endif
1121 sqinfo->flags |= SQINFO_SS;
1122 }
1123 if (V->msa->sa != NULL && V->msa->sa[V->msa->lastidx] != NULL) {
1124 /* AW: stopping squid from dealigning sequences and corresponding info */
1125 #ifdef CLUSTALO
1126 sqinfo->sa = sre_strdup(V->msa->sa[V->msa->lastidx], V->msa->alen);
1127 #else
1128 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1129 V->msa->sa[V->msa->lastidx], &(sqinfo->sa));
1130 #endif
1131 sqinfo->flags |= SQINFO_SA;
1132 }
1133 if (V->msa->co != NULL && V->msa->co[V->msa->lastidx] != NULL) {
1134 /* AW: stopping squid from dealigning sequences and corresponding info */
1135 #ifdef CLUSTALO
1136 sqinfo->co = sre_strdup(V->msa->co[V->msa->lastidx], V->msa->alen);
1137 #else
1138 MakeDealignedString(V->msa->aseq[V->msa->lastidx], V->msa->alen,
1139 V->msa->co[V->msa->lastidx], &(sqinfo->co));
1140 #endif
1141 sqinfo->flags |= SQINFO_CO;
1142 } /* co */
1143 V->msa->lastidx++;
1144 }
1145 else {
1146 if (feof(V->f)) return 0;
1147
1148 if (V->ssimode == -1) { /* normal mode */
1149 V->seq = (char*) calloc (kStartLength+1, sizeof(char));
1150 V->maxseq = kStartLength;
1151 } else { /* index mode: discarding seq */
1152 V->seq = NULL;
1153 V->maxseq = 0;
1154 }
1155 V->seqlen = 0;
1156 V->sqinfo = sqinfo;
1157 V->sqinfo->flags = 0;
1158
1159 switch (V->format) {
1160 case SQFILE_IG : readIG(V); break;
1161 case SQFILE_STRIDER : readStrider(V); break;
1162 case SQFILE_GENBANK : readGenBank(V); break;
1163 case SQFILE_FASTA : readPearson(V); break;
1164 case SQFILE_EMBL : readEMBL(V); break;
1165 case SQFILE_ZUKER : readZuker(V); break;
1166 case SQFILE_PIR : readPIR(V); break;
1167 case SQFILE_GCGDATA : readGCGdata(V); break;
1168
1169 case SQFILE_GCG :
1170 do { /* skip leading comments on GCG file */
1171 gotuw = (strstr(V->buf,"..") != NULL);
1172 if (gotuw) readUWGCG(V);
1173 SeqfileGetLine(V);
1174 } while (! feof(V->f));
1175 break;
1176
1177 case SQFILE_IDRAW: /* SRE: no attempt to read idraw postscript */
1178 default:
1179 squid_errno = SQERR_FORMAT;
1180 free(V->seq);
1181 return 0;
1182 }
1183 if (V->seq != NULL) /* (it can be NULL in indexing mode) */
1184 V->seq[V->seqlen] = 0; /* stick a string terminator on it */
1185 }
1186
1187 /* Cleanup
1188 */
1189 sqinfo->len = V->seqlen;
1190 sqinfo->flags |= SQINFO_LEN;
1191 *ret_seq = V->seq;
1192 if (squid_errno == SQERR_OK) return 1; else return 0;
1193 }
1194
1195 /* Function: SeqfileFormat()
1196 * Date: SRE, Tue Jun 22 10:58:58 1999 [Sanger Centre]
1197 *
1198 * Purpose: Determine format of an open file.
1199 * Returns format code.
1200 * Rewinds the file.
1201 *
1202 * Autodetects the following unaligned formats:
1203 * SQFILE_FASTA
1204 * SQFILE_GENBANK
1205 * SQFILE_EMBL
1206 * SQFILE_GCG
1207 * SQFILE_GCGDATA
1208 * SQFILE_PIR
1209 * Also autodetects the following alignment formats:
1210 * MSAFILE_STOCKHOLM
1211 * MSAFILE_MSF
1212 * MSAFILE_CLUSTAL
1213 * MSAFILE_SELEX
1214 * MSAFILE_PHYLIP
1215 *
1216 * Can't autodetect MSAFILE_A2M, calls it SQFILE_FASTA.
1217 * MSAFileFormat() does the opposite.
1218 *
1219 * Args: sfp - open SQFILE
1220 *
1221 * Return: format code, or SQFILE_UNKNOWN if unrecognized
1222 */
1223 int
1224 SeqfileFormat(FILE *fp)
1225 {
1226 char *buf;
1227 int len;
1228 int fmt = SQFILE_UNKNOWN;
1229 int ndataline;
1230 char *bufcpy, *s, *s1, *s2;
1231 int has_junk;
1232
1233 buf = NULL;
1234 len = 0;
1235 ndataline = 0;
1236 has_junk = FALSE;
1237 while (sre_fgets(&buf, &len, fp) != NULL)
1238 {
1239 if (IsBlankline(buf)) continue;
1240
1241 /* Well-behaved formats identify themselves in first nonblank line.
1242 */
1243 if (ndataline == 0)
1244 {
1245 if (strncmp(buf, ">>>>", 4) == 0 && strstr(buf, "Len: "))
1246 { fmt = SQFILE_GCGDATA; goto DONE; }
1247
1248 if (buf[0] == '>')
1249 { fmt = SQFILE_FASTA; goto DONE; }
1250
1251 if (strncmp(buf, "!!AA_SEQUENCE", 13) == 0 ||
1252 strncmp(buf, "!!NA_SEQUENCE", 13) == 0)
1253 { fmt = SQFILE_GCG; goto DONE; }
1254
1255 if (strncmp(buf, "# DUBLIN 1.", 11) == 0) /* -> r296, FS */
1256 { fmt = MSAFILE_DUBLIN; goto DONE; }
1257
1258 if (strncmp(buf, "# STOCKHOLM 1.", 14) == 0)
1259 { fmt = MSAFILE_STOCKHOLM; goto DONE; }
1260
1261 if (strncmp(buf, "CLUSTAL", 7) == 0 &&
1262 strstr(buf, "multiple sequence alignment") != NULL)
1263 { fmt = MSAFILE_CLUSTAL; goto DONE; }
1264
1265 if (strncmp(buf, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0 ||
1266 strncmp(buf, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0)
1267 { fmt = MSAFILE_MSF; goto DONE; }
1268
1269 /* PHYLIP id: also just a good bet */
1270 bufcpy = sre_strdup(buf, -1);
1271 s = bufcpy;
1272 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
1273 (s2 = sre_strtok(&s, WHITESPACE, NULL)) != NULL &&
1274 IsInt(s1) &&
1275 IsInt(s2))
1276 { free(bufcpy); fmt = MSAFILE_PHYLIP; goto DONE; }
1277 free(bufcpy);
1278 }
1279
1280 /* We trust that other formats identify themselves soon.
1281 */
1282 /* dead giveaways for extended SELEX */
1283 if (strncmp(buf, "#=AU", 4) == 0 ||
1284 strncmp(buf, "#=ID", 4) == 0 ||
1285 strncmp(buf, "#=AC", 4) == 0 ||
1286 strncmp(buf, "#=DE", 4) == 0 ||
1287 strncmp(buf, "#=GA", 4) == 0 ||
1288 strncmp(buf, "#=TC", 4) == 0 ||
1289 strncmp(buf, "#=NC", 4) == 0 ||
1290 strncmp(buf, "#=SQ", 4) == 0 ||
1291 strncmp(buf, "#=SS", 4) == 0 ||
1292 strncmp(buf, "#=CS", 4) == 0 ||
1293 strncmp(buf, "#=RF", 4) == 0)
1294 { fmt = MSAFILE_SELEX; goto DONE; }
1295
1296 if (strncmp(buf, "///", 3) == 0 || strncmp(buf, "ENTRY ", 6) == 0)
1297 { fmt = SQFILE_PIR; goto DONE; }
1298
1299 /* a ha, diagnostic of an (old) MSF file */
1300 if ((strstr(buf, "..") != NULL) &&
1301 (strstr(buf, "MSF:") != NULL) &&
1302 (strstr(buf, "Check:")!= NULL))
1303 { fmt = MSAFILE_MSF; goto DONE; }
1304
1305 /* unaligned GCG (must follow MSF test!) */
1306 if (strstr(buf, " Check: ") != NULL && strstr(buf, "..") != NULL)
1307 { fmt = SQFILE_GCG; goto DONE; }
1308
1309 if (strncmp(buf,"LOCUS ",6) == 0 || strncmp(buf,"ORIGIN ",6) == 0)
1310 { fmt = SQFILE_GENBANK; goto DONE; }
1311
1312 if (strncmp(buf,"ID ",5) == 0 || strncmp(buf,"SQ ",5) == 0)
1313 { fmt = SQFILE_EMBL; goto DONE; }
1314
1315 /* But past here, we're being desperate. A simple SELEX file is
1316 * very difficult to detect; we can only try to disprove it.
1317 */
1318 s = buf;
1319 if ((s1 = sre_strtok(&s, WHITESPACE, NULL)) == NULL) continue; /* skip blank lines */
1320 if (strchr("#%", *s1) != NULL) continue; /* skip comment lines */
1321
1322 /* Disproof 1. Noncomment, nonblank lines in a SELEX file
1323 * must have at least two space-delimited fields (name/seq)
1324 */
1325 if ((s2 = sre_strtok(&s, WHITESPACE, NULL)) == NULL)
1326 has_junk = TRUE;
1327
1328 /* Disproof 2.
1329 * The sequence field should look like a sequence.
1330 */
1331 if (s2 != NULL && Seqtype(s2) == kOtherSeq)
1332 has_junk = TRUE;
1333
1334 ndataline++;
1335 if (ndataline == 300) break; /* only look at first 300 lines */
1336 }
1337
1338 if (ndataline == 0)
1339 Die("Sequence file contains no data");
1340
1341 /* If we've made it this far, we've run out of data, but there
1342 * was at least one line of it; check if we've
1343 * disproven SELEX. If not, cross our fingers, pray, and guess SELEX.
1344 */
1345 if (has_junk == TRUE) fmt = SQFILE_UNKNOWN;
1346 else fmt = MSAFILE_SELEX;
1347
1348 DONE:
1349 if (buf != NULL) free(buf);
1350 rewind(fp);
1351 return fmt;
1352 }
1353
1354 /* Function: GCGBinaryToSequence()
1355 *
1356 * Purpose: Convert a GCG 2BIT binary string to DNA sequence.
1357 * 0 = C 1 = T 2 = A 3 = G
1358 * 4 nts/byte
1359 *
1360 * Args: seq - binary sequence. Converted in place to DNA.
1361 * len - length of DNA. binary is (len+3)/4 bytes
1362 */
1363 int
1364 GCGBinaryToSequence(char *seq, int len)
1365 {
1366 int bpos; /* position in binary */
1367 int spos; /* position in sequence */
1368 char twobit;
1369 int i;
1370
1371 for (bpos = (len-1)/4; bpos >= 0; bpos--)
1372 {
1373 twobit = seq[bpos];
1374 spos = bpos*4;
1375
1376 for (i = 3; i >= 0; i--)
1377 {
1378 switch (twobit & 0x3) {
1379 case 0: seq[spos+i] = 'C'; break;
1380 case 1: seq[spos+i] = 'T'; break;
1381 case 2: seq[spos+i] = 'A'; break;
1382 case 3: seq[spos+i] = 'G'; break;
1383 }
1384 twobit = twobit >> 2;
1385 }
1386 }
1387 seq[len] = '\0';
1388 return 1;
1389 }
1390
1391
1392 /* Function: GCGchecksum()
1393 * Date: SRE, Mon May 31 11:13:21 1999 [St. Louis]
1394 *
1395 * Purpose: Calculate a GCG checksum for a sequence.
1396 * Code provided by Steve Smith of Genetics
1397 * Computer Group.
1398 *
1399 * Args: seq - sequence to calculate checksum for.
1400 * may contain gap symbols.
1401 * len - length of sequence (usually known,
1402 * so save a strlen() call)
1403 *
1404 * Returns: GCG checksum.
1405 */
1406 int
1407 GCGchecksum(char *seq, int len)
1408 {
1409 int i; /* position in sequence */
1410 int chk = 0; /* calculated checksum */
1411
1412 for (i = 0; i < len; i++)
1413 chk = (chk + (i % 57 + 1) * (sre_toupper((int) seq[i]))) % 10000;
1414 return chk;
1415 }
1416
1417
1418 /* Function: GCGMultchecksum()
1419 *
1420 * Purpose: GCG checksum for a multiple alignment: sum of
1421 * individual sequence checksums (including their
1422 * gap characters) modulo 10000.
1423 *
1424 * Implemented using spec provided by Steve Smith of
1425 * Genetics Computer Group.
1426 *
1427 * Args: seqs - sequences to be checksummed; aligned or not
1428 * nseq - number of sequences
1429 *
1430 * Return: the checksum, a number between 0 and 9999
1431 */
1432 int
1433 GCGMultchecksum(char **seqs, int nseq)
1434 {
1435 int chk = 0;
1436 int idx;
1437
1438 for (idx = 0; idx < nseq; idx++)
1439 chk = (chk + GCGchecksum(seqs[idx], strlen(seqs[idx]))) % 10000;
1440 return chk;
1441 }
1442
1443
1444
1445
1446 /* Function: Seqtype()
1447 *
1448 * Purpose: Returns a (very good) guess about type of sequence:
1449 * kDNA, kRNA, kAmino, or kOtherSeq.
1450 *
1451 * Modified from, and replaces, Gilbert getseqtype().
1452 */
1453 int
1454 Seqtype(char *seq)
1455 {
1456 int saw; /* how many non-gap characters I saw */
1457 char c;
1458 int po = 0; /* count of protein-only */
1459 int nt = 0; /* count of t's */
1460 int nu = 0; /* count of u's */
1461 int na = 0; /* count of nucleotides */
1462 int aa = 0; /* count of amino acids */
1463 int no = 0; /* count of others */
1464
1465 /* Look at the first 300 non-gap characters
1466 */
1467
1468 #ifdef CLUSTALO
1469 /* VGGNGDDYLSGGTGNDTL is recognized as unknown using squid's default
1470 * approach.
1471 * We change it to the following:
1472
1473 * 1. counting: ignore gaps and not alpha characters. if protein-only then
1474 * count as such (po). otherwise decide if amino-acid (aa) or nucleic-acid
1475 * (na) or unknown (no)
1476 *
1477 * 2. determine type: if we saw more unknown than aa or na, return unknown.
1478 * if encountered protein-only return protein-only. otherwise decide based
1479 * on majority. (if aa==na return na)
1480 */
1481 for (saw = 0; *seq != '\0' && saw < 300; seq++) {
1482 c = sre_toupper((int) *seq);
1483 int unknown = 1;
1484
1485 if (isgap(c) || ! isalpha((int) c)) {
1486 continue;
1487 }
1488
1489 if (strchr(protonly, c)) {
1490 po++;
1491 unknown = 0;
1492 }
1493
1494 if (strchr(aminos,c)) {
1495 aa++;
1496 unknown = 0;
1497 }
1498
1499 if (strchr(primenuc,c)) {
1500 na++;
1501 unknown = 0;
1502
1503 if (c == 'T')
1504 nt++;
1505 else if (c == 'U')
1506 nu++;
1507 }
1508
1509 if (unknown) {
1510 no ++;
1511 }
1512
1513 saw++;
1514 }
1515
1516 if (no > aa && no > na)
1517 return kOtherSeq;
1518
1519 if (po > 0 || aa>na)
1520 return kAmino;
1521
1522 if (na >= aa) {
1523 if (nu > nt)
1524 return kRNA;
1525 else
1526 return kDNA;
1527 }
1528
1529 return kOtherSeq;
1530
1531
1532 #else
1533 for (saw = 0; *seq != '\0' && saw < 300; seq++)
1534 {
1535 c = sre_toupper((int) *seq);
1536 if (! isgap(c))
1537 {
1538 if (strchr(protonly, c)) po++;
1539 else if (strchr(primenuc,c)) {
1540 na++;
1541 if (c == 'T') nt++;
1542 else if (c == 'U') nu++;
1543 }
1544 else if (strchr(aminos,c)) aa++;
1545 else if (isalpha((int) c)) no++;
1546 saw++;
1547 }
1548 }
1549
1550 if (no > 0) return kOtherSeq;
1551 else if (po > 0) return kAmino;
1552 else if (na > aa) {
1553 if (nu > nt) return kRNA;
1554 else return kDNA;
1555 }
1556 else return kAmino; /* ooooh. risky. */
1557 #endif
1558
1559 }
1560
1561
1562 /* Function: GuessAlignmentSeqtype()
1563 * Date: SRE, Wed Jul 7 09:42:34 1999 [St. Louis]
1564 *
1565 * Purpose: Try to guess whether an alignment is protein
1566 * or nucleic acid; return a code for the
1567 * type (kRNA, kDNA, or kAmino).
1568 *
1569 * Args: aseq - array of aligned sequences. (Could also
1570 * be an rseq unaligned sequence array)
1571 * nseq - number of aseqs
1572 *
1573 * Returns: kRNA, kDNA, kAmino;
1574 * kOtherSeq if inconsistency is detected.
1575 */
1576 int
1577 GuessAlignmentSeqtype(char **aseq, int nseq)
1578 {
1579 int idx;
1580 int nrna = 0;
1581 int ndna = 0;
1582 int namino = 0;
1583 int nother = 0;
1584
1585 for (idx = 0; idx < nseq; idx++)
1586 switch (Seqtype(aseq[idx])) {
1587 case kRNA: nrna++; break;
1588 case kDNA: ndna++; break;
1589 case kAmino: namino++; break;
1590 default: nother++;
1591 }
1592
1593 /* Unambiguous decisions:
1594 */
1595 if (nother) return kOtherSeq;
1596 if (namino == nseq) return kAmino;
1597 if (ndna == nseq) return kDNA;
1598 if (nrna == nseq) return kRNA;
1599
1600 /* Ambiguous decisions:
1601 */
1602 if (namino == 0) return kRNA; /* it's nucleic acid, but seems mixed RNA/DNA */
1603 return kAmino; /* some amino acid seen; others probably short seqs, some
1604 of which may be entirely ACGT (ala,cys,gly,thr). We
1605 could be a little more sophisticated: U would be a giveaway
1606 that we're not in protein seqs */
1607 }
1608
1609 /* Function: WriteSimpleFASTA()
1610 * Date: SRE, Tue Nov 16 18:06:00 1999 [St. Louis]
1611 *
1612 * Purpose: Just write a FASTA format sequence to a file;
1613 * minimal interface, mostly for quick and dirty programs.
1614 *
1615 * Args: fp - open file handle (stdout, possibly)
1616 * seq - sequence to output
1617 * name - name for the sequence
1618 * desc - optional description line, or NULL.
1619 *
1620 * Returns: void
1621 */
1622 void
1623 WriteSimpleFASTA(FILE *fp, char *seq, char *name, char *desc)
1624 {
1625 char buf[61];
1626 int len;
1627 int pos;
1628
1629 len = strlen(seq);
1630 buf[60] = '\0';
1631 fprintf(fp, ">%s %s\n", name, desc != NULL ? desc : "");
1632 for (pos = 0; pos < len; pos += 60)
1633 {
1634 strncpy(buf, seq+pos, 60);
1635 fprintf(fp, "%s\n", buf);
1636 }
1637 }
1638
1639 int
1640 WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
1641 {
1642 int numline = 0;
1643 int lines = 0, spacer = 0, width = 50, tab = 0;
1644 int i, j, l, l1, ibase;
1645 char endstr[10];
1646 char s[100]; /* buffer for sequence */
1647 char ss[100]; /* buffer for structure */
1648 int checksum = 0;
1649 int seqlen;
1650 int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */
1651 int dostruc; /* TRUE to print structure lines*/
1652
1653 which_case = 0;
1654 dostruc = FALSE;
1655 seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
1656
1657 if (IsAlignmentFormat(outform))
1658 Die("Tried to write an aligned format with WriteSeq() -- bad, bad.");
1659
1660
1661 strcpy( endstr,"");
1662 l1 = 0;
1663 checksum = GCGchecksum(seq, seqlen);
1664
1665 switch (outform) {
1666 case SQFILE_UNKNOWN: /* no header, just sequence */
1667 strcpy(endstr,"\n"); /* end w/ extra blank line */
1668 break;
1669
1670 case SQFILE_GENBANK:
1671 fprintf(outf,"LOCUS %s %d bp\n",
1672 sqinfo->name, seqlen);
1673 fprintf(outf,"ACCESSION %s\n",
1674 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : ".");
1675 fprintf(outf,"DEFINITION %s\n",
1676 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : ".");
1677 fprintf(outf,"VERSION %s\n",
1678 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : ".");
1679 fprintf(outf,"ORIGIN \n");
1680 spacer = 11;
1681 numline = 1;
1682 strcpy(endstr, "\n//");
1683 break;
1684
1685 case SQFILE_GCGDATA:
1686 fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen);
1687 fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1688 break;
1689
1690 case SQFILE_PIR:
1691 fprintf(outf, "ENTRY %s\n",
1692 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1693 fprintf(outf, "TITLE %s\n",
1694 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1695 fprintf(outf, "ACCESSION %s\n",
1696 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1697 fprintf(outf, "SUMMARY #Length %d #Checksum %d\n",
1698 sqinfo->len, checksum);
1699 fprintf(outf, "SEQUENCE\n");
1700 fprintf(outf, " 5 10 15 20 25 30\n");
1701 spacer = 2; /* spaces after every residue */
1702 numline = 1; /* number lines w/ coords */
1703 width = 30; /* 30 aa per line */
1704 strcpy(endstr, "\n///");
1705 break;
1706
1707 case SQFILE_SQUID:
1708 fprintf(outf, "NAM %s\n", sqinfo->name);
1709 if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
1710 fprintf(outf, "SRC %s %s %d..%d::%d\n",
1711 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-",
1712 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-",
1713 (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0,
1714 (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0,
1715 (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0);
1716 if (sqinfo->flags & SQINFO_DESC)
1717 fprintf(outf, "DES %s\n", sqinfo->desc);
1718 if (sqinfo->flags & SQINFO_SS)
1719 {
1720 fprintf(outf, "SEQ +SS\n");
1721 dostruc = TRUE; /* print structure lines too */
1722 }
1723 else
1724 fprintf(outf, "SEQ\n");
1725 numline = 1; /* number seq lines w/ coords */
1726 strcpy(endstr, "\n++");
1727 break;
1728
1729 case SQFILE_EMBL:
1730 fprintf(outf,"ID %s\n",
1731 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1732 fprintf(outf,"AC %s\n",
1733 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1734 fprintf(outf,"DE %s\n",
1735 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1736 fprintf(outf,"SQ %d BP\n", seqlen);
1737 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1738 tab = 5; /** added 31jan91 */
1739 spacer = 11; /** added 31jan91 */
1740 break;
1741
1742 case SQFILE_GCG:
1743 fprintf(outf,"%s\n", sqinfo->name);
1744 if (sqinfo->flags & SQINFO_ACC)
1745 fprintf(outf,"ACCESSION %s\n", sqinfo->acc);
1746 if (sqinfo->flags & SQINFO_DESC)
1747 fprintf(outf,"DEFINITION %s\n", sqinfo->desc);
1748 fprintf(outf," %s Length: %d (today) Check: %d ..\n",
1749 sqinfo->name, seqlen, checksum);
1750 spacer = 11;
1751 numline = 1;
1752 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1753 break;
1754
1755 case SQFILE_STRIDER: /* ?? map ?*/
1756 fprintf(outf,"; ### from DNA Strider ;-)\n");
1757 fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n",
1758 sqinfo->name, seqlen, checksum);
1759 strcpy(endstr, "\n//");
1760 break;
1761
1762 /* SRE: Don had Zuker default to Pearson, which is not
1763 intuitive or helpful, since Zuker's MFOLD can't read
1764 Pearson format. More useful to use kIG */
1765 case SQFILE_ZUKER:
1766 which_case = 1; /* MFOLD requires upper case. */
1767 /*FALLTHRU*/
1768 case SQFILE_IG:
1769 fprintf(outf,";%s %s\n",
1770 sqinfo->name,
1771 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1772 fprintf(outf,"%s\n", sqinfo->name);
1773 strcpy(endstr,"1"); /* == linear dna */
1774 break;
1775
1776 case SQFILE_RAW: /* Raw: no header at all. */
1777 break;
1778
1779 default :
1780 case SQFILE_FASTA:
1781 fprintf(outf,">%s %s\n", sqinfo->name,
1782 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1783 break;
1784 }
1785
1786 if (which_case == 1) s2upper(seq);
1787 if (which_case == 2) s2lower(seq);
1788
1789
1790 width = MIN(width,100);
1791 for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
1792 if (l1 < 0) l1 = 0;
1793 else if (l1 == 0) {
1794 if (numline) fprintf(outf,"%8d ",ibase);
1795 for (j=0; j<tab; j++) fputc(' ',outf);
1796 }
1797 if ((spacer != 0) && ((l+1) % spacer == 1))
1798 { s[l] = ' '; ss[l] = ' '; l++; }
1799 s[l] = seq[i];
1800 ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
1801 l++; i++;
1802 l1++; /* don't count spaces for width*/
1803 if (l1 == width || i == seqlen) {
1804 s[l] = ss[l] = '\0';
1805 l = 0; l1 = 0;
1806 if (dostruc)
1807 {
1808 fprintf(outf, "%s\n", s);
1809 if (numline) fprintf(outf," ");
1810 for (j=0; j<tab; j++) fputc(' ',outf);
1811 if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
1812 else fprintf(outf,"%s\n",ss);
1813 }
1814 else
1815 {
1816 if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
1817 else fprintf(outf,"%s\n",s);
1818 }
1819 lines++;
1820 ibase = i+1;
1821 }
1822 }
1823 return lines;
1824 }
1825
1826
1827 /* Function: ReadMultipleRseqs()
1828 *
1829 * Purpose: Open a data file and
1830 * parse it into an array of rseqs (raw, unaligned
1831 * sequences).
1832 *
1833 * Caller is responsible for free'ing memory allocated
1834 * to ret_rseqs, ret_weights, and ret_names.
1835 *
1836 * Weights are currently only supported for MSF format.
1837 * Sequences read from all other formats will be assigned
1838 * weights of 1.0. If the caller isn't interested in
1839 * weights, it passes NULL as ret_weights.
1840 *
1841 * Returns 1 on success. Returns 0 on failure and sets
1842 * squid_errno to indicate the cause.
1843 */
1844 int
1845 ReadMultipleRseqs(char *seqfile,
1846 int fformat,
1847 char ***ret_rseqs,
1848 SQINFO **ret_sqinfo,
1849 int *ret_num)
1850 {
1851 SQINFO *sqinfo; /* array of sequence optional info */
1852 SQFILE *dbfp; /* open ptr for sequential access of file */
1853 char **rseqs; /* sequence array */
1854 int numalloced; /* num of seqs currently alloced for */
1855 int num;
1856
1857
1858 num = 0;
1859 numalloced = 16;
1860 rseqs = (char **) MallocOrDie (numalloced * sizeof(char *));
1861 sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
1862 if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
1863
1864 while (ReadSeq(dbfp, dbfp->format, &rseqs[num], &(sqinfo[num])))
1865 {
1866 num++;
1867 if (num == numalloced) /* more seqs coming, alloc more room */
1868 {
1869 numalloced += 16;
1870 rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
1871 sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
1872 }
1873 }
1874 SeqfileClose(dbfp);
1875
1876 *ret_rseqs = rseqs;
1877 *ret_sqinfo = sqinfo;
1878 *ret_num = num;
1879 return 1;
1880 }
1881
1882
1883 /* Function: String2SeqfileFormat()
1884 * Date: SRE, Sun Jun 27 15:25:54 1999 [TW 723 over Canadian Shield]
1885 *
1886 * Purpose: Convert a string (e.g. from command line option arg)
1887 * to a format code. Case insensitive. Return
1888 * MSAFILE_UNKNOWN/SQFILE_UNKNOWN if string is bad.
1889 * Uses codes defined in squid.h (unaligned formats) and
1890 * msa.h (aligned formats).
1891 *
1892 * Args: s - string to convert; e.g. "stockholm"
1893 *
1894 * Returns: format code; e.g. MSAFILE_STOCKHOLM
1895 */
1896 int
1897 String2SeqfileFormat(char *s)
1898 {
1899 char *s2;
1900 int code = SQFILE_UNKNOWN;
1901
1902 if (s == NULL) return SQFILE_UNKNOWN;
1903 s2 = sre_strdup(s, -1);
1904 s2upper(s2);
1905
1906 if (strcmp(s2, "FASTA") == 0) code = SQFILE_FASTA;
1907 #ifdef CLUSTALO
1908 else if (strcmp(s2, "FA") == 0) code = SQFILE_FASTA;
1909 else if (strcmp(s2, "VIENNA") == 0) code = SQFILE_VIENNA;
1910 else if (strcmp(s2, "VIE") == 0) code = SQFILE_VIENNA;
1911 #endif
1912 else if (strcmp(s2, "GENBANK") == 0) code = SQFILE_GENBANK;
1913 #ifdef CLUSTALO
1914 else if (strcmp(s2, "GB") == 0) code = SQFILE_GENBANK;
1915 #endif
1916 else if (strcmp(s2, "EMBL") == 0) code = SQFILE_EMBL;
1917 else if (strcmp(s2, "GCG") == 0) code = SQFILE_GCG;
1918 else if (strcmp(s2, "GCGDATA") == 0) code = SQFILE_GCGDATA;
1919 else if (strcmp(s2, "RAW") == 0) code = SQFILE_RAW;
1920 else if (strcmp(s2, "IG") == 0) code = SQFILE_IG;
1921 else if (strcmp(s2, "STRIDER") == 0) code = SQFILE_STRIDER;
1922 else if (strcmp(s2, "IDRAW") == 0) code = SQFILE_IDRAW;
1923 else if (strcmp(s2, "ZUKER") == 0) code = SQFILE_ZUKER;
1924 else if (strcmp(s2, "PIR") == 0) code = SQFILE_PIR;
1925 else if (strcmp(s2, "SQUID") == 0) code = SQFILE_SQUID;
1926 else if (strcmp(s2, "STOCKHOLM") == 0) code = MSAFILE_STOCKHOLM;
1927 #ifdef CLUSTALO
1928 else if (strcmp(s2, "ST") == 0) code = MSAFILE_STOCKHOLM;
1929 else if (strcmp(s2, "STK") == 0) code = MSAFILE_STOCKHOLM;
1930 #endif
1931 else if (strcmp(s2, "SELEX") == 0) code = MSAFILE_SELEX;
1932 else if (strcmp(s2, "MSF") == 0) code = MSAFILE_MSF;
1933 else if (strcmp(s2, "CLUSTAL") == 0) code = MSAFILE_CLUSTAL;
1934 #ifdef CLUSTALO
1935 else if (strcmp(s2, "CLU") == 0) code = MSAFILE_CLUSTAL;
1936 #endif
1937 else if (strcmp(s2, "A2M") == 0) code = MSAFILE_A2M;
1938 else if (strcmp(s2, "PHYLIP") == 0) code = MSAFILE_PHYLIP;
1939 #ifdef CLUSTALO
1940 else if (strcmp(s2, "PHY") == 0) code = MSAFILE_PHYLIP;
1941 #endif
1942 else if (strcmp(s2, "EPS") == 0) code = MSAFILE_EPS;
1943 #ifdef CLUSTALO
1944 else code = SQFILE_UNKNOWN;
1945 #endif
1946 free(s2);
1947 return code;
1948 }
1949 char *
1950 SeqfileFormat2String(int code)
1951 {
1952 switch (code) {
1953 case SQFILE_UNKNOWN: return "unknown";
1954 case SQFILE_FASTA: return "FASTA";
1955 #ifdef CLUSTALO
1956 case SQFILE_VIENNA: return "Vienna";
1957 #endif
1958 case SQFILE_GENBANK: return "Genbank";
1959 case SQFILE_EMBL: return "EMBL";
1960 case SQFILE_GCG: return "GCG";
1961 case SQFILE_GCGDATA: return "GCG data library";
1962 case SQFILE_RAW: return "raw";
1963 case SQFILE_IG: return "Intelligenetics";
1964 case SQFILE_STRIDER: return "MacStrider";
1965 case SQFILE_IDRAW: return "Idraw Postscript";
1966 case SQFILE_ZUKER: return "Zuker";
1967 case SQFILE_PIR: return "PIR";
1968 case SQFILE_SQUID: return "SQUID";
1969 case MSAFILE_STOCKHOLM: return "Stockholm";
1970 case MSAFILE_SELEX: return "SELEX";
1971 case MSAFILE_MSF: return "MSF";
1972 case MSAFILE_CLUSTAL: return "Clustal";
1973 case MSAFILE_A2M: return "a2m";
1974 case MSAFILE_PHYLIP: return "Phylip";
1975 case MSAFILE_EPS: return "EPS";
1976 default:
1977 Die("Bad code passed to MSAFormat2String()");
1978 }
1979 /*NOTREACHED*/
1980 return NULL;
1981 }
1982
1983
1984 /* Function: MSAToSqinfo()
1985 * Date: SRE, Tue Jul 20 14:36:56 1999 [St. Louis]
1986 *
1987 * Purpose: Take an MSA and generate a SQINFO array suitable
1988 * for use in annotating the unaligned sequences.
1989 * Return the array.
1990 *
1991 * Permanent temporary code. sqinfo was poorly designed.
1992 * it must eventually be replaced, but the odds
1993 * of this happening soon are nil, so I have to deal.
1994 *
1995 * Args: msa - the alignment
1996 *
1997 * Returns: ptr to allocated sqinfo array.
1998 * Freeing is ghastly: free in each individual sqinfo[i]
1999 * with FreeSequence(NULL, &(sqinfo[i])), then
2000 * free(sqinfo).
2001 */
2002 SQINFO *
2003 MSAToSqinfo(MSA *msa)
2004 {
2005 int idx;
2006 SQINFO *sqinfo;
2007
2008 sqinfo = MallocOrDie(sizeof(SQINFO) * msa->nseq);
2009
2010 for (idx = 0; idx < msa->nseq; idx++)
2011 {
2012 sqinfo[idx].flags = 0;
2013 SetSeqinfoString(&(sqinfo[idx]),
2014 msa->sqname[idx], SQINFO_NAME);
2015 SetSeqinfoString(&(sqinfo[idx]),
2016 MSAGetSeqAccession(msa, idx), SQINFO_ACC);
2017 SetSeqinfoString(&(sqinfo[idx]),
2018 MSAGetSeqDescription(msa, idx), SQINFO_DESC);
2019
2020 if (msa->ss != NULL && msa->ss[idx] != NULL) {
2021 MakeDealignedString(msa->aseq[idx], msa->alen,
2022 msa->ss[idx], &(sqinfo[idx].ss));
2023 sqinfo[idx].flags |= SQINFO_SS;
2024 }
2025
2026 if (msa->sa != NULL && msa->sa[idx] != NULL) {
2027 MakeDealignedString(msa->aseq[idx], msa->alen,
2028 msa->sa[idx], &(sqinfo[idx].sa));
2029 sqinfo[idx].flags |= SQINFO_SA;
2030 }
2031
2032 sqinfo[idx].len = DealignedLength(msa->aseq[idx]);
2033 sqinfo[idx].flags |= SQINFO_LEN;
2034 }
2035 return sqinfo;
2036 }
2037
2038
2039
2040 /* cc -o sqio_test -DA_QUIET_DAY -L. sqio.c -lsquid */
2041 #ifdef A_QUIET_DAY
2042 #include "ssi.h"
2043 int
2044 main(int argc, char **argv)
2045 {
2046 FILE *fp;
2047 char *filename;
2048 char *buf;
2049 int len;
2050 int mode = 3;
2051 SSIOFFSET off;
2052
2053 filename = argv[1];
2054
2055 if (mode == 1) {
2056 buf = malloc(sizeof(char) * 256);
2057 if ((fp = fopen(filename, "r")) == NULL)
2058 Die("open of %s failed", filename);
2059 while (fgets(buf, 255, fp) != NULL)
2060 ;
2061 fclose(fp);
2062 free(buf);
2063 } else if (mode == 2) {
2064 if ((fp = fopen(filename, "r")) == NULL)
2065 Die("open of %s failed", filename);
2066 buf = NULL; len = 0;
2067 while (sre_fgets(&buf, &len, fp) != NULL)
2068 SSIGetFilePosition(fp, SSI_OFFSET_I32, &off);
2069 fclose(fp);
2070 free(buf);
2071 } else if (mode == 3) {
2072 SQFILE *dbfp;
2073 SQINFO info;
2074
2075 if ((dbfp = SeqfileOpen(filename, SQFILE_FASTA, NULL)) == NULL)
2076 Die("open of %s failed", filename);
2077 while (ReadSeq(dbfp, dbfp->format, &buf, &info)) {
2078 SSIGetFilePosition(dbfp->f, SSI_OFFSET_I32, &off);
2079 FreeSequence(buf, &info);
2080 }
2081 SeqfileClose(dbfp);
2082 }
2083
2084 }
2085
2086
2087 #endif
2088