1 /* SQUID - A C function library for biological sequence analysis
2 * Copyright (C) 1992-1996 Sean R. Eddy
3 *
4 * This source code is distributed under terms of the
5 * GNU General Public License. See the files COPYING
6 * and GNULICENSE for further details.
7 *
8 */
9
10 /* File: sqio.c
11 * From: ureadseq.c in Don Gilbert's sequence i/o package
12 *
13 * Reads and writes nucleic/protein sequence in various
14 * formats. Data files may have multiple sequences.
15 *
16 * Heavily modified from READSEQ package
17 * Copyright (C) 1990 by D.G. Gilbert
18 * Biology Dept., Indiana University, Bloomington, IN 47405
19 * email: gilbertd@bio.indiana.edu
20 * Thanks Don!
21 *
22 * SRE: Modifications as noted. Fri Jul 3 09:44:54 1992
23 * Packaged for squid, Thu Oct 1 10:07:11 1992
24 * ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <ctype.h>
31
32 #ifndef SEEK_SET
33 #include <unistd.h> /* may SunOS rot in hell */
34 #endif
35
36 #include "squid.h"
37
38 #ifdef MEMDEBUG
39 #include "dbmalloc.h"
40 #endif
41
42 #define kStartLength 500
43
44 static char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*";
45 static char *primenuc = "ACGTUN";
46 static char *protonly = "EFIPQZ";
47 /* static char stdsymbols[6] = "_.-*?"; */
48 static char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'|`~\"\\";
49 static char *seqsymbols = allsymbols;
50 /*
51 use general form of isseqchar -- all chars + symbols.
52 no formats except nbrf (?) use symbols in data area as
53 anything other than sequence chars.
54 (wrong. PIR-CODATA does. Remove /)
55 */
56
57
58 void
FreeSequence(char * seq,SQINFO * sqinfo)59 FreeSequence(char *seq, SQINFO *sqinfo)
60 {
61 if (seq != NULL) free(seq);
62 if (sqinfo->flags & SQINFO_SS) free(sqinfo->ss);
63 if (sqinfo->flags & SQINFO_SA) free(sqinfo->sa);
64 }
65
66 int
SetSeqinfoString(SQINFO * sqinfo,char * sptr,int flag)67 SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
68 {
69 int len;
70 int pos;
71
72 while (*sptr == ' ') sptr++; /* ignore leading whitespace */
73 for (pos = strlen(sptr)-1; pos >= 0; pos--)
74 if (! isspace(sptr[pos])) break;
75 sptr[pos+1] = '\0'; /* ignore trailing whitespace */
76
77 switch (flag) {
78 case SQINFO_NAME:
79 if (*sptr != '-')
80 {
81 strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
82 sqinfo->name[SQINFO_NAMELEN-1] = '\0';
83 sqinfo->flags |= SQINFO_NAME;
84 }
85 break;
86
87 case SQINFO_ID:
88 if (*sptr != '-')
89 {
90 strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
91 sqinfo->id[SQINFO_NAMELEN-1] = '\0';
92 sqinfo->flags |= SQINFO_ID;
93 }
94 break;
95
96 case SQINFO_ACC:
97 if (*sptr != '-')
98 {
99 strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
100 sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
101 sqinfo->flags |= SQINFO_ACC;
102 }
103 break;
104
105 case SQINFO_DESC:
106 if (*sptr != '-')
107 {
108 if (sqinfo->flags & SQINFO_DESC) /* append? */
109 {
110 len = strlen(sqinfo->desc);
111 if (len < SQINFO_DESCLEN-2) /* is there room? */
112 {
113 strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
114 strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
115 }
116 }
117 else /* else copy */
118 strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
119 sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
120 sqinfo->flags |= SQINFO_DESC;
121 }
122 break;
123
124 case SQINFO_START:
125 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
126 sqinfo->start = atoi(sptr);
127 if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
128 break;
129
130 case SQINFO_STOP:
131 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
132 sqinfo->stop = atoi(sptr);
133 if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
134 break;
135
136 case SQINFO_OLEN:
137 if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
138 sqinfo->olen = atoi(sptr);
139 if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
140 break;
141
142 default:
143 Die("Invalid flag %d to SetSeqinfoString()", flag);
144 }
145 return 1;
146 }
147
148 void
SeqinfoCopy(SQINFO * sq1,SQINFO * sq2)149 SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
150 {
151 sq1->flags = sq2->flags;
152 if (sq2->flags & SQINFO_NAME) strcpy(sq1->name, sq2->name);
153 if (sq2->flags & SQINFO_ID) strcpy(sq1->id, sq2->id);
154 if (sq2->flags & SQINFO_ACC) strcpy(sq1->acc, sq2->acc);
155 if (sq2->flags & SQINFO_DESC) strcpy(sq1->desc, sq2->desc);
156 if (sq2->flags & SQINFO_LEN) sq1->len = sq2->len;
157 if (sq2->flags & SQINFO_START) sq1->start = sq2->start;
158 if (sq2->flags & SQINFO_STOP) sq1->stop = sq2->stop;
159 if (sq2->flags & SQINFO_OLEN) sq1->olen = sq2->olen;
160 if (sq2->flags & SQINFO_TYPE) sq1->type = sq2->type;
161 if (sq2->flags & SQINFO_SS) sq1->ss = Strdup(sq2->ss);
162 if (sq2->flags & SQINFO_SA) sq1->sa = Strdup(sq2->sa);
163 }
164
165 /* Function: ToDNA()
166 *
167 * Purpose: Convert a sequence to DNA.
168 * U --> T
169 */
170 void
ToDNA(char * seq)171 ToDNA(char *seq)
172 {
173 for (; *seq != '\0'; seq++)
174 {
175 if (*seq == 'U') *seq = 'T';
176 else if (*seq == 'u') *seq = 't';
177 }
178 }
179
180 /* Function: ToRNA()
181 *
182 * Purpose: Convert a sequence to RNA.
183 * T --> U
184 */
185 void
ToRNA(char * seq)186 ToRNA(char *seq)
187 {
188 for (; *seq != '\0'; seq++)
189 {
190 if (*seq == 'T') *seq = 'U';
191 else if (*seq == 't') *seq = 'u';
192 }
193 }
194
195
196 static int
isSeqChar(int c)197 isSeqChar(int c)
198 {
199 if (c > 127) return 0; /* IRIX 4.0 bug! isascii(255) returns TRUE */
200 return (isalpha(c) || strchr(seqsymbols,c));
201 }
202
203 static void
readline(FILE * f,char * s)204 readline(FILE *f, char *s)
205 {
206 char *cp;
207
208 if (NULL == fgets(s, LINEBUFLEN, f))
209 *s = 0;
210 else {
211 cp = strchr(s, '\n');
212 if (cp != NULL) *cp = 0;
213 }
214 }
215
216 /* Function: get_line()
217 * Date: SRE, Tue Mar 3 08:30:01 1998 [St. Louis]
218 *
219 * Purpose: read a line from a sequence file into V->sbuffer.
220 * If the fgets() is NULL, V->sbuffer is NULL.
221 * Trailing \n is chopped.
222 * If a trailing \n is not there, raise the
223 * lastlinelong flag in V; either we're at EOF or
224 * we have a very long line, over our fgets() buffer
225 * length.
226 *
227 * Args: V
228 *
229 * Returns: (void)
230 */
231 static void
get_line(struct ReadSeqVars * V)232 get_line(struct ReadSeqVars *V)
233 {
234 char *cp;
235
236 if (fgets(V->sbuffer, LINEBUFLEN, V->f) == NULL)
237 *(V->sbuffer) = '\0';
238 else {
239 cp = strchr(V->sbuffer, '\n');
240 if (cp != NULL) { *cp = '\0'; V->longline = FALSE; }
241 else V->longline = TRUE;
242 }
243 }
244
245
246 /* Function: addseq()
247 *
248 * Purpose: Add a line of sequence to the growing string in V.
249 */
250 static void
addseq(char * s,struct ReadSeqVars * V)251 addseq(char *s, struct ReadSeqVars *V)
252 {
253 while (*s != 0) {
254 if (isSeqChar((int) *s)) {
255 if (*s == '-' && V->dash_equals_n) *s = 'N';
256 if (V->seqlen >= V->maxseq) {
257 V->maxseq += kStartLength;
258 V->seq = (char*) ReallocOrDie (V->seq, V->maxseq+1);
259 }
260 V->seq[(V->seqlen)++] = *s;
261 }
262 s++;
263 }
264 }
265
266 static void
addstruc(char * s,struct ReadSeqVars * V)267 addstruc(char *s, struct ReadSeqVars *V)
268 {
269 char *sptr;
270
271 if (! (V->sqinfo->flags & SQINFO_SS))
272 {
273 V->sqinfo->ss = (char *) MallocOrDie ((V->maxseq+1) * sizeof(char));
274 V->sqinfo->flags |= SQINFO_SS;
275 sptr = V->sqinfo->ss;
276 }
277 else
278 {
279 V->sqinfo->ss = (char *) ReallocOrDie (V->sqinfo->ss, (V->maxseq+1) * sizeof(char));
280 sptr = V->sqinfo->ss;
281 while (*sptr != '\0') sptr++;
282 }
283
284 while (*s != 0)
285 {
286 if (isSeqChar((int)*s)) { *sptr = *s; sptr++; }
287 s++;
288 }
289 *sptr = '\0';
290 }
291
292
293 static void
readLoop(int addfirst,int (* endTest)(char *,int *),struct ReadSeqVars * V)294 readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
295 {
296 int addend = 0;
297 int done = 0;
298
299 V->seqlen = 0;
300 if (addfirst) addseq(V->sbuffer, V);
301 do {
302 get_line(V);
303 /* feof() alone is a bug; files not necessarily \n terminated */
304 if (*(V->sbuffer) == '\0' && feof(V->f))
305 done = TRUE;
306 done |= (*endTest)(V->sbuffer, &addend);
307 if (addend || !done)
308 addseq(V->sbuffer, V);
309 } while (!done);
310 }
311
312
313 static int
endPIR(char * s,int * addend)314 endPIR(char *s, int *addend)
315 {
316 *addend = 0;
317 if ((strncmp(s, "///", 3) == 0) ||
318 (strncmp(s, "ENTRY", 5) == 0))
319 return 1;
320 else
321 return 0;
322 }
323
324 static void
readPIR(struct ReadSeqVars * V)325 readPIR(struct ReadSeqVars *V)
326 {
327 char *sptr;
328 /* load first line of entry */
329 while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
330 get_line(V);
331 if (feof(V->f)) return;
332
333 if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL)
334 {
335 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
336 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
337 }
338 do {
339 get_line(V);
340 if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0)
341 SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC);
342 else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0)
343 {
344 if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL)
345 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
346 }
347 } while (! feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0));
348 get_line(V); /* skip next line, coords */
349
350 readLoop(0, endPIR, V);
351
352 /* reading a real PIR-CODATA database file, we keep the source coords
353 */
354 V->sqinfo->start = 1;
355 V->sqinfo->stop = V->seqlen;
356 V->sqinfo->olen = V->seqlen;
357 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
358
359 /* get next line
360 */
361 while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
362 get_line(V);
363 }
364
365
366
367 static int
endIG(char * s,int * addend)368 endIG(char *s, int *addend)
369 {
370 *addend = 1; /* 1 or 2 occur in line w/ bases */
371 return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
372 }
373
374 static void
readIG(struct ReadSeqVars * V)375 readIG(struct ReadSeqVars *V)
376 {
377 char *nm;
378 /* position past ';' comments */
379 do {
380 get_line(V);
381 } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) ));
382
383 if (!feof(V->f))
384 {
385 if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL)
386 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
387
388 readLoop(0, endIG, V);
389 }
390
391 while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';'))))
392 get_line(V);
393 }
394
395 static int
endStrider(char * s,int * addend)396 endStrider(char *s, int *addend)
397 {
398 *addend = 0;
399 return (strstr( s, "//") != NULL);
400 }
401
402 static void
readStrider(struct ReadSeqVars * V)403 readStrider(struct ReadSeqVars *V)
404 {
405 char *nm;
406
407 while ((!feof(V->f)) && (*V->sbuffer == ';'))
408 {
409 if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0)
410 {
411 if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL)
412 SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
413 }
414 get_line(V);
415 }
416
417 if (! feof(V->f))
418 readLoop(1, endStrider, V);
419
420 /* load next line
421 */
422 while ((!feof(V->f)) && (*V->sbuffer != ';'))
423 get_line(V);
424 }
425
426
427 static int
endGB(char * s,int * addend)428 endGB(char *s, int *addend)
429 {
430 *addend = 0;
431 return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
432 }
433
434 static void
readGenBank(struct ReadSeqVars * V)435 readGenBank(struct ReadSeqVars *V)
436 {
437 char *sptr;
438 int in_definition;
439
440 while (strncmp(V->sbuffer, "LOCUS", 5) != 0)
441 get_line(V);
442
443 if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
444 {
445 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
446 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
447 }
448
449 in_definition = FALSE;
450 while (! feof(V->f))
451 {
452 get_line(V);
453 if (! feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer)
454 {
455 if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL)
456 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
457 in_definition = TRUE;
458 }
459 else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer)
460 {
461 if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
462 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
463 in_definition = FALSE;
464 }
465 else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0)
466 {
467 if (in_definition)
468 SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);
469 }
470 else
471 break;
472 }
473
474 readLoop(0, endGB, V);
475
476 /* reading a real GenBank database file, we keep the source coords
477 */
478 V->sqinfo->start = 1;
479 V->sqinfo->stop = V->seqlen;
480 V->sqinfo->olen = V->seqlen;
481 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
482
483
484 while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer))))
485 get_line(V);
486 /* SRE: V->s now holds "//", so sequential
487 reads are wedged: fixed Tue Jul 13 1993 */
488 while (!feof(V->f) && strstr(V->sbuffer, "LOCUS ") != V->sbuffer)
489 get_line(V);
490 }
491
492 static int
endGCGdata(char * s,int * addend)493 endGCGdata(char *s, int *addend)
494 {
495 *addend = 0;
496 return (*s == '>');
497 }
498
499 static void
readGCGdata(struct ReadSeqVars * V)500 readGCGdata(struct ReadSeqVars *V)
501 {
502 int binary = FALSE; /* whether data are binary or not */
503 int blen; /* length of binary sequence */
504
505 /* first line contains ">>>>" followed by name */
506 if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0)
507 {
508 binary = TRUE;
509 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
510 blen = atoi(sqd_parse[2]);
511 }
512 else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0)
513 SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
514 else
515 Die("bogus GCGdata format? %s", V->sbuffer);
516
517 /* second line contains free text description */
518 get_line(V);
519 SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);
520
521 if (binary) {
522 /* allocate for blen characters +3... (allow for 3 bytes of slop) */
523 if (blen >= V->maxseq) {
524 V->maxseq = blen;
525 if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
526 Die("malloc failed");
527 }
528 /* read (blen+3)/4 bytes from file */
529 if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
530 Die("fread failed");
531 V->seqlen = blen;
532 /* convert binary code to seq */
533 GCGBinaryToSequence(V->seq, blen);
534 }
535 else readLoop(0, endGCGdata, V);
536
537 while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
538 get_line(V);
539 }
540
541 static int
endPearson(char * s,int * addend)542 endPearson(char *s, int *addend)
543 {
544 *addend = 0;
545 return(*s == '>');
546 }
547
548 static void
readPearson(struct ReadSeqVars * V)549 readPearson(struct ReadSeqVars *V)
550 {
551 char *sptr;
552
553 if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL)
554 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
555 if ((sptr = strtok(NULL, "\n")) != NULL)
556 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
557 /* workaround for long NCBI NR lines */
558 while (V->longline && ! feof(V->f)) get_line(V);
559
560 readLoop(0, endPearson, V);
561
562 while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
563 get_line(V);
564 }
565
566
567 static int
endEMBL(char * s,int * addend)568 endEMBL(char *s, int *addend)
569 {
570 *addend = 0;
571 /* Some people (Berlin 5S rRNA database, f'r instance) use
572 * an extended EMBL format that attaches extra data after
573 * the sequence -- watch out for that. We use the fact that
574 * real EMBL sequence lines begin with five spaces.
575 *
576 * We can use this as the sole end test because readEMBL() will
577 * advance to the next ID line before starting to read again.
578 */
579 return (strncmp(s," ",5) != 0);
580 /* return ((strstr(s,"//") != NULL) || (strstr(s,"ID ") == s)); */
581 }
582
583 static void
readEMBL(struct ReadSeqVars * V)584 readEMBL(struct ReadSeqVars *V)
585 {
586 char *sptr;
587
588 /* make sure we have first line */
589 while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0)
590 get_line(V);
591
592 if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL)
593 {
594 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
595 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
596 }
597
598 do {
599 get_line(V);
600 if (!feof(V->f) && strstr(V->sbuffer, "AC ") == V->sbuffer)
601 {
602 if ((sptr = strtok(V->sbuffer+5, "; \t\n")) != NULL)
603 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
604 }
605 else if (!feof(V->f) && strstr(V->sbuffer, "DE ") == V->sbuffer)
606 {
607 if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL)
608 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
609 }
610 } while (! feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0);
611
612 readLoop(0, endEMBL, V);
613
614 /* reading a real EMBL database file, we keep the source coords
615 */
616 V->sqinfo->start = 1;
617 V->sqinfo->stop = V->seqlen;
618 V->sqinfo->olen = V->seqlen;
619 V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
620
621 /* load next record's ID line */
622 while (!feof(V->f) && strncmp(V->sbuffer, "ID ", 4) != 0)
623 get_line(V);
624 }
625
626
627 static int
endZuker(char * s,int * addend)628 endZuker(char *s, int *addend)
629 {
630 *addend = 0;
631 return( *s == '(' );
632 }
633
634 static void
readZuker(struct ReadSeqVars * V)635 readZuker(struct ReadSeqVars *V)
636 {
637 char *sptr;
638
639 get_line(V); /*s == "seqLen seqid string..."*/
640
641 if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL)
642 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
643
644 if ((sptr = strtok(NULL, "\n")) != NULL)
645 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
646
647 readLoop(0, endZuker, V);
648
649 while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '('))))
650 get_line(V);
651 }
652
653 static void
readUWGCG(struct ReadSeqVars * V)654 readUWGCG(struct ReadSeqVars *V)
655 {
656 char *si;
657 char *sptr;
658 int done;
659
660 V->seqlen = 0;
661
662 /*writeseq: " %s Length: %d (today) Check: %d ..\n" */
663 /*drop above or ".." from id*/
664 if ((si = strstr(V->sbuffer," Length: ")) != NULL) *si = 0;
665 else if ((si = strstr(V->sbuffer,"..")) != NULL) *si = 0;
666
667 if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL)
668 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
669
670 do {
671 done = feof(V->f);
672 get_line(V);
673 if (! done) addseq(V->sbuffer, V);
674 } while (!done);
675 }
676
677
678 static void
readSquid(struct ReadSeqVars * V)679 readSquid(struct ReadSeqVars *V)
680 {
681 char *sptr;
682 int dostruc = FALSE;
683
684 while (strncmp(V->sbuffer, "NAM ", 4) != 0) get_line(V);
685
686 if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL)
687 SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
688
689 /*CONSTCOND*/
690 while (1)
691 {
692 get_line(V);
693 if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; }
694
695 if (strncmp(V->sbuffer, "SRC ", 4) == 0)
696 {
697 if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL)
698 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
699 if ((sptr = strtok(NULL, " \t\n")) != NULL)
700 SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
701 if ((sptr = strtok(NULL, ".")) != NULL)
702 SetSeqinfoString(V->sqinfo, sptr, SQINFO_START);
703 if ((sptr = strtok(NULL, ".:")) != NULL)
704 SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP);
705 if ((sptr = strtok(NULL, ": \t\n")) != NULL)
706 SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN);
707 }
708 else if (strncmp(V->sbuffer, "DES ", 4) == 0)
709 {
710 if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL)
711 SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
712 }
713 else if (strncmp(V->sbuffer,"SEQ", 3) == 0)
714 break;
715 }
716
717 if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE;
718
719 V->seqlen = 0;
720 /*CONSTCOND*/
721 while (1)
722 {
723 /* sequence line */
724 get_line(V);
725 if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0)
726 break;
727 addseq(V->sbuffer, V);
728 /* structure line */
729 if (dostruc)
730 {
731 get_line(V);
732 if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; }
733 addstruc(V->sbuffer, V);
734 }
735 }
736
737
738 while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0)
739 get_line(V);
740 }
741
742
743 /* Function: SeqfileOpen()
744 *
745 * Purpose : Open a sequence database file and prepare for reading
746 * sequentially.
747 *
748 * Args: filename - name of file to open
749 * format - format of file
750 * env - environment variable for path (e.g. BLASTDB)
751 *
752 * Returns opened SQFILE ptr, or NULL on failure.
753 */
754 SQFILE *
SeqfileOpen(char * filename,int format,char * env)755 SeqfileOpen(char *filename, int format, char *env)
756 {
757 SQFILE *dbfp;
758
759 dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
760 dbfp->format = format;
761 dbfp->longline = FALSE;
762
763 /* Open our file handle.
764 * Three possibilities:
765 * 1. normal file open
766 * 2. filename = "-"; read from stdin
767 * 3. filename = "*.gz"; read thru pipe from gzip
768 * If we're reading from stdin or a pipe, we can't reliably
769 * back up, so we can't do two-pass parsers like the interleaved alignment
770 * formats.
771 */
772 if (strcmp(filename, "-") == 0)
773 {
774 if (IsInterleavedFormat(format))
775 Die("Can't read interleaved alignment formats thru stdin, sorry");
776
777 dbfp->f = stdin;
778 dbfp->do_stdin = TRUE;
779 dbfp->do_gzip = FALSE;
780 }
781 else if (Strparse("^.*\\.gz$", filename, 0) == 0)
782 {
783 char cmd[256];
784
785 if (IsInterleavedFormat(format))
786 Die("Can't read interleaved alignment formats thru gunzip, sorry");
787
788 if (strlen(filename) + strlen("gzip -dc ") >= 256)
789 { squid_errno = SQERR_PARAMETER; return NULL; }
790 sprintf(cmd, "gzip -dc %s", filename);
791 if ((dbfp->f = popen(cmd, "r")) == NULL)
792 { squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */
793 dbfp->do_stdin = FALSE;
794 dbfp->do_gzip = TRUE;
795 }
796 else
797 {
798 if ((dbfp->f = fopen(filename, "r")) == NULL &&
799 (dbfp->f = EnvFileOpen(filename, env)) == NULL)
800 { squid_errno = SQERR_NOFILE; return NULL; }
801 dbfp->do_stdin = FALSE;
802 dbfp->do_gzip = FALSE;
803 }
804
805 /* The hack for sequential access of an interleaved alignment file:
806 * read the alignment in, we'll copy sequences out one at a time.
807 */
808 dbfp->ali_aseqs = NULL;
809 if (IsInterleavedFormat(format))
810 {
811 if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo)))
812 return NULL;
813 dbfp->ali_curridx = 0;
814 return dbfp;
815 }
816
817 /* Load the first line.
818 */
819 get_line(dbfp);
820
821 return dbfp;
822 }
823
824 /* Function: SeqfilePosition()
825 *
826 * Purpose: Move to a particular offset in a seqfile.
827 * Will not work on interleaved files (SELEX, MSF).
828 */
829 void
SeqfilePosition(SQFILE * sqfp,long offset)830 SeqfilePosition(SQFILE *sqfp, long offset)
831 {
832 if (sqfp->do_stdin || sqfp->do_gzip || IsInterleavedFormat(sqfp->format))
833 Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
834
835 fseek(sqfp->f, offset, SEEK_SET);
836 get_line(sqfp);
837 }
838
839
840 /* Function: SeqfileRewind()
841 *
842 * Purpose: Set a sequence file back to the first sequence.
843 * How to do this varies with whether the file is an
844 * unaligned file, or an alignment file for which
845 * we're hacking sequential access.
846 */
847 void
SeqfileRewind(SQFILE * sqfp)848 SeqfileRewind(SQFILE *sqfp)
849 {
850 if (sqfp->do_stdin || sqfp->do_gzip)
851 Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
852
853 if (sqfp->ali_aseqs != NULL) sqfp->ali_curridx = 0;
854 else {
855 rewind(sqfp->f);
856 get_line(sqfp);
857 }
858 }
859
860 void
SeqfileClose(SQFILE * sqfp)861 SeqfileClose(SQFILE *sqfp)
862 {
863 /* free static ptrs if we used them */
864 if (sqfp->ali_aseqs != NULL)
865 FreeAlignment(sqfp->ali_aseqs, &(sqfp->ali_ainfo));
866
867 if (sqfp->do_gzip) pclose(sqfp->f);
868 else if (! sqfp->do_stdin) fclose(sqfp->f);
869 free(sqfp);
870 }
871
872
873 /* Function: ReadSeq()
874 *
875 * Purpose: Read next sequence from an open database file.
876 * Return the sequence and associated info.
877 *
878 * Args: fp - open sequence database file pointer
879 * format - format of the file (previously determined
880 * by call to SeqfileFormat())
881 * ret_seq - RETURN: sequence
882 * sqinfo - RETURN: filled in w/ other information
883 *
884 * Return: 1 on success, 0 on failure.
885 * ret_seq and some field of sqinfo are allocated here,
886 * The preferred call mechanism to properly free the memory is:
887 *
888 * SQINFO sqinfo;
889 * char *seq;
890 *
891 * ReadSeq(fp, format, &seq, &sqinfo);
892 * ... do something...
893 * FreeSequence(seq, &sqinfo);
894 */
895 int
ReadSeq(SQFILE * V,int format,char ** ret_seq,SQINFO * sqinfo)896 ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
897 {
898 int gotuw;
899 int apos, rpos;
900
901 squid_errno = SQERR_OK;
902 if (format < kMinFormat || format > kMaxFormat)
903 {
904 squid_errno = SQERR_FORMAT;
905 *ret_seq = NULL;
906 return 0;
907 }
908
909 /* Here's the hack for sequential access of sequences from
910 * the multiple sequence alignment formats
911 */
912 if (format == kMSF || format == kSelex || format == kClustal) {
913 if (V->ali_curridx >= V->ali_ainfo.nseq)
914 return 0; /* out of aseqs */
915
916 SeqinfoCopy(sqinfo, &(V->ali_ainfo.sqinfo[V->ali_curridx]));
917
918 /* copy and dealign the appropriate aligned seq */
919 V->seq = MallocOrDie (sizeof(char) * (V->ali_ainfo.alen+1));
920 for (rpos = apos = 0; apos < V->ali_ainfo.alen; apos++)
921 if (!isgap(V->ali_aseqs[V->ali_curridx][apos]))
922 V->seq[rpos++] = V->ali_aseqs[V->ali_curridx][apos];
923 V->seq[rpos] = '\0';
924 V->seqlen = rpos;
925 V->ali_curridx++;
926 }
927 else {
928 if (feof(V->f)) return 0;
929
930 V->seq = (char*) calloc (kStartLength+1, sizeof(char));
931 V->maxseq = kStartLength;
932 V->seqlen = 0;
933 V->sqinfo = sqinfo;
934 V->sqinfo->flags = 0;
935 V->dash_equals_n = (format == kEMBL) ? TRUE : FALSE;
936
937 switch (format) {
938 case kIG : readIG(V); break;
939 case kStrider : readStrider(V); break;
940 case kGenBank : readGenBank(V); break;
941 case kPearson : readPearson(V); break;
942 case kEMBL : readEMBL(V); break;
943 case kZuker : readZuker(V); break;
944 case kPIR : readPIR(V); break;
945 case kSquid : readSquid(V); break;
946 case kGCGdata : readGCGdata(V); break;
947
948 case kGCG:
949 do { /* skip leading comments on GCG file */
950 gotuw = (strstr(V->sbuffer,"..") != NULL);
951 if (gotuw) readUWGCG(V);
952 get_line(V);
953 } while (! feof(V->f));
954 break;
955
956 case kIdraw: /* SRE: no attempt to read idraw postscript */
957 default:
958 squid_errno = SQERR_FORMAT;
959 free(V->seq);
960 return 0;
961 }
962 V->seq[V->seqlen] = 0; /* stick a string terminator on it */
963 }
964
965 /* Cleanup
966 */
967 sqinfo->len = V->seqlen;
968 sqinfo->flags |= SQINFO_LEN;
969 *ret_seq = V->seq;
970 if (squid_errno == SQERR_OK) return 1; else return 0;
971 }
972
973 /* Function: SeqfileFormat()
974 *
975 * Purpose: Determine format of seqfile, and return it
976 * through ret_format. From Gilbert's seqFileFormat().
977 *
978 * If filename is "-", we will read from stdin and
979 * assume that the stream is coming in FASTA format --
980 * either unaligned or aligned.
981 *
982 * Args: filename - name of sequence file
983 * ret_format - RETURN: format code for file, see squid.h
984 * for codes.
985 * env - name of environment variable containing
986 * a directory path that filename might also be
987 * found in. "BLASTDB", for example. Can be NULL.
988 *
989 * Return: 1 on success, 0 on failure.
990 */
991 int
SeqfileFormat(char * filename,int * ret_format,char * env)992 SeqfileFormat(char *filename, int *ret_format, char *env)
993 {
994 int foundIG = 0;
995 int foundStrider = 0;
996 int foundGB = 0;
997 int foundEMBL = 0;
998 int foundPearson = 0;
999 int foundZuker = 0;
1000 int gotGCGdata = 0;
1001 int gotPIR = 0;
1002 int gotSquid = 0;
1003 int gotuw = 0;
1004 int gotMSF = 0;
1005 int gotClustal = 0;
1006 int done = 0;
1007 int format = kUnknown;
1008 int nlines= 0, dnalines= 0;
1009 int splen = 0;
1010 char sp[LINEBUFLEN];
1011 FILE *fseq;
1012
1013 /* First check if filename is "-": special case indicating
1014 * a FASTA pipe.
1015 */
1016 if (strcmp(filename, "-") == 0)
1017 { *ret_format = kPearson; return 1; }
1018
1019 #define ReadOneLine(sp) \
1020 { done |= (feof(fseq)); \
1021 readline( fseq, sp); \
1022 if (!done) { splen = (int) strlen(sp); ++nlines; } }
1023
1024 if ((fseq = fopen(filename, "r")) == NULL &&
1025 (fseq = EnvFileOpen(filename, env)) == NULL)
1026 { squid_errno = SQERR_NOFILE; return 0; }
1027
1028 /* Look at a line at a time
1029 */
1030 while ( !done ) {
1031 ReadOneLine(sp);
1032
1033 if (sp==NULL || *sp=='\0')
1034 /*EMPTY*/ ;
1035
1036 /* high probability identities: */
1037
1038 else if (strstr(sp, " MSF:") != NULL &&
1039 strstr(sp, " Type:") != NULL &&
1040 strstr(sp, " Check:") != NULL)
1041 gotMSF = 1;
1042
1043 else if (strncmp(sp, "CLUSTAL ", 8) == 0 &&
1044 strstr( sp, "multiple sequence alignment"))
1045 gotClustal = 1;
1046
1047 else if (strstr(sp," Check: ") != NULL)
1048 gotuw= 1;
1049
1050 else if (strncmp(sp, "///", 3) == 0 || strncmp(sp, "ENTRY ", 6) == 0)
1051 gotPIR = 1;
1052
1053 else if (strncmp(sp, "++", 2) == 0 || strncmp(sp, "NAM ", 4) == 0)
1054 gotSquid = 1;
1055
1056 else if (strncmp(sp, ">>>>", 4) == 0 && strstr(sp, "Len: "))
1057 gotGCGdata = 1;
1058
1059 /* uncertain identities: */
1060
1061 else if (*sp ==';') {
1062 if (strstr(sp,"Strider") !=NULL) foundStrider= 1;
1063 else foundIG= 1;
1064 }
1065 else if (strncmp(sp,"LOCUS",5) == 0 || strncmp(sp,"ORIGIN",5) == 0)
1066 foundGB= 1;
1067
1068 else if (*sp == '>') {
1069 foundPearson = 1;
1070 }
1071
1072 else if (strstr(sp,"ID ") == sp || strstr(sp,"SQ ") == sp)
1073 foundEMBL= 1;
1074
1075 else if (*sp == '(')
1076 foundZuker= 1;
1077
1078 else {
1079 switch (Seqtype( sp )) {
1080 case kDNA:
1081 case kRNA: if (splen>20) dnalines++; break;
1082 default: break;
1083 }
1084 }
1085
1086 if (gotMSF) {format = kMSF; done = 1; }
1087 else if (gotClustal) {format = kClustal; done = 1; }
1088 else if (gotSquid) {format = kSquid; done = 1; }
1089 else if (gotPIR) {format = kPIR; done = 1; }
1090 else if (gotGCGdata) {format = kGCGdata; done = 1; }
1091 else if (gotuw)
1092 {
1093 if (foundIG) format= kIG; /* a TOIG file from GCG for certain */
1094 else format= kGCG;
1095 done= 1;
1096 }
1097 else if ((dnalines > 1) || done || (nlines > 500)) {
1098 /* decide on most likely format */
1099 /* multichar idents: */
1100 if (foundStrider) format= kStrider;
1101 else if (foundGB) format= kGenBank;
1102 else if (foundEMBL) format= kEMBL;
1103 /* single char idents: */
1104 else if (foundIG) format= kIG;
1105 else if (foundPearson) format= kPearson;
1106 else if (foundZuker) format= kZuker;
1107 /* spacing ident: */
1108 else if (IsSELEXFormat(filename)) format= kSelex;
1109 /* no format chars: */
1110 else
1111 {
1112 squid_errno = SQERR_FORMAT;
1113 return 0;
1114 }
1115
1116 done= 1;
1117 }
1118 }
1119
1120 if (fseq!=NULL) fclose(fseq);
1121
1122 *ret_format = format;
1123 return 1;
1124 #undef ReadOneLine
1125 }
1126
1127 /* Function: GCGBinaryToSequence()
1128 *
1129 * Purpose: Convert a GCG 2BIT binary string to DNA sequence.
1130 * 0 = C 1 = T 2 = A 3 = G
1131 * 4 nts/byte
1132 *
1133 * Args: seq - binary sequence. Converted in place to DNA.
1134 * len - length of DNA. binary is (len+3)/4 bytes
1135 */
1136 int
GCGBinaryToSequence(char * seq,int len)1137 GCGBinaryToSequence(char *seq, int len)
1138 {
1139 int bpos; /* position in binary */
1140 int spos; /* position in sequence */
1141 char twobit;
1142 int i;
1143
1144 for (bpos = (len-1)/4; bpos >= 0; bpos--)
1145 {
1146 twobit = seq[bpos];
1147 spos = bpos*4;
1148
1149 for (i = 3; i >= 0; i--)
1150 {
1151 switch (twobit & 0x3) {
1152 case 0: seq[spos+i] = 'C'; break;
1153 case 1: seq[spos+i] = 'T'; break;
1154 case 2: seq[spos+i] = 'A'; break;
1155 case 3: seq[spos+i] = 'G'; break;
1156 }
1157 twobit = twobit >> 2;
1158 }
1159 }
1160 seq[len] = '\0';
1161 return 1;
1162 }
1163
1164
1165 int
GCGchecksum(char * seq,int seqlen)1166 GCGchecksum(char *seq, int seqlen)
1167 {
1168 int check = 0, count = 0, i;
1169
1170 for (i = 0; i < seqlen; i++) {
1171 count++;
1172 check += count * sre_toupper((int) seq[i]);
1173 if (count == 57) count = 0;
1174 }
1175 return (check % 10000);
1176 }
1177
1178
1179 /* Function: GCGMultchecksum()
1180 *
1181 * Purpose: Simple modification of GCGchecksum(),
1182 * to create a checksum for multiple sequences.
1183 * Gaps count.
1184 *
1185 * Args: seqs - sequences to be checksummed
1186 * nseq - number of sequences
1187 *
1188 * Return: the checksum, a number between 0 and 9999
1189 */
1190 int
GCGMultchecksum(char ** seqs,int nseq)1191 GCGMultchecksum(char **seqs, int nseq)
1192 {
1193 int check = 0;
1194 int count = 0;
1195 int idx;
1196 char *sptr;
1197
1198 for (idx = 0; idx < nseq; idx++)
1199 for (sptr = seqs[idx]; *sptr; sptr++)
1200 {
1201 count++;
1202 check += count * sre_toupper((int) *sptr);
1203 if (count == 57) count = 0;
1204 }
1205 return (check % 10000);
1206 }
1207
1208
1209
1210
1211 /* Function: Seqtype()
1212 *
1213 * Purpose: Returns a (very good) guess about type of sequence:
1214 * kDNA, kRNA, kAmino, or kOtherSeq.
1215 *
1216 * Modified from, and replaces, Gilbert getseqtype().
1217 */
1218 int
Seqtype(char * seq)1219 Seqtype(char *seq)
1220 {
1221 int saw; /* how many non-gap characters I saw */
1222 char c;
1223 int po = 0; /* count of protein-only */
1224 int nt = 0; /* count of t's */
1225 int nu = 0; /* count of u's */
1226 int na = 0; /* count of nucleotides */
1227 int aa = 0; /* count of amino acids */
1228 int no = 0; /* count of others */
1229
1230 /* Look at the first 300 non-gap characters
1231 */
1232 for (saw = 0; *seq != '\0' && saw < 300; seq++)
1233 {
1234 c = sre_toupper((int) *seq);
1235 if (! isgap(c))
1236 {
1237 if (strchr(protonly, c)) po++;
1238 else if (strchr(primenuc,c)) {
1239 na++;
1240 if (c == 'T') nt++;
1241 else if (c == 'U') nu++;
1242 }
1243 else if (strchr(aminos,c)) aa++;
1244 else if (isalpha(c)) no++;
1245 saw++;
1246 }
1247 }
1248
1249 if (no > 0) return kOtherSeq;
1250 else if (po > 0) return kAmino;
1251 else if (na > aa) {
1252 if (nu > nt) return kRNA;
1253 else return kDNA;
1254 }
1255 else return kAmino;
1256 }
1257
1258 int
WriteSeq(FILE * outf,int outform,char * seq,SQINFO * sqinfo)1259 WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
1260 {
1261 int numline = 0;
1262 int lines = 0, spacer = 0, width = 50, tab = 0;
1263 int i, j, l, l1, ibase;
1264 char endstr[10];
1265 char s[100]; /* buffer for sequence */
1266 char ss[100]; /* buffer for structure */
1267 int checksum = 0;
1268 int seqlen;
1269 int which_case; /* 0 = do nothing. 1 = upper case. 2 = lower case */
1270 int dostruc; /* TRUE to print structure lines*/
1271
1272 which_case = 0;
1273 dostruc = FALSE;
1274 seqlen = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
1275
1276 /* intercept Selex-format requests - SRE */
1277 if (outform == kSelex) {
1278 fprintf(outf, "%10s %s\n", sqinfo->name, seq);
1279 return 1;
1280 }
1281
1282 if (outform == kClustal || outform == kMSF) {
1283 Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad.");
1284 return 1;
1285 }
1286
1287 strcpy( endstr,"");
1288 l1 = 0;
1289
1290 /* 10Nov91: write this out in all possible formats: */
1291 checksum = GCGchecksum(seq, seqlen);
1292
1293 switch (outform) {
1294
1295 case kUnknown: /* no header, just sequence */
1296 strcpy(endstr,"\n"); /* end w/ extra blank line */
1297 break;
1298
1299 case kGenBank:
1300 fprintf(outf,"LOCUS %s %d bp\n",
1301 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name,
1302 seqlen);
1303 fprintf(outf,"DEFINITION %s\n",
1304 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1305 fprintf(outf,"ACCESSION %s\n",
1306 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1307 fprintf(outf,"ORIGIN \n");
1308 spacer = 11;
1309 numline = 1;
1310 strcpy(endstr, "\n//");
1311 break;
1312
1313 case kGCGdata:
1314 fprintf(outf, ">>>>%s 9/95 ASCII Len: %d\n", sqinfo->name, seqlen);
1315 fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1316 break;
1317
1318 case kPIR:
1319 fprintf(outf, "ENTRY %s\n",
1320 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1321 fprintf(outf, "TITLE %s\n",
1322 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1323 fprintf(outf, "ACCESSION %s\n",
1324 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1325 fprintf(outf, "SUMMARY #Length %d #Checksum %d\n",
1326 sqinfo->len, checksum);
1327 fprintf(outf, "SEQUENCE\n");
1328 fprintf(outf, " 5 10 15 20 25 30\n");
1329 spacer = 2; /* spaces after every residue */
1330 numline = 1; /* number lines w/ coords */
1331 width = 30; /* 30 aa per line */
1332 strcpy(endstr, "\n///");
1333 break;
1334
1335 case kSquid:
1336 fprintf(outf, "NAM %s\n", sqinfo->name);
1337 if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
1338 fprintf(outf, "SRC %s %s %d..%d::%d\n",
1339 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : "-",
1340 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-",
1341 (sqinfo->flags & SQINFO_START) ? sqinfo->start : 0,
1342 (sqinfo->flags & SQINFO_STOP) ? sqinfo->stop : 0,
1343 (sqinfo->flags & SQINFO_OLEN) ? sqinfo->olen : 0);
1344 if (sqinfo->flags & SQINFO_DESC)
1345 fprintf(outf, "DES %s\n", sqinfo->desc);
1346 if (sqinfo->flags & SQINFO_SS)
1347 {
1348 fprintf(outf, "SEQ +SS\n");
1349 dostruc = TRUE; /* print structure lines too */
1350 }
1351 else
1352 fprintf(outf, "SEQ\n");
1353 numline = 1; /* number seq lines w/ coords */
1354 strcpy(endstr, "\n++");
1355 break;
1356
1357 case kEMBL:
1358 fprintf(outf,"ID %s\n",
1359 (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1360 fprintf(outf,"AC %s\n",
1361 (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1362 fprintf(outf,"DE %s\n",
1363 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1364 fprintf(outf,"SQ %d BP\n", seqlen);
1365 strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1366 tab = 5; /** added 31jan91 */
1367 spacer = 11; /** added 31jan91 */
1368 break;
1369
1370 case kGCG:
1371 fprintf(outf,"%s\n", sqinfo->name);
1372 if (sqinfo->flags & SQINFO_ACC)
1373 fprintf(outf,"ACCESSION %s\n", sqinfo->acc);
1374 if (sqinfo->flags & SQINFO_DESC)
1375 fprintf(outf,"DEFINITION %s\n", sqinfo->desc);
1376 fprintf(outf," %s Length: %d (today) Check: %d ..\n",
1377 sqinfo->name, seqlen, checksum);
1378 spacer = 11;
1379 numline = 1;
1380 strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */
1381 break;
1382
1383 case kStrider: /* ?? map ?*/
1384 fprintf(outf,"; ### from DNA Strider ;-)\n");
1385 fprintf(outf,"; DNA sequence %s, %d bases, %d checksum.\n;\n",
1386 sqinfo->name, seqlen, checksum);
1387 strcpy(endstr, "\n//");
1388 break;
1389
1390 /* SRE: Don had Zuker default to Pearson, which is not
1391 intuitive or helpful, since Zuker's MFOLD can't read
1392 Pearson format. More useful to use kIG */
1393 case kZuker:
1394 which_case = 1; /* MFOLD requires upper case. */
1395 /*FALLTHRU*/
1396 case kIG:
1397 fprintf(outf,";%s %s\n",
1398 sqinfo->name,
1399 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1400 fprintf(outf,"%s\n", sqinfo->name);
1401 strcpy(endstr,"1"); /* == linear dna */
1402 break;
1403
1404 case kRaw: /* Raw: just print the whole sequence. */
1405 fprintf(outf, "%s\n", seq);
1406 return 1;
1407
1408 default :
1409 case kPearson:
1410 fprintf(outf,">%s %s\n", sqinfo->name,
1411 (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1412 break;
1413 }
1414
1415 if (which_case == 1) s2upper(seq);
1416 if (which_case == 2) s2lower(seq);
1417
1418
1419 width = MIN(width,100);
1420 for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
1421 if (l1 < 0) l1 = 0;
1422 else if (l1 == 0) {
1423 if (numline) fprintf(outf,"%8d ",ibase);
1424 for (j=0; j<tab; j++) fputc(' ',outf);
1425 }
1426 if ((spacer != 0) && ((l+1) % spacer == 1))
1427 { s[l] = ' '; ss[l] = ' '; l++; }
1428 s[l] = seq[i];
1429 ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
1430 l++; i++;
1431 l1++; /* don't count spaces for width*/
1432 if (l1 == width || i == seqlen) {
1433 s[l] = ss[l] = '\0';
1434 l = 0; l1 = 0;
1435 if (dostruc)
1436 {
1437 fprintf(outf, "%s\n", s);
1438 if (numline) fprintf(outf," ");
1439 for (j=0; j<tab; j++) fputc(' ',outf);
1440 if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
1441 else fprintf(outf,"%s\n",ss);
1442 }
1443 else
1444 {
1445 if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
1446 else fprintf(outf,"%s\n",s);
1447 }
1448 lines++;
1449 ibase = i+1;
1450 }
1451 }
1452 return lines;
1453 }
1454
1455
1456 /* Function: ReadMultipleRseqs()
1457 *
1458 * Purpose: Open a data file and
1459 * parse it into an array of rseqs (raw, unaligned
1460 * sequences).
1461 *
1462 * Caller is responsible for free'ing memory allocated
1463 * to ret_rseqs, ret_weights, and ret_names.
1464 *
1465 * Weights are currently only supported for MSF format.
1466 * Sequences read from all other formats will be assigned
1467 * weights of 1.0. If the caller isn't interested in
1468 * weights, it passes NULL as ret_weights.
1469 *
1470 * Returns 1 on success. Returns 0 on failure and sets
1471 * squid_errno to indicate the cause.
1472 */
1473 int
ReadMultipleRseqs(char * seqfile,int fformat,char *** ret_rseqs,SQINFO ** ret_sqinfo,int * ret_num)1474 ReadMultipleRseqs(char *seqfile,
1475 int fformat,
1476 char ***ret_rseqs,
1477 SQINFO **ret_sqinfo,
1478 int *ret_num)
1479 {
1480 SQINFO *sqinfo; /* array of sequence optional info */
1481 SQFILE *dbfp; /* open ptr for sequential access of file */
1482 char **rseqs; /* sequence array */
1483 char **aseqs; /* aligned sequences, if file is aligned */
1484 AINFO ainfo; /* alignment-associated information */
1485 int numalloced; /* num of seqs currently alloced for */
1486 int idx;
1487 int num;
1488
1489 if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
1490 {
1491 if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
1492 if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs)) return 0;
1493
1494 /* copy the sqinfo array
1495 */
1496 num = ainfo.nseq;
1497 sqinfo= (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq);
1498 for (idx = 0; idx < ainfo.nseq; idx++)
1499 SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
1500 FreeAlignment(aseqs, &ainfo);
1501 }
1502 else
1503 {
1504 /* initial alloc */
1505 num = 0;
1506 numalloced = 16;
1507 rseqs = (char **) MallocOrDie (numalloced * sizeof(char *));
1508 sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
1509 if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
1510
1511 while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
1512 {
1513 num++;
1514 if (num == numalloced) /* more seqs coming, alloc more room */
1515 {
1516 numalloced += 16;
1517 rseqs = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
1518 sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
1519 }
1520 }
1521 SeqfileClose(dbfp);
1522 }
1523
1524 *ret_rseqs = rseqs;
1525 *ret_sqinfo = sqinfo;
1526 *ret_num = num;
1527 return 1;
1528 }
1529
1530
1531
1532
1533 char *
SeqFormatString(int code)1534 SeqFormatString(int code)
1535 {
1536 switch (code) {
1537 case kUnknown: return "Unknown";
1538 case kIG: return "Intelligenetics";
1539 case kGenBank: return "GenBank";
1540 case kEMBL: return "EMBL";
1541 case kGCG: return "GCG";
1542 case kGCGdata: return "GCG datalibrary";
1543 case kStrider: return "Strider";
1544 case kPearson: return "FASTA";
1545 case kZuker: return "Zuker";
1546 case kIdraw: return "Idraw";
1547 case kSelex: return "SELEX";
1548 case kMSF: return "MSF";
1549 case kClustal: return "Clustal";
1550 case kPIR: return "PIR-CODATA";
1551 case kRaw: return "raw";
1552 case kSquid: return "Squid";
1553 default: return "(bad code)";
1554 }
1555 }
1556
1557
1558
1559 /* Function: GSIOpen()
1560 *
1561 * Purpose: Open a GSI file. Returns the number of records in
1562 * the file and a file pointer. Returns NULL on failure.
1563 * The file pointer should be fclose()'d normally.
1564 */
1565 GSIFILE *
GSIOpen(char * gsifile)1566 GSIOpen(char *gsifile)
1567 {
1568 GSIFILE *gsi;
1569 char magic[GSI_KEYSIZE];
1570
1571 gsi = (GSIFILE *) MallocOrDie (sizeof(GSIFILE));
1572 if ((gsi->gsifp = fopen(gsifile, "r")) == NULL)
1573 { squid_errno = SQERR_NOFILE; return NULL; }
1574
1575 if (! fread(magic, sizeof(char), GSI_KEYSIZE, gsi->gsifp))
1576 { squid_errno = SQERR_NODATA; return NULL; }
1577 if (strcmp(magic, "GSI") != 0)
1578 { squid_errno = SQERR_FORMAT; return NULL; }
1579
1580 if (! fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp))
1581 { squid_errno = SQERR_NODATA; return NULL; }
1582 if (! fread(&(gsi->recnum), sizeof(sqd_uint32), 1, gsi->gsifp))
1583 { squid_errno = SQERR_NODATA; return NULL; }
1584
1585 gsi->nfiles = sre_ntohs(gsi->nfiles); /* convert from network short */
1586 gsi->recnum = sre_ntohl(gsi->recnum); /* convert from network long */
1587
1588 return gsi;
1589 }
1590
1591 /* Function: GSIGetRecord()
1592 *
1593 * Purpose: Each non-header record of a GSI index files consists
1594 * of 38 bytes: 32 bytes of character string, a 2 byte
1595 * short, and a 4 byte long. This function returns the
1596 * three values.
1597 *
1598 * Args: gsi - open GSI index file, correctly positioned at a record
1599 * f1 - char[32], allocated by caller (or NULL if unwanted)
1600 * f2 - pointer to short (or NULL if unwanted)
1601 * f3 - pointer to long (or NULL if unwanted)
1602 *
1603 * Return: 0 on failure and sets squid_errno.
1604 */
1605 static int
GSIGetRecord(GSIFILE * gsi,char * f1,sqd_uint16 * f2,sqd_uint32 * f3)1606 GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3)
1607 {
1608 if (f1 == NULL) fseek(gsi->gsifp, GSI_KEYSIZE, SEEK_CUR);
1609 else if (! fread(f1, GSI_KEYSIZE, 1, gsi->gsifp))
1610 { squid_errno = SQERR_NODATA; return 0; }
1611
1612 if (f2 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR);
1613 else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp))
1614 { squid_errno = SQERR_NODATA; return 0; }
1615
1616 if (f3 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint32), SEEK_CUR);
1617 else if (! fread(f3, sizeof(sqd_uint32), 1, gsi->gsifp))
1618 { squid_errno = SQERR_NODATA; return 0; }
1619
1620 if (f2 != NULL) *f2 = sre_ntohs(*f2);
1621 if (f3 != NULL) *f3 = sre_ntohl(*f3);
1622
1623 return 1;
1624 }
1625
1626
1627 /* Function: GSIGetOffset()
1628 *
1629 * Purpose: From a key (sequence name), find a disk offset
1630 * in an open general sequence index file by binary
1631 * search. Presumably GSI indexing could be even faster
1632 * if we used hashing.
1633 *
1634 * Args: gsi - GSI index file, opened by GSIOpen()
1635 * key - name of key to retrieve indices for
1636 * ret_seqfile - pre-alloced char[32] array for seqfile name
1637 * ret_fmt - format of seqfile
1638 * ret_offset - return: disk offset in seqfile.
1639 */
1640 int
GSIGetOffset(GSIFILE * gsi,char * key,char * ret_seqfile,int * ret_format,long * ret_offset)1641 GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile,
1642 int *ret_format, long *ret_offset)
1643 {
1644 sqd_uint32 left, right, mid;
1645 int cmp;
1646 char name[GSI_KEYSIZE + 1];
1647 sqd_uint32 offset;
1648 sqd_uint16 filenum;
1649 sqd_uint32 fmt;
1650
1651 name[GSI_KEYSIZE] = '\0';
1652
1653 left = gsi->nfiles + 1;
1654 right = gsi->nfiles + gsi->recnum;
1655 mid = (left + right) / 2;
1656 fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
1657
1658 while (GSIGetRecord(gsi, name, &filenum, &offset))
1659 {
1660 cmp = strcmp(name, key);
1661 if (cmp == 0) break; /* found it! */
1662 else if (left >= right) return 0; /* oops, missed it; fail. */
1663 else if (cmp < 0) left = mid + 1; /* it's right of mid */
1664 else if (cmp > 0) right = mid - 1; /* it's left of mid */
1665 mid = (left + right) / 2;
1666 fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
1667 }
1668
1669 /* Using file number, look up the sequence file and format.
1670 */
1671 fseek(gsi->gsifp, filenum * GSI_RECSIZE, SEEK_SET);
1672 GSIGetRecord(gsi, ret_seqfile, NULL, &fmt);
1673 *ret_format = (int) fmt;
1674 *ret_offset = (long) offset;
1675
1676 return 1;
1677 }
1678
1679 /* Function: GSIClose()
1680 *
1681 * Purpose: Close an open GSI sequence index file.
1682 */
1683 void
GSIClose(GSIFILE * gsi)1684 GSIClose(GSIFILE *gsi)
1685 {
1686 fclose(gsi->gsifp);
1687 free(gsi);
1688 }
1689
1690
1691