1 /* SQUID - A C function library for biological sequence analysis
2  * Copyright (C) 1992-1996 Sean R. Eddy
3  *
4  *    This source code is distributed under terms of the
5  *    GNU General Public License. See the files COPYING
6  *    and GNULICENSE for further details.
7  *
8  */
9 
10 /* File: sqio.c
11  * From: ureadseq.c in Don Gilbert's sequence i/o package
12  *
13  * Reads and writes nucleic/protein sequence in various
14  * formats. Data files may have multiple sequences.
15  *
16  * Heavily modified from READSEQ package
17  * Copyright (C) 1990 by D.G. Gilbert
18  * Biology Dept., Indiana University, Bloomington, IN 47405
19  * email: gilbertd@bio.indiana.edu
20  * Thanks Don!
21  *
22  * SRE: Modifications as noted. Fri Jul  3 09:44:54 1992
23  *      Packaged for squid, Thu Oct  1 10:07:11 1992
24  *      ANSI conversion in full swing, Mon Jul 12 12:22:21 1993
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <ctype.h>
31 
32 #ifndef SEEK_SET
33 #include <unistd.h>	/* may SunOS rot in hell */
34 #endif
35 
36 #include "squid.h"
37 
38 #ifdef MEMDEBUG
39 #include "dbmalloc.h"
40 #endif
41 
42 #define kStartLength  500
43 
44 static char *aminos      = "ABCDEFGHIKLMNPQRSTVWXYZ*";
45 static char *primenuc    = "ACGTUN";
46 static char *protonly    = "EFIPQZ";
47 /* static char  stdsymbols[6]   = "_.-*?"; */
48 static char  allsymbols[32]  = "_.-*?<>{}[]()!@#$%^&=+;:'|`~\"\\";
49 static char *seqsymbols  = allsymbols;
50 /*
51     use general form of isseqchar -- all chars + symbols.
52     no formats except nbrf (?) use symbols in data area as
53     anything other than sequence chars.
54     (wrong. PIR-CODATA does. Remove /)
55 */
56 
57 
58 void
FreeSequence(char * seq,SQINFO * sqinfo)59 FreeSequence(char *seq, SQINFO *sqinfo)
60 {
61   if (seq != NULL) free(seq);
62   if (sqinfo->flags & SQINFO_SS)   free(sqinfo->ss);
63   if (sqinfo->flags & SQINFO_SA)   free(sqinfo->sa);
64 }
65 
66 int
SetSeqinfoString(SQINFO * sqinfo,char * sptr,int flag)67 SetSeqinfoString(SQINFO *sqinfo, char *sptr, int flag)
68 {
69   int len;
70   int pos;
71 
72   while (*sptr == ' ') sptr++; /* ignore leading whitespace */
73   for (pos = strlen(sptr)-1; pos >= 0; pos--)
74     if (! isspace(sptr[pos])) break;
75   sptr[pos+1] = '\0';	       /* ignore trailing whitespace */
76 
77   switch (flag) {
78   case SQINFO_NAME:
79     if (*sptr != '-')
80       {
81 	strncpy(sqinfo->name, sptr, SQINFO_NAMELEN-1);
82 	sqinfo->name[SQINFO_NAMELEN-1] = '\0';
83 	sqinfo->flags   |= SQINFO_NAME;
84       }
85     break;
86 
87   case SQINFO_ID:
88     if (*sptr != '-')
89       {
90 	strncpy(sqinfo->id, sptr, SQINFO_NAMELEN-1);
91 	sqinfo->id[SQINFO_NAMELEN-1] = '\0';
92 	sqinfo->flags |= SQINFO_ID;
93       }
94     break;
95 
96   case SQINFO_ACC:
97     if (*sptr != '-')
98       {
99 	strncpy(sqinfo->acc, sptr, SQINFO_NAMELEN-1);
100 	sqinfo->acc[SQINFO_NAMELEN-1] = '\0';
101 	sqinfo->flags   |= SQINFO_ACC;
102       }
103     break;
104 
105   case SQINFO_DESC:
106     if (*sptr != '-')
107       {
108 	if (sqinfo->flags & SQINFO_DESC) /* append? */
109 	  {
110 	    len = strlen(sqinfo->desc);
111 	    if (len < SQINFO_DESCLEN-2)	/* is there room? */
112 	      {
113 		strncat(sqinfo->desc, " ", SQINFO_DESCLEN-1-len); len++;
114 		strncat(sqinfo->desc, sptr, SQINFO_DESCLEN-1-len);
115 	      }
116 	  }
117 	else			/* else copy */
118 	  strncpy(sqinfo->desc, sptr, SQINFO_DESCLEN-1);
119 	sqinfo->desc[SQINFO_DESCLEN-1] = '\0';
120 	sqinfo->flags   |= SQINFO_DESC;
121       }
122     break;
123 
124   case SQINFO_START:
125     if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
126     sqinfo->start = atoi(sptr);
127     if (sqinfo->start != 0) sqinfo->flags |= SQINFO_START;
128     break;
129 
130   case SQINFO_STOP:
131     if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
132     sqinfo->stop = atoi(sptr);
133     if (sqinfo->stop != 0) sqinfo->flags |= SQINFO_STOP;
134     break;
135 
136   case SQINFO_OLEN:
137     if (!IsInt(sptr)) { squid_errno = SQERR_FORMAT; return 0; }
138     sqinfo->olen = atoi(sptr);
139     if (sqinfo->olen != 0) sqinfo->flags |= SQINFO_OLEN;
140     break;
141 
142   default:
143     Die("Invalid flag %d to SetSeqinfoString()", flag);
144   }
145   return 1;
146 }
147 
148 void
SeqinfoCopy(SQINFO * sq1,SQINFO * sq2)149 SeqinfoCopy(SQINFO *sq1, SQINFO *sq2)
150 {
151   sq1->flags = sq2->flags;
152   if (sq2->flags & SQINFO_NAME)  strcpy(sq1->name, sq2->name);
153   if (sq2->flags & SQINFO_ID)    strcpy(sq1->id,   sq2->id);
154   if (sq2->flags & SQINFO_ACC)   strcpy(sq1->acc,  sq2->acc);
155   if (sq2->flags & SQINFO_DESC)  strcpy(sq1->desc, sq2->desc);
156   if (sq2->flags & SQINFO_LEN)   sq1->len    = sq2->len;
157   if (sq2->flags & SQINFO_START) sq1->start  = sq2->start;
158   if (sq2->flags & SQINFO_STOP)  sq1->stop   = sq2->stop;
159   if (sq2->flags & SQINFO_OLEN)  sq1->olen   = sq2->olen;
160   if (sq2->flags & SQINFO_TYPE)  sq1->type   = sq2->type;
161   if (sq2->flags & SQINFO_SS)    sq1->ss     = Strdup(sq2->ss);
162   if (sq2->flags & SQINFO_SA)    sq1->sa     = Strdup(sq2->sa);
163 }
164 
165 /* Function: ToDNA()
166  *
167  * Purpose:  Convert a sequence to DNA.
168  *           U --> T
169  */
170 void
ToDNA(char * seq)171 ToDNA(char *seq)
172 {
173   for (; *seq != '\0'; seq++)
174     {
175       if      (*seq == 'U') *seq = 'T';
176       else if (*seq == 'u') *seq = 't';
177     }
178 }
179 
180 /* Function: ToRNA()
181  *
182  * Purpose:  Convert a sequence to RNA.
183  *           T --> U
184  */
185 void
ToRNA(char * seq)186 ToRNA(char *seq)
187 {
188   for (; *seq != '\0'; seq++)
189     {
190       if      (*seq == 'T') *seq = 'U';
191       else if (*seq == 't') *seq = 'u';
192     }
193 }
194 
195 
196 static int
isSeqChar(int c)197 isSeqChar(int c)
198 {
199   if (c > 127) return 0;	/* IRIX 4.0 bug! isascii(255) returns TRUE */
200   return (isalpha(c) || strchr(seqsymbols,c));
201 }
202 
203 static void
readline(FILE * f,char * s)204 readline(FILE *f, char *s)
205 {
206   char  *cp;
207 
208   if (NULL == fgets(s, LINEBUFLEN, f))
209     *s = 0;
210   else {
211     cp = strchr(s, '\n');
212     if (cp != NULL) *cp = 0;
213     }
214 }
215 
216 /* Function: get_line()
217  * Date:     SRE, Tue Mar  3 08:30:01 1998 [St. Louis]
218  *
219  * Purpose:  read a line from a sequence file into V->sbuffer.
220  *           If the fgets() is NULL, V->sbuffer is NULL.
221  *           Trailing \n is chopped.
222  *           If a trailing \n is not there, raise the
223  *           lastlinelong flag in V; either we're at EOF or
224  *           we have a very long line, over our fgets() buffer
225  *           length.
226  *
227  * Args:     V
228  *
229  * Returns:  (void)
230  */
231 static void
get_line(struct ReadSeqVars * V)232 get_line(struct ReadSeqVars *V)
233 {
234   char *cp;
235 
236   if (fgets(V->sbuffer, LINEBUFLEN, V->f) == NULL)
237     *(V->sbuffer) = '\0';
238   else {
239     cp = strchr(V->sbuffer, '\n');
240     if (cp != NULL) { *cp = '\0'; V->longline = FALSE; }
241     else            V->longline = TRUE;
242   }
243 }
244 
245 
246 /* Function: addseq()
247  *
248  * Purpose:  Add a line of sequence to the growing string in V.
249  */
250 static void
addseq(char * s,struct ReadSeqVars * V)251 addseq(char *s, struct ReadSeqVars *V)
252 {
253   while (*s != 0) {
254     if (isSeqChar((int) *s)) {
255       if (*s == '-' && V->dash_equals_n) *s = 'N';
256       if (V->seqlen >= V->maxseq) {
257         V->maxseq += kStartLength;
258         V->seq = (char*) ReallocOrDie (V->seq, V->maxseq+1);
259       }
260       V->seq[(V->seqlen)++] = *s;
261     }
262     s++;
263   }
264 }
265 
266 static void
addstruc(char * s,struct ReadSeqVars * V)267 addstruc(char *s, struct ReadSeqVars *V)
268 {
269   char *sptr;
270 
271   if (! (V->sqinfo->flags & SQINFO_SS))
272     {
273       V->sqinfo->ss = (char *) MallocOrDie ((V->maxseq+1) * sizeof(char));
274       V->sqinfo->flags |= SQINFO_SS;
275       sptr = V->sqinfo->ss;
276     }
277   else
278     {
279       V->sqinfo->ss = (char *) ReallocOrDie (V->sqinfo->ss, (V->maxseq+1) * sizeof(char));
280       sptr = V->sqinfo->ss;
281       while (*sptr != '\0') sptr++;
282     }
283 
284   while (*s != 0)
285     {
286       if (isSeqChar((int)*s)) { *sptr = *s; sptr++; }
287       s++;
288     }
289   *sptr = '\0';
290 }
291 
292 
293 static void
readLoop(int addfirst,int (* endTest)(char *,int *),struct ReadSeqVars * V)294 readLoop(int addfirst, int (*endTest)(char *,int *), struct ReadSeqVars *V)
295 {
296   int addend = 0;
297   int done   = 0;
298 
299   V->seqlen = 0;
300   if (addfirst) addseq(V->sbuffer, V);
301   do {
302     get_line(V);
303 	/* feof() alone is a bug; files not necessarily \n terminated */
304     if (*(V->sbuffer) == '\0' && feof(V->f))
305       done = TRUE;
306     done |= (*endTest)(V->sbuffer, &addend);
307     if (addend || !done)
308       addseq(V->sbuffer, V);
309   } while (!done);
310 }
311 
312 
313 static int
endPIR(char * s,int * addend)314 endPIR(char *s, int  *addend)
315 {
316   *addend = 0;
317   if ((strncmp(s, "///", 3) == 0) ||
318       (strncmp(s, "ENTRY", 5) == 0))
319     return 1;
320   else
321     return 0;
322 }
323 
324 static void
readPIR(struct ReadSeqVars * V)325 readPIR(struct ReadSeqVars *V)
326 {
327   char *sptr;
328 				/* load first line of entry  */
329   while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
330     get_line(V);
331   if (feof(V->f)) return;
332 
333   if ((sptr = strtok(V->sbuffer + 15, "\n\t ")) != NULL)
334     {
335       SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
336       SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
337     }
338   do {
339     get_line(V);
340     if (!feof(V->f) && strncmp(V->sbuffer, "TITLE", 5) == 0)
341       SetSeqinfoString(V->sqinfo, V->sbuffer+15, SQINFO_DESC);
342     else if (!feof(V->f) && strncmp(V->sbuffer, "ACCESSION", 9) == 0)
343       {
344 	if ((sptr = strtok(V->sbuffer+15, " \t\n")) != NULL)
345 	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
346       }
347   } while (! feof(V->f) && (strncmp(V->sbuffer,"SEQUENCE", 8) != 0));
348   get_line(V);			/* skip next line, coords */
349 
350   readLoop(0, endPIR, V);
351 
352   /* reading a real PIR-CODATA database file, we keep the source coords
353    */
354   V->sqinfo->start = 1;
355   V->sqinfo->stop  = V->seqlen;
356   V->sqinfo->olen  = V->seqlen;
357   V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
358 
359   /* get next line
360    */
361   while (!feof(V->f) && strncmp(V->sbuffer, "ENTRY", 5) != 0)
362     get_line(V);
363 }
364 
365 
366 
367 static int
endIG(char * s,int * addend)368 endIG(char *s, int  *addend)
369 {
370   *addend = 1; /* 1 or 2 occur in line w/ bases */
371   return((strchr(s,'1')!=NULL) || (strchr(s,'2')!=NULL));
372 }
373 
374 static void
readIG(struct ReadSeqVars * V)375 readIG(struct ReadSeqVars *V)
376 {
377   char *nm;
378 				/* position past ';' comments */
379   do {
380     get_line(V);
381   } while (! (feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer != ';')) ));
382 
383   if (!feof(V->f))
384     {
385       if ((nm = strtok(V->sbuffer, "\n\t ")) != NULL)
386 	SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
387 
388       readLoop(0, endIG, V);
389     }
390 
391   while (!(feof(V->f) || ((*V->sbuffer != '\0') && (*V->sbuffer == ';'))))
392     get_line(V);
393 }
394 
395 static int
endStrider(char * s,int * addend)396 endStrider(char *s, int *addend)
397 {
398   *addend = 0;
399   return (strstr( s, "//") != NULL);
400 }
401 
402 static void
readStrider(struct ReadSeqVars * V)403 readStrider(struct ReadSeqVars *V)
404 {
405   char *nm;
406 
407   while ((!feof(V->f)) && (*V->sbuffer == ';'))
408     {
409       if (strncmp(V->sbuffer,"; DNA sequence", 14) == 0)
410 	{
411 	  if ((nm = strtok(V->sbuffer+16, ",\n\t ")) != NULL)
412 	    SetSeqinfoString(V->sqinfo, nm, SQINFO_NAME);
413 	}
414       get_line(V);
415     }
416 
417   if (! feof(V->f))
418     readLoop(1, endStrider, V);
419 
420   /* load next line
421    */
422   while ((!feof(V->f)) && (*V->sbuffer != ';'))
423     get_line(V);
424 }
425 
426 
427 static int
endGB(char * s,int * addend)428 endGB(char *s, int *addend)
429 {
430   *addend = 0;
431   return ((strstr(s,"//") != NULL) || (strstr(s,"LOCUS") == s));
432 }
433 
434 static void
readGenBank(struct ReadSeqVars * V)435 readGenBank(struct ReadSeqVars *V)
436 {
437   char *sptr;
438   int   in_definition;
439 
440   while (strncmp(V->sbuffer, "LOCUS", 5) != 0)
441     get_line(V);
442 
443   if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
444     {
445       SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
446       SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
447     }
448 
449   in_definition = FALSE;
450   while (! feof(V->f))
451     {
452       get_line(V);
453       if (! feof(V->f) && strstr(V->sbuffer, "DEFINITION") == V->sbuffer)
454 	{
455 	  if ((sptr = strtok(V->sbuffer+12, "\n")) != NULL)
456 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
457 	  in_definition = TRUE;
458 	}
459       else if (! feof(V->f) && strstr(V->sbuffer, "ACCESSION") == V->sbuffer)
460 	{
461 	  if ((sptr = strtok(V->sbuffer+12, "\n\t ")) != NULL)
462 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
463 	  in_definition = FALSE;
464 	}
465       else if (strncmp(V->sbuffer,"ORIGIN", 6) != 0)
466 	{
467 	  if (in_definition)
468 	    SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);
469 	}
470       else
471 	break;
472     }
473 
474   readLoop(0, endGB, V);
475 
476   /* reading a real GenBank database file, we keep the source coords
477    */
478   V->sqinfo->start = 1;
479   V->sqinfo->stop  = V->seqlen;
480   V->sqinfo->olen  = V->seqlen;
481   V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
482 
483 
484   while (!(feof(V->f) || ((*V->sbuffer!=0) && (strstr(V->sbuffer,"LOCUS") == V->sbuffer))))
485     get_line(V);
486 				/* SRE: V->s now holds "//", so sequential
487 				   reads are wedged: fixed Tue Jul 13 1993 */
488   while (!feof(V->f) && strstr(V->sbuffer, "LOCUS  ") != V->sbuffer)
489     get_line(V);
490 }
491 
492 static int
endGCGdata(char * s,int * addend)493 endGCGdata(char *s, int *addend)
494 {
495   *addend = 0;
496   return (*s == '>');
497 }
498 
499 static void
readGCGdata(struct ReadSeqVars * V)500 readGCGdata(struct ReadSeqVars *V)
501 {
502   int   binary = FALSE;		/* whether data are binary or not */
503   int   blen;			/* length of binary sequence */
504 
505 				/* first line contains ">>>>" followed by name */
506   if (Strparse(">>>>([^ ]+) .+2BIT +Len: ([0-9]+)", V->sbuffer, 2) == 0)
507     {
508       binary = TRUE;
509       SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
510       blen = atoi(sqd_parse[2]);
511     }
512   else if (Strparse(">>>>([^ ]+) .+ASCII +Len: [0-9]+", V->sbuffer, 1) == 0)
513     SetSeqinfoString(V->sqinfo, sqd_parse[1], SQINFO_NAME);
514   else
515     Die("bogus GCGdata format? %s", V->sbuffer);
516 
517 				/* second line contains free text description */
518   get_line(V);
519   SetSeqinfoString(V->sqinfo, V->sbuffer, SQINFO_DESC);
520 
521   if (binary) {
522     /* allocate for blen characters +3... (allow for 3 bytes of slop) */
523     if (blen >= V->maxseq) {
524       V->maxseq = blen;
525       if ((V->seq = (char *) realloc (V->seq, sizeof(char)*(V->maxseq+4)))==NULL)
526 	Die("malloc failed");
527     }
528 				/* read (blen+3)/4 bytes from file */
529     if (fread(V->seq, sizeof(char), (blen+3)/4, V->f) < (size_t) ((blen+3)/4))
530       Die("fread failed");
531     V->seqlen = blen;
532 				/* convert binary code to seq */
533     GCGBinaryToSequence(V->seq, blen);
534   }
535   else readLoop(0, endGCGdata, V);
536 
537   while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
538     get_line(V);
539 }
540 
541 static int
endPearson(char * s,int * addend)542 endPearson(char *s, int *addend)
543 {
544   *addend = 0;
545   return(*s == '>');
546 }
547 
548 static void
readPearson(struct ReadSeqVars * V)549 readPearson(struct ReadSeqVars *V)
550 {
551   char *sptr;
552 
553   if ((sptr = strtok(V->sbuffer+1, "\n\t ")) != NULL)
554     SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
555   if ((sptr = strtok(NULL, "\n")) != NULL)
556     SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
557 				/* workaround for long NCBI NR lines */
558   while (V->longline && ! feof(V->f)) get_line(V);
559 
560   readLoop(0, endPearson, V);
561 
562   while (!(feof(V->f) || ((*V->sbuffer != 0) && (*V->sbuffer == '>'))))
563     get_line(V);
564 }
565 
566 
567 static int
endEMBL(char * s,int * addend)568 endEMBL(char *s, int *addend)
569 {
570   *addend = 0;
571   /* Some people (Berlin 5S rRNA database, f'r instance) use
572    * an extended EMBL format that attaches extra data after
573    * the sequence -- watch out for that. We use the fact that
574    * real EMBL sequence lines begin with five spaces.
575    *
576    * We can use this as the sole end test because readEMBL() will
577    * advance to the next ID line before starting to read again.
578    */
579   return (strncmp(s,"     ",5) != 0);
580 /*  return ((strstr(s,"//") != NULL) || (strstr(s,"ID   ") == s)); */
581 }
582 
583 static void
readEMBL(struct ReadSeqVars * V)584 readEMBL(struct ReadSeqVars *V)
585 {
586   char *sptr;
587 
588 				/* make sure we have first line */
589   while (!feof(V->f) && strncmp(V->sbuffer, "ID  ", 4) != 0)
590     get_line(V);
591 
592   if ((sptr = strtok(V->sbuffer+5, "\n\t ")) != NULL)
593     {
594       SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
595       SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
596     }
597 
598   do {
599     get_line(V);
600     if (!feof(V->f) && strstr(V->sbuffer, "AC  ") == V->sbuffer)
601       {
602 	if ((sptr = strtok(V->sbuffer+5, ";  \t\n")) != NULL)
603 	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
604       }
605     else if (!feof(V->f) && strstr(V->sbuffer, "DE  ") == V->sbuffer)
606       {
607 	if ((sptr = strtok(V->sbuffer+5, "\n")) != NULL)
608 	  SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
609       }
610   } while (! feof(V->f) && strncmp(V->sbuffer,"SQ",2) != 0);
611 
612   readLoop(0, endEMBL, V);
613 
614   /* reading a real EMBL database file, we keep the source coords
615    */
616   V->sqinfo->start = 1;
617   V->sqinfo->stop  = V->seqlen;
618   V->sqinfo->olen  = V->seqlen;
619   V->sqinfo->flags |= SQINFO_START | SQINFO_STOP | SQINFO_OLEN;
620 
621 				/* load next record's ID line */
622   while (!feof(V->f) && strncmp(V->sbuffer, "ID  ", 4) != 0)
623     get_line(V);
624 }
625 
626 
627 static int
endZuker(char * s,int * addend)628 endZuker(char *s, int *addend)
629 {
630   *addend = 0;
631   return( *s == '(' );
632 }
633 
634 static void
readZuker(struct ReadSeqVars * V)635 readZuker(struct ReadSeqVars *V)
636 {
637   char *sptr;
638 
639   get_line(V);  /*s == "seqLen seqid string..."*/
640 
641   if ((sptr = strtok(V->sbuffer+6, " \t\n")) != NULL)
642     SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
643 
644   if ((sptr = strtok(NULL, "\n")) != NULL)
645     SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
646 
647   readLoop(0, endZuker, V);
648 
649   while (!(feof(V->f) | ((*V->sbuffer != '\0') & (*V->sbuffer == '('))))
650     get_line(V);
651 }
652 
653 static void
readUWGCG(struct ReadSeqVars * V)654 readUWGCG(struct ReadSeqVars *V)
655 {
656   char  *si;
657   char  *sptr;
658   int    done;
659 
660   V->seqlen = 0;
661 
662   /*writeseq: "    %s  Length: %d  (today)  Check: %d  ..\n" */
663   /*drop above or ".." from id*/
664   if ((si = strstr(V->sbuffer,"  Length: ")) != NULL) *si = 0;
665   else if ((si = strstr(V->sbuffer,"..")) != NULL)    *si = 0;
666 
667   if ((sptr = strtok(V->sbuffer, "\n\t ")) != NULL)
668     SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
669 
670   do {
671     done = feof(V->f);
672     get_line(V);
673     if (! done) addseq(V->sbuffer, V);
674   } while (!done);
675 }
676 
677 
678 static void
readSquid(struct ReadSeqVars * V)679 readSquid(struct ReadSeqVars *V)
680 {
681   char *sptr;
682   int   dostruc = FALSE;
683 
684   while (strncmp(V->sbuffer, "NAM ", 4) != 0) get_line(V);
685 
686   if ((sptr = strtok(V->sbuffer+4, "\n\t ")) != NULL)
687     SetSeqinfoString(V->sqinfo, sptr, SQINFO_NAME);
688 
689   /*CONSTCOND*/
690   while (1)
691     {
692       get_line(V);
693       if (feof(V->f)) {squid_errno = SQERR_FORMAT; return; }
694 
695       if (strncmp(V->sbuffer, "SRC ", 4) == 0)
696 	{
697 	  if ((sptr = strtok(V->sbuffer+4, " \t\n")) != NULL)
698 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ID);
699 	  if ((sptr = strtok(NULL, " \t\n")) != NULL)
700 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_ACC);
701 	  if ((sptr = strtok(NULL, ".")) != NULL)
702 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_START);
703 	  if ((sptr = strtok(NULL, ".:")) != NULL)
704 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_STOP);
705 	  if ((sptr = strtok(NULL, ": \t\n")) != NULL)
706 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_OLEN);
707 	}
708       else if (strncmp(V->sbuffer, "DES ", 4) == 0)
709 	{
710 	  if ((sptr = strtok(V->sbuffer+4, "\n")) != NULL)
711 	    SetSeqinfoString(V->sqinfo, sptr, SQINFO_DESC);
712 	}
713       else if (strncmp(V->sbuffer,"SEQ", 3) == 0)
714 	break;
715   }
716 
717   if (strstr(V->sbuffer, "+SS") != NULL) dostruc = TRUE;
718 
719   V->seqlen = 0;
720   /*CONSTCOND*/
721   while (1)
722     {
723 				/* sequence line */
724       get_line(V);
725       if (feof(V->f) || strncmp(V->sbuffer, "++", 2) == 0)
726 	break;
727       addseq(V->sbuffer, V);
728 				/* structure line */
729       if (dostruc)
730 	{
731 	  get_line(V);
732 	  if (feof(V->f)) { squid_errno = SQERR_FORMAT; return; }
733 	  addstruc(V->sbuffer, V);
734 	}
735     }
736 
737 
738   while (!feof(V->f) && strncmp(V->sbuffer, "NAM ", 4) != 0)
739     get_line(V);
740 }
741 
742 
743 /* Function: SeqfileOpen()
744  *
745  * Purpose : Open a sequence database file and prepare for reading
746  *           sequentially.
747  *
748  * Args:     filename - name of file to open
749  *           format   - format of file
750  *           env      - environment variable for path (e.g. BLASTDB)
751  *
752  *           Returns opened SQFILE ptr, or NULL on failure.
753  */
754 SQFILE *
SeqfileOpen(char * filename,int format,char * env)755 SeqfileOpen(char *filename, int format, char *env)
756 {
757   SQFILE *dbfp;
758 
759   dbfp = (SQFILE *) MallocOrDie (sizeof(SQFILE));
760   dbfp->format   = format;
761   dbfp->longline = FALSE;
762 
763   /* Open our file handle.
764    * Three possibilities:
765    *    1. normal file open
766    *    2. filename = "-";    read from stdin
767    *    3. filename = "*.gz"; read thru pipe from gzip
768    * If we're reading from stdin or a pipe, we can't reliably
769    * back up, so we can't do two-pass parsers like the interleaved alignment
770    * formats.
771    */
772   if (strcmp(filename, "-") == 0)
773     {
774       if (IsInterleavedFormat(format))
775 	Die("Can't read interleaved alignment formats thru stdin, sorry");
776 
777       dbfp->f         = stdin;
778       dbfp->do_stdin  = TRUE;
779       dbfp->do_gzip   = FALSE;
780     }
781   else if (Strparse("^.*\\.gz$", filename, 0) == 0)
782     {
783       char cmd[256];
784 
785       if (IsInterleavedFormat(format))
786 	Die("Can't read interleaved alignment formats thru gunzip, sorry");
787 
788       if (strlen(filename) + strlen("gzip -dc ") >= 256)
789 	{ squid_errno = SQERR_PARAMETER; return NULL; }
790       sprintf(cmd, "gzip -dc %s", filename);
791       if ((dbfp->f = popen(cmd, "r")) == NULL)
792 	{ squid_errno = SQERR_NOFILE; return NULL; } /* file (or gzip!) doesn't exist */
793       dbfp->do_stdin = FALSE;
794       dbfp->do_gzip  = TRUE;
795     }
796   else
797     {
798       if ((dbfp->f = fopen(filename, "r")) == NULL &&
799 	  (dbfp->f = EnvFileOpen(filename, env)) == NULL)
800 	{  squid_errno = SQERR_NOFILE; return NULL; }
801       dbfp->do_stdin = FALSE;
802       dbfp->do_gzip  = FALSE;
803     }
804 
805   /* The hack for sequential access of an interleaved alignment file:
806    * read the alignment in, we'll copy sequences out one at a time.
807    */
808   dbfp->ali_aseqs = NULL;
809   if (IsInterleavedFormat(format))
810     {
811       if (! ReadAlignment(filename, format, &(dbfp->ali_aseqs), &(dbfp->ali_ainfo)))
812 	return NULL;
813       dbfp->ali_curridx = 0;
814       return dbfp;
815     }
816 
817   /* Load the first line.
818    */
819   get_line(dbfp);
820 
821   return dbfp;
822 }
823 
824 /* Function: SeqfilePosition()
825  *
826  * Purpose:  Move to a particular offset in a seqfile.
827  *           Will not work on interleaved files (SELEX, MSF).
828  */
829 void
SeqfilePosition(SQFILE * sqfp,long offset)830 SeqfilePosition(SQFILE *sqfp, long offset)
831 {
832   if (sqfp->do_stdin || sqfp->do_gzip || IsInterleavedFormat(sqfp->format))
833     Die("SeqfilePosition() failed: in a nonrewindable data file or stream");
834 
835   fseek(sqfp->f, offset, SEEK_SET);
836   get_line(sqfp);
837 }
838 
839 
840 /* Function: SeqfileRewind()
841  *
842  * Purpose:  Set a sequence file back to the first sequence.
843  *           How to do this varies with whether the file is an
844  *           unaligned file, or an alignment file for which
845  *           we're hacking sequential access.
846  */
847 void
SeqfileRewind(SQFILE * sqfp)848 SeqfileRewind(SQFILE *sqfp)
849 {
850   if (sqfp->do_stdin || sqfp->do_gzip)
851     Die("SeqfileRewind() failed: in a nonrewindable data file or stream");
852 
853   if (sqfp->ali_aseqs != NULL) sqfp->ali_curridx = 0;
854   else {
855     rewind(sqfp->f);
856     get_line(sqfp);
857   }
858 }
859 
860 void
SeqfileClose(SQFILE * sqfp)861 SeqfileClose(SQFILE *sqfp)
862 {
863 				/* free static ptrs if we used them */
864   if (sqfp->ali_aseqs != NULL)
865     FreeAlignment(sqfp->ali_aseqs, &(sqfp->ali_ainfo));
866 
867   if (sqfp->do_gzip)         pclose(sqfp->f);
868   else if (! sqfp->do_stdin) fclose(sqfp->f);
869   free(sqfp);
870 }
871 
872 
873 /* Function: ReadSeq()
874  *
875  * Purpose:  Read next sequence from an open database file.
876  *           Return the sequence and associated info.
877  *
878  * Args:     fp      - open sequence database file pointer
879  *           format  - format of the file (previously determined
880  *                      by call to SeqfileFormat())
881  *           ret_seq - RETURN: sequence
882  *           sqinfo  - RETURN: filled in w/ other information
883  *
884  * Return:   1 on success, 0 on failure.
885  *           ret_seq and some field of sqinfo are allocated here,
886  *           The preferred call mechanism to properly free the memory is:
887  *
888  *           SQINFO sqinfo;
889  *           char  *seq;
890  *
891  *           ReadSeq(fp, format, &seq, &sqinfo);
892  *           ... do something...
893  *           FreeSequence(seq, &sqinfo);
894  */
895 int
ReadSeq(SQFILE * V,int format,char ** ret_seq,SQINFO * sqinfo)896 ReadSeq(SQFILE *V, int format, char **ret_seq, SQINFO *sqinfo)
897 {
898   int    gotuw;
899   int    apos, rpos;
900 
901   squid_errno = SQERR_OK;
902   if (format < kMinFormat || format > kMaxFormat)
903     {
904       squid_errno = SQERR_FORMAT;
905       *ret_seq    = NULL;
906       return 0;
907     }
908 
909   /* Here's the hack for sequential access of sequences from
910    * the multiple sequence alignment formats
911    */
912   if (format == kMSF || format == kSelex || format == kClustal) {
913     if (V->ali_curridx >= V->ali_ainfo.nseq)
914       return 0; /* out of aseqs */
915 
916     SeqinfoCopy(sqinfo, &(V->ali_ainfo.sqinfo[V->ali_curridx]));
917 
918 				/* copy and dealign the appropriate aligned seq */
919     V->seq = MallocOrDie (sizeof(char) * (V->ali_ainfo.alen+1));
920     for (rpos = apos = 0; apos < V->ali_ainfo.alen; apos++)
921       if (!isgap(V->ali_aseqs[V->ali_curridx][apos]))
922 	V->seq[rpos++] = V->ali_aseqs[V->ali_curridx][apos];
923     V->seq[rpos]   = '\0';
924     V->seqlen      = rpos;
925     V->ali_curridx++;
926   }
927   else {
928     if (feof(V->f)) return 0;
929 
930     V->seq           = (char*) calloc (kStartLength+1, sizeof(char));
931     V->maxseq        = kStartLength;
932     V->seqlen        = 0;
933     V->sqinfo        = sqinfo;
934     V->sqinfo->flags = 0;
935     V->dash_equals_n = (format == kEMBL) ? TRUE : FALSE;
936 
937     switch (format) {
938     case kIG      : readIG(V);      break;
939     case kStrider : readStrider(V); break;
940     case kGenBank : readGenBank(V); break;
941     case kPearson : readPearson(V); break;
942     case kEMBL    : readEMBL(V);    break;
943     case kZuker   : readZuker(V);   break;
944     case kPIR     : readPIR(V);     break;
945     case kSquid   : readSquid(V);   break;
946     case kGCGdata : readGCGdata(V); break;
947 
948     case kGCG:
949       do {			/* skip leading comments on GCG file */
950 	gotuw = (strstr(V->sbuffer,"..") != NULL);
951 	if (gotuw) readUWGCG(V);
952 	get_line(V);
953       } while (! feof(V->f));
954       break;
955 
956     case kIdraw:   /* SRE: no attempt to read idraw postscript */
957     default:
958       squid_errno = SQERR_FORMAT;
959       free(V->seq);
960       return 0;
961     }
962     V->seq[V->seqlen] = 0; /* stick a string terminator on it */
963   }
964 
965   /* Cleanup
966    */
967   sqinfo->len    = V->seqlen;
968   sqinfo->flags |= SQINFO_LEN;
969   *ret_seq = V->seq;
970   if (squid_errno == SQERR_OK) return 1; else return 0;
971 }
972 
973 /* Function: SeqfileFormat()
974  *
975  * Purpose:  Determine format of seqfile, and return it
976  *           through ret_format. From Gilbert's seqFileFormat().
977  *
978  *           If filename is "-", we will read from stdin and
979  *           assume that the stream is coming in FASTA format --
980  *           either unaligned or aligned.
981  *
982  * Args:     filename   - name of sequence file
983  *           ret_format - RETURN: format code for file, see squid.h
984  *                        for codes.
985  *           env        - name of environment variable containing
986  *                        a directory path that filename might also be
987  *                        found in. "BLASTDB", for example. Can be NULL.
988  *
989  * Return:   1 on success, 0 on failure.
990  */
991 int
SeqfileFormat(char * filename,int * ret_format,char * env)992 SeqfileFormat(char *filename, int  *ret_format, char *env)
993 {
994   int   foundIG      = 0;
995   int   foundStrider = 0;
996   int   foundGB      = 0;
997   int   foundEMBL    = 0;
998   int   foundPearson = 0;
999   int   foundZuker   = 0;
1000   int   gotGCGdata   = 0;
1001   int   gotPIR       = 0;
1002   int   gotSquid     = 0;
1003   int   gotuw        = 0;
1004   int   gotMSF       = 0;
1005   int   gotClustal   = 0;
1006   int   done         = 0;
1007   int   format       = kUnknown;
1008   int   nlines= 0, dnalines= 0;
1009   int   splen = 0;
1010   char  sp[LINEBUFLEN];
1011   FILE *fseq;
1012 
1013   /* First check if filename is "-": special case indicating
1014    * a FASTA pipe.
1015    */
1016   if (strcmp(filename, "-") == 0)
1017     { *ret_format = kPearson; return 1; }
1018 
1019 #define ReadOneLine(sp)   \
1020   { done |= (feof(fseq)); \
1021     readline( fseq, sp);  \
1022     if (!done) { splen = (int) strlen(sp); ++nlines; } }
1023 
1024   if ((fseq = fopen(filename, "r")) == NULL &&
1025       (fseq = EnvFileOpen(filename, env)) == NULL)
1026     { squid_errno = SQERR_NOFILE;  return 0; }
1027 
1028   /* Look at a line at a time
1029    */
1030   while ( !done ) {
1031     ReadOneLine(sp);
1032 
1033     if (sp==NULL || *sp=='\0')
1034       /*EMPTY*/ ;
1035 
1036     /* high probability identities: */
1037 
1038     else if (strstr(sp, " MSF:")   != NULL &&
1039 	     strstr(sp, " Type:")  != NULL &&
1040 	     strstr(sp, " Check:") != NULL)
1041       gotMSF = 1;
1042 
1043     else if (strncmp(sp, "CLUSTAL ", 8) == 0 &&
1044 	     strstr( sp, "multiple sequence alignment"))
1045       gotClustal = 1;
1046 
1047     else if (strstr(sp," Check: ") != NULL)
1048       gotuw= 1;
1049 
1050     else if (strncmp(sp, "///", 3) == 0 || strncmp(sp, "ENTRY ", 6) == 0)
1051       gotPIR = 1;
1052 
1053     else if (strncmp(sp, "++", 2) == 0 || strncmp(sp, "NAM ", 4) == 0)
1054       gotSquid = 1;
1055 
1056     else if (strncmp(sp, ">>>>", 4) == 0 && strstr(sp, "Len: "))
1057       gotGCGdata = 1;
1058 
1059     /* uncertain identities: */
1060 
1061     else if (*sp ==';') {
1062       if (strstr(sp,"Strider") !=NULL) foundStrider= 1;
1063       else foundIG= 1;
1064     }
1065     else if (strncmp(sp,"LOCUS",5) == 0 || strncmp(sp,"ORIGIN",5) == 0)
1066       foundGB= 1;
1067 
1068     else if (*sp == '>') {
1069       foundPearson  = 1;
1070     }
1071 
1072     else if (strstr(sp,"ID   ") == sp || strstr(sp,"SQ   ") == sp)
1073       foundEMBL= 1;
1074 
1075     else if (*sp == '(')
1076       foundZuker= 1;
1077 
1078     else {
1079       switch (Seqtype( sp )) {
1080       case kDNA:
1081       case kRNA: if (splen>20) dnalines++; break;
1082       default:   break;
1083       }
1084     }
1085 
1086     if      (gotMSF)     {format = kMSF;     done = 1; }
1087     else if (gotClustal) {format = kClustal; done = 1; }
1088     else if (gotSquid)   {format = kSquid;   done = 1; }
1089     else if (gotPIR)     {format = kPIR;     done = 1; }
1090     else if (gotGCGdata) {format = kGCGdata; done = 1; }
1091     else if (gotuw)
1092       {
1093 	if (foundIG) format= kIG;  /* a TOIG file from GCG for certain */
1094 	else format= kGCG;
1095 	done= 1;
1096       }
1097     else if ((dnalines > 1) || done || (nlines > 500)) {
1098       /* decide on most likely format */
1099       /* multichar idents: */
1100       if (foundStrider)      format= kStrider;
1101       else if (foundGB)      format= kGenBank;
1102       else if (foundEMBL)    format= kEMBL;
1103       /* single char idents: */
1104       else if (foundIG)      format= kIG;
1105       else if (foundPearson) format= kPearson;
1106       else if (foundZuker)   format= kZuker;
1107       /* spacing ident: */
1108       else if (IsSELEXFormat(filename)) format= kSelex;
1109       /* no format chars: */
1110       else
1111 	{
1112 	  squid_errno = SQERR_FORMAT;
1113 	  return 0;
1114 	}
1115 
1116       done= 1;
1117     }
1118   }
1119 
1120   if (fseq!=NULL) fclose(fseq);
1121 
1122   *ret_format = format;
1123   return 1;
1124 #undef  ReadOneLine
1125 }
1126 
1127 /* Function: GCGBinaryToSequence()
1128  *
1129  * Purpose:  Convert a GCG 2BIT binary string to DNA sequence.
1130  *           0 = C  1 = T  2 = A  3 = G
1131  *           4 nts/byte
1132  *
1133  * Args:     seq  - binary sequence. Converted in place to DNA.
1134  *           len  - length of DNA. binary is (len+3)/4 bytes
1135  */
1136 int
GCGBinaryToSequence(char * seq,int len)1137 GCGBinaryToSequence(char *seq, int len)
1138 {
1139   int   bpos;			/* position in binary   */
1140   int   spos;			/* position in sequence */
1141   char  twobit;
1142   int   i;
1143 
1144   for (bpos = (len-1)/4; bpos >= 0; bpos--)
1145     {
1146       twobit = seq[bpos];
1147       spos   = bpos*4;
1148 
1149       for (i = 3; i >= 0; i--)
1150 	{
1151 	  switch (twobit & 0x3) {
1152 	  case 0: seq[spos+i] = 'C'; break;
1153 	  case 1: seq[spos+i] = 'T'; break;
1154 	  case 2: seq[spos+i] = 'A'; break;
1155 	  case 3: seq[spos+i] = 'G'; break;
1156 	  }
1157 	  twobit = twobit >> 2;
1158 	}
1159     }
1160   seq[len] = '\0';
1161   return 1;
1162 }
1163 
1164 
1165 int
GCGchecksum(char * seq,int seqlen)1166 GCGchecksum(char *seq, int   seqlen)
1167 {
1168   int  check = 0, count = 0, i;
1169 
1170   for (i = 0; i < seqlen; i++) {
1171     count++;
1172     check += count * sre_toupper((int) seq[i]);
1173     if (count == 57) count = 0;
1174     }
1175   return (check % 10000);
1176 }
1177 
1178 
1179 /* Function: GCGMultchecksum()
1180  *
1181  * Purpose:  Simple modification of GCGchecksum(),
1182  *           to create a checksum for multiple sequences.
1183  *           Gaps count.
1184  *
1185  * Args:     seqs - sequences to be checksummed
1186  *           nseq - number of sequences
1187  *
1188  * Return:   the checksum, a number between 0 and 9999
1189  */
1190 int
GCGMultchecksum(char ** seqs,int nseq)1191 GCGMultchecksum(char **seqs, int nseq)
1192 {
1193   int check = 0;
1194   int count = 0;
1195   int idx;
1196   char *sptr;
1197 
1198   for (idx = 0; idx < nseq; idx++)
1199     for (sptr = seqs[idx]; *sptr; sptr++)
1200       {
1201 	count++;
1202 	check += count * sre_toupper((int) *sptr);
1203 	if (count == 57) count = 0;
1204       }
1205   return (check % 10000);
1206 }
1207 
1208 
1209 
1210 
1211 /* Function: Seqtype()
1212  *
1213  * Purpose:  Returns a (very good) guess about type of sequence:
1214  *           kDNA, kRNA, kAmino, or kOtherSeq.
1215  *
1216  *           Modified from, and replaces, Gilbert getseqtype().
1217  */
1218 int
Seqtype(char * seq)1219 Seqtype(char *seq)
1220 {
1221   int  saw;			/* how many non-gap characters I saw */
1222   char c;
1223   int  po = 0;			/* count of protein-only */
1224   int  nt = 0;			/* count of t's */
1225   int  nu = 0;			/* count of u's */
1226   int  na = 0;			/* count of nucleotides */
1227   int  aa = 0;			/* count of amino acids */
1228   int  no = 0;			/* count of others */
1229 
1230   /* Look at the first 300 non-gap characters
1231    */
1232   for (saw = 0; *seq != '\0' && saw < 300; seq++)
1233     {
1234       c = sre_toupper((int) *seq);
1235       if (! isgap(c))
1236 	{
1237 	  if (strchr(protonly, c)) po++;
1238 	  else if (strchr(primenuc,c)) {
1239 	    na++;
1240 	    if (c == 'T') nt++;
1241 	    else if (c == 'U') nu++;
1242 	  }
1243 	  else if (strchr(aminos,c)) aa++;
1244 	  else if (isalpha(c)) no++;
1245 	  saw++;
1246 	}
1247     }
1248 
1249   if (no > 0) return kOtherSeq;
1250   else if (po > 0) return kAmino;
1251   else if (na > aa) {
1252     if (nu > nt) return kRNA;
1253     else return kDNA;
1254     }
1255   else return kAmino;
1256 }
1257 
1258 int
WriteSeq(FILE * outf,int outform,char * seq,SQINFO * sqinfo)1259 WriteSeq(FILE *outf, int outform, char *seq, SQINFO *sqinfo)
1260 {
1261   int   numline = 0;
1262   int   lines = 0, spacer = 0, width  = 50, tab = 0;
1263   int   i, j, l, l1, ibase;
1264   char  endstr[10];
1265   char  s[100];			/* buffer for sequence  */
1266   char  ss[100];		/* buffer for structure */
1267   int   checksum = 0;
1268   int   seqlen;
1269   int   which_case;    /* 0 = do nothing. 1 = upper case. 2 = lower case */
1270   int   dostruc;		/* TRUE to print structure lines*/
1271 
1272   which_case = 0;
1273   dostruc    = FALSE;
1274   seqlen     = (sqinfo->flags & SQINFO_LEN) ? sqinfo->len : strlen(seq);
1275 
1276 				/* intercept Selex-format requests - SRE */
1277   if (outform == kSelex) {
1278     fprintf(outf, "%10s %s\n", sqinfo->name, seq);
1279     return 1;
1280   }
1281 
1282   if (outform == kClustal || outform == kMSF) {
1283     Warn("Tried to write Clustal or MSF with WriteSeq() -- bad, bad.");
1284     return 1;
1285   }
1286 
1287   strcpy( endstr,"");
1288   l1 = 0;
1289 
1290   /* 10Nov91: write this out in all possible formats: */
1291   checksum = GCGchecksum(seq, seqlen);
1292 
1293   switch (outform) {
1294 
1295     case kUnknown:    /* no header, just sequence */
1296       strcpy(endstr,"\n"); /* end w/ extra blank line */
1297       break;
1298 
1299     case kGenBank:
1300       fprintf(outf,"LOCUS       %s       %d bp\n",
1301 	      (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name,
1302 	      seqlen);
1303       fprintf(outf,"DEFINITION  %s\n",
1304 	      (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1305       fprintf(outf,"ACCESSION   %s\n",
1306 	      (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1307       fprintf(outf,"ORIGIN      \n");
1308       spacer = 11;
1309       numline = 1;
1310       strcpy(endstr, "\n//");
1311       break;
1312 
1313     case kGCGdata:
1314       fprintf(outf, ">>>>%s  9/95  ASCII  Len: %d\n", sqinfo->name, seqlen);
1315       fprintf(outf, "%s\n", (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1316       break;
1317 
1318     case kPIR:
1319       fprintf(outf, "ENTRY          %s\n",
1320 	      (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1321       fprintf(outf, "TITLE          %s\n",
1322 	      (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1323       fprintf(outf, "ACCESSION      %s\n",
1324 	      (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1325       fprintf(outf, "SUMMARY                                #Length %d  #Checksum  %d\n",
1326 	      sqinfo->len, checksum);
1327       fprintf(outf, "SEQUENCE\n");
1328       fprintf(outf, "                  5        10        15        20        25        30\n");
1329       spacer  = 2;		/* spaces after every residue */
1330       numline = 1;              /* number lines w/ coords     */
1331       width   = 30;             /* 30 aa per line             */
1332       strcpy(endstr, "\n///");
1333       break;
1334 
1335     case kSquid:
1336       fprintf(outf, "NAM  %s\n", sqinfo->name);
1337       if (sqinfo->flags & (SQINFO_ID | SQINFO_ACC | SQINFO_START | SQINFO_STOP | SQINFO_OLEN))
1338 	fprintf(outf, "SRC  %s %s %d..%d::%d\n",
1339 		(sqinfo->flags & SQINFO_ID)    ? sqinfo->id     : "-",
1340 		(sqinfo->flags & SQINFO_ACC)   ? sqinfo->acc    : "-",
1341 		(sqinfo->flags & SQINFO_START) ? sqinfo->start  : 0,
1342 		(sqinfo->flags & SQINFO_STOP)  ? sqinfo->stop   : 0,
1343 		(sqinfo->flags & SQINFO_OLEN)  ? sqinfo->olen   : 0);
1344       if (sqinfo->flags & SQINFO_DESC)
1345 	fprintf(outf, "DES  %s\n", sqinfo->desc);
1346       if (sqinfo->flags & SQINFO_SS)
1347 	{
1348 	  fprintf(outf, "SEQ  +SS\n");
1349 	  dostruc = TRUE;	/* print structure lines too */
1350 	}
1351       else
1352 	fprintf(outf, "SEQ\n");
1353       numline = 1;                /* number seq lines w/ coords  */
1354       strcpy(endstr, "\n++");
1355       break;
1356 
1357     case kEMBL:
1358       fprintf(outf,"ID   %s\n",
1359 	      (sqinfo->flags & SQINFO_ID) ? sqinfo->id : sqinfo->name);
1360       fprintf(outf,"AC   %s\n",
1361 	      (sqinfo->flags & SQINFO_ACC) ? sqinfo->acc : "-");
1362       fprintf(outf,"DE   %s\n",
1363 	      (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "-");
1364       fprintf(outf,"SQ             %d BP\n", seqlen);
1365       strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/
1366       tab = 5;     /** added 31jan91 */
1367       spacer = 11; /** added 31jan91 */
1368       break;
1369 
1370     case kGCG:
1371       fprintf(outf,"%s\n", sqinfo->name);
1372       if (sqinfo->flags & SQINFO_ACC)
1373 	fprintf(outf,"ACCESSION   %s\n", sqinfo->acc);
1374       if (sqinfo->flags & SQINFO_DESC)
1375 	fprintf(outf,"DEFINITION  %s\n", sqinfo->desc);
1376       fprintf(outf,"    %s  Length: %d  (today)  Check: %d  ..\n",
1377 	      sqinfo->name, seqlen, checksum);
1378       spacer = 11;
1379       numline = 1;
1380       strcpy(endstr, "\n");  /* this is insurance to help prevent misreads at eof */
1381       break;
1382 
1383     case kStrider: /* ?? map ?*/
1384       fprintf(outf,"; ### from DNA Strider ;-)\n");
1385       fprintf(outf,"; DNA sequence  %s, %d bases, %d checksum.\n;\n",
1386 	      sqinfo->name, seqlen, checksum);
1387       strcpy(endstr, "\n//");
1388       break;
1389 
1390 			/* SRE: Don had Zuker default to Pearson, which is not
1391 			   intuitive or helpful, since Zuker's MFOLD can't read
1392 			   Pearson format. More useful to use kIG */
1393     case kZuker:
1394       which_case = 1;			/* MFOLD requires upper case. */
1395       /*FALLTHRU*/
1396     case kIG:
1397       fprintf(outf,";%s %s\n",
1398 	      sqinfo->name,
1399 	      (sqinfo->flags & SQINFO_DESC) ? sqinfo->desc : "");
1400       fprintf(outf,"%s\n", sqinfo->name);
1401       strcpy(endstr,"1"); /* == linear dna */
1402       break;
1403 
1404     case kRaw:			/* Raw: just print the whole sequence. */
1405       fprintf(outf, "%s\n", seq);
1406       return 1;
1407 
1408     default :
1409     case kPearson:
1410       fprintf(outf,">%s  %s\n", sqinfo->name,
1411 	      (sqinfo->flags & SQINFO_DESC)  ? sqinfo->desc   : "");
1412       break;
1413     }
1414 
1415   if (which_case == 1) s2upper(seq);
1416   if (which_case == 2) s2lower(seq);
1417 
1418 
1419   width = MIN(width,100);
1420   for (i=0, l=0, ibase = 1, lines = 0; i < seqlen; ) {
1421     if (l1 < 0) l1 = 0;
1422     else if (l1 == 0) {
1423       if (numline) fprintf(outf,"%8d ",ibase);
1424       for (j=0; j<tab; j++) fputc(' ',outf);
1425       }
1426     if ((spacer != 0) && ((l+1) % spacer == 1))
1427       { s[l] = ' '; ss[l] = ' '; l++; }
1428     s[l]  = seq[i];
1429     ss[l] = (sqinfo->flags & SQINFO_SS) ? sqinfo->ss[i] : '.';
1430     l++; i++;
1431     l1++;                 /* don't count spaces for width*/
1432     if (l1 == width || i == seqlen) {
1433       s[l] = ss[l] = '\0';
1434       l = 0; l1 = 0;
1435       if (dostruc)
1436 	{
1437 	  fprintf(outf, "%s\n", s);
1438 	  if (numline) fprintf(outf,"         ");
1439 	  for (j=0; j<tab; j++) fputc(' ',outf);
1440 	  if (i == seqlen) fprintf(outf,"%s%s\n",ss,endstr);
1441 	  else fprintf(outf,"%s\n",ss);
1442 	}
1443       else
1444 	{
1445 	  if (i == seqlen) fprintf(outf,"%s%s\n",s,endstr);
1446 	  else fprintf(outf,"%s\n",s);
1447 	}
1448       lines++;
1449       ibase = i+1;
1450       }
1451     }
1452   return lines;
1453 }
1454 
1455 
1456 /* Function: ReadMultipleRseqs()
1457  *
1458  * Purpose:  Open a data file and
1459  *           parse it into an array of rseqs (raw, unaligned
1460  *           sequences).
1461  *
1462  *           Caller is responsible for free'ing memory allocated
1463  *           to ret_rseqs, ret_weights, and ret_names.
1464  *
1465  *           Weights are currently only supported for MSF format.
1466  *           Sequences read from all other formats will be assigned
1467  *           weights of 1.0. If the caller isn't interested in
1468  *           weights, it passes NULL as ret_weights.
1469  *
1470  * Returns 1 on success. Returns 0 on failure and sets
1471  * squid_errno to indicate the cause.
1472  */
1473 int
ReadMultipleRseqs(char * seqfile,int fformat,char *** ret_rseqs,SQINFO ** ret_sqinfo,int * ret_num)1474 ReadMultipleRseqs(char              *seqfile,
1475 		  int                fformat,
1476 		  char            ***ret_rseqs,
1477 		  SQINFO **ret_sqinfo,
1478 		  int               *ret_num)
1479 {
1480   SQINFO *sqinfo;               /* array of sequence optional info         */
1481   SQFILE *dbfp;                 /* open ptr for sequential access of file  */
1482   char  **rseqs;                /* sequence array                          */
1483   char  **aseqs;                /* aligned sequences, if file is aligned   */
1484   AINFO   ainfo;      /* alignment-associated information        */
1485   int     numalloced;           /* num of seqs currently alloced for       */
1486   int     idx;
1487   int     num;
1488 
1489   if (fformat == kSelex || fformat == kMSF || fformat == kClustal)
1490     {
1491       if (! ReadAlignment(seqfile, fformat, &aseqs, &ainfo)) return 0;
1492       if (! DealignAseqs(aseqs, ainfo.nseq, &rseqs))                return 0;
1493 
1494       /* copy the sqinfo array
1495        */
1496       num = ainfo.nseq;
1497       sqinfo= (SQINFO *) MallocOrDie (sizeof(SQINFO)*ainfo.nseq);
1498       for (idx = 0; idx < ainfo.nseq; idx++)
1499 	SeqinfoCopy(&(sqinfo[idx]), &(ainfo.sqinfo[idx]));
1500       FreeAlignment(aseqs, &ainfo);
1501     }
1502   else
1503     {
1504 				/* initial alloc */
1505       num        = 0;
1506       numalloced = 16;
1507       rseqs  = (char **) MallocOrDie (numalloced * sizeof(char *));
1508       sqinfo = (SQINFO *) MallocOrDie (numalloced * sizeof(SQINFO));
1509       if ((dbfp = SeqfileOpen(seqfile, fformat, NULL)) == NULL) return 0;
1510 
1511       while (ReadSeq(dbfp, fformat, &rseqs[num], &(sqinfo[num])))
1512 	{
1513 	  num++;
1514 	  if (num == numalloced) /* more seqs coming, alloc more room */
1515 	    {
1516 	      numalloced += 16;
1517 	      rseqs  = (char **) ReallocOrDie (rseqs, numalloced*sizeof(char *));
1518 	      sqinfo = (SQINFO *) ReallocOrDie (sqinfo, numalloced * sizeof(SQINFO));
1519 	    }
1520 	}
1521       SeqfileClose(dbfp);
1522     }
1523 
1524   *ret_rseqs  = rseqs;
1525   *ret_sqinfo = sqinfo;
1526   *ret_num    = num;
1527   return 1;
1528 }
1529 
1530 
1531 
1532 
1533 char *
SeqFormatString(int code)1534 SeqFormatString(int code)
1535 {
1536   switch (code) {
1537   case kUnknown: return "Unknown";
1538   case kIG:      return "Intelligenetics";
1539   case kGenBank: return "GenBank";
1540   case kEMBL:    return "EMBL";
1541   case kGCG:     return "GCG";
1542   case kGCGdata: return "GCG datalibrary";
1543   case kStrider: return "Strider";
1544   case kPearson: return "FASTA";
1545   case kZuker:   return "Zuker";
1546   case kIdraw:   return "Idraw";
1547   case kSelex:   return "SELEX";
1548   case kMSF:     return "MSF";
1549   case kClustal: return "Clustal";
1550   case kPIR:     return "PIR-CODATA";
1551   case kRaw:     return "raw";
1552   case kSquid:   return "Squid";
1553   default:       return "(bad code)";
1554   }
1555 }
1556 
1557 
1558 
1559 /* Function: GSIOpen()
1560  *
1561  * Purpose:  Open a GSI file. Returns the number of records in
1562  *           the file and a file pointer. Returns NULL on failure.
1563  *           The file pointer should be fclose()'d normally.
1564  */
1565 GSIFILE *
GSIOpen(char * gsifile)1566 GSIOpen(char *gsifile)
1567 {
1568   GSIFILE    *gsi;
1569   char        magic[GSI_KEYSIZE];
1570 
1571   gsi = (GSIFILE *) MallocOrDie (sizeof(GSIFILE));
1572   if ((gsi->gsifp = fopen(gsifile, "r")) == NULL)
1573     { squid_errno = SQERR_NOFILE; return NULL; }
1574 
1575   if (! fread(magic, sizeof(char), GSI_KEYSIZE, gsi->gsifp))
1576     { squid_errno = SQERR_NODATA; return NULL; }
1577   if (strcmp(magic, "GSI") != 0)
1578     { squid_errno = SQERR_FORMAT; return NULL; }
1579 
1580   if (! fread(&(gsi->nfiles), sizeof(sqd_uint16), 1, gsi->gsifp))
1581     { squid_errno = SQERR_NODATA; return NULL; }
1582   if (! fread(&(gsi->recnum), sizeof(sqd_uint32), 1, gsi->gsifp))
1583     { squid_errno = SQERR_NODATA; return NULL; }
1584 
1585   gsi->nfiles = sre_ntohs(gsi->nfiles); /* convert from network short */
1586   gsi->recnum = sre_ntohl(gsi->recnum); /* convert from network long  */
1587 
1588   return gsi;
1589 }
1590 
1591 /* Function: GSIGetRecord()
1592  *
1593  * Purpose:  Each non-header record of a GSI index files consists
1594  *           of 38 bytes: 32 bytes of character string, a 2 byte
1595  *           short, and a 4 byte long. This function returns the
1596  *           three values.
1597  *
1598  * Args:     gsi  - open GSI index file, correctly positioned at a record
1599  *           f1   - char[32], allocated by caller (or NULL if unwanted)
1600  *           f2   - pointer to short (or NULL if unwanted)
1601  *           f3   - pointer to long  (or NULL if unwanted)
1602  *
1603  * Return:   0 on failure and sets squid_errno.
1604  */
1605 static int
GSIGetRecord(GSIFILE * gsi,char * f1,sqd_uint16 * f2,sqd_uint32 * f3)1606 GSIGetRecord(GSIFILE *gsi, char *f1, sqd_uint16 *f2, sqd_uint32 *f3)
1607 {
1608   if (f1 == NULL) fseek(gsi->gsifp, GSI_KEYSIZE, SEEK_CUR);
1609   else if (! fread(f1, GSI_KEYSIZE, 1, gsi->gsifp))
1610     { squid_errno = SQERR_NODATA; return 0; }
1611 
1612   if (f2 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint16), SEEK_CUR);
1613   else if (! fread(f2, sizeof(sqd_uint16), 1, gsi->gsifp))
1614     { squid_errno = SQERR_NODATA; return 0; }
1615 
1616   if (f3 == NULL) fseek(gsi->gsifp, sizeof(sqd_uint32), SEEK_CUR);
1617   else if (! fread(f3, sizeof(sqd_uint32), 1, gsi->gsifp))
1618     { squid_errno = SQERR_NODATA; return 0; }
1619 
1620   if (f2 != NULL) *f2 = sre_ntohs(*f2);
1621   if (f3 != NULL) *f3 = sre_ntohl(*f3);
1622 
1623   return 1;
1624 }
1625 
1626 
1627 /* Function: GSIGetOffset()
1628  *
1629  * Purpose:  From a key (sequence name), find a disk offset
1630  *           in an open general sequence index file by binary
1631  *           search. Presumably GSI indexing could be even faster
1632  *           if we used hashing.
1633  *
1634  * Args:     gsi         - GSI index file, opened by GSIOpen()
1635  *           key         - name of key to retrieve indices for
1636  *           ret_seqfile - pre-alloced char[32] array for seqfile name
1637  *           ret_fmt     - format of seqfile
1638  *           ret_offset  - return: disk offset in seqfile.
1639  */
1640 int
GSIGetOffset(GSIFILE * gsi,char * key,char * ret_seqfile,int * ret_format,long * ret_offset)1641 GSIGetOffset(GSIFILE *gsi, char *key, char *ret_seqfile,
1642 	     int *ret_format, long *ret_offset)
1643 {
1644   sqd_uint32  left, right, mid;
1645   int         cmp;
1646   char        name[GSI_KEYSIZE + 1];
1647   sqd_uint32  offset;
1648   sqd_uint16  filenum;
1649   sqd_uint32  fmt;
1650 
1651   name[GSI_KEYSIZE] = '\0';
1652 
1653   left  = gsi->nfiles + 1;
1654   right = gsi->nfiles + gsi->recnum;
1655   mid   = (left + right) / 2;
1656   fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
1657 
1658   while (GSIGetRecord(gsi, name, &filenum, &offset))
1659     {
1660       cmp = strcmp(name, key);
1661       if      (cmp == 0)      break;	       /* found it!              */
1662       else if (left >= right) return 0;        /* oops, missed it; fail. */
1663       else if (cmp < 0)       left = mid + 1;  /* it's right of mid      */
1664       else if (cmp > 0)	      right = mid - 1; /* it's left of mid       */
1665       mid = (left + right) / 2;
1666       fseek(gsi->gsifp, mid * GSI_RECSIZE, SEEK_SET);
1667     }
1668 
1669   /* Using file number, look up the sequence file and format.
1670    */
1671   fseek(gsi->gsifp, filenum * GSI_RECSIZE, SEEK_SET);
1672   GSIGetRecord(gsi, ret_seqfile, NULL, &fmt);
1673   *ret_format =  (int) fmt;
1674   *ret_offset = (long) offset;
1675 
1676   return 1;
1677 }
1678 
1679 /* Function: GSIClose()
1680  *
1681  * Purpose:  Close an open GSI sequence index file.
1682  */
1683 void
GSIClose(GSIFILE * gsi)1684 GSIClose(GSIFILE *gsi)
1685 {
1686   fclose(gsi->gsifp);
1687   free(gsi);
1688 }
1689 
1690 
1691