1 /*
2  * SEQIO.C  -  A C Package for Performing Sequence File I/O  (Version 1.2)
3  *
4  *   Copyright (c) 1996 by James Knight at Univ. of California, Davis
5  *
6  *   Permission to use, copy, modify, distribute and sell this software
7  *   and its documentation is hereby granted, subject to the following
8  *   restrictions and understandings:
9  *
10  *     1) Any copy of this software or any copy of software derived
11  *        from it must include this copyright notice in full.
12  *
13  *     2) All materials or software developed as a consequence of the
14  *        use of this software or software derived from it must duly
15  *        acknowledge such use, in accordance with the usual standards
16  *        of acknowledging credit in academic research.
17  *
18  *     3) The software may be used freely by anyone for any purpose,
19  *        commercial or non-commercial.  That includes, but is not
20  *        limited to, its incorporation into software sold for a profit
21  *        or the development of commercial software derived from it.
22  *
23  *     4) This software is provided AS IS with no warranties of any
24  *        kind.  The author shall have no liability with respect to the
25  *        infringement of copyrights, trade secrets or any patents by
26  *        this software or any part thereof.  In no event will the
27  *        author be liable for any lost revenue or profits or other
28  *        special, indirect and consequential damages.
29  */
30 
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <ctype.h>
34 #include <fcntl.h>
35 #include <stdarg.h>
36 #include <string.h>
37 #include <time.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #ifdef __unix
42 #include <unistd.h>
43 #include <dirent.h>
44 #ifdef SYSV
45 #include <sys/dirent.h>
46 #endif
47 #endif
48 #ifdef WIN32
49 #include <windows.h>
50 #endif
51 #include "seqio.h"
52 
53 
54 /*
55  * Portability Issues.
56  *
57  * Integers must be 4 bytes long (they will take values larger than 65536).
58  *
59  * The character dividing directories in a filepath (in Unix, '/') must
60  * be specified as the value of variable "dirch" below.
61  *
62  * The structure used when reading a raw file using open and read must
63  * be specified as the value of typedef FILEPTR.  The structure used
64  * when reading a directory file must be specified as the value of
65  * typedef DIRPTR.
66  *
67  * Current set of external calls in main section of code:
68  *      exit, fclose, fopen, fputc, fputc, fprintf, free, fwrite,
69  *      getenv, isalpha, isalnum, isdigit, isspace,
70  *      malloc, memcpy, memset, realloc, sizeof, sprintf,
71  *      strcpy, strcmp, strlen, strncmp, tolower, va_arg, va_end,
72  *      va_start, vsprintf
73  *      mmap, munmap (these are ifdef'd inside `ISMAPABLE')
74  *
75  * Current set of (unusual?) data-structures/variables in main section:
76  *      errno, va_list, __LINE__,
77  *      caddr_t (this is ifdef'd inside `ISMAPABLE')
78  *
79  * Procedures found at the end of this file which cover all of the file I/O:
80  *      open_raw_file, read_raw_file, seek_raw_file, close_raw_file,
81  *      open_raw_stdin, open_stdout, puterror, read_small_file, open_directory,
82  *      read_dirname, close_directory, isa_file, get_filesize, isa_dir,
83  *      get_truename, is_absolute, get_today
84  *
85  * Current set of external calls in end section of code:
86  *      close, ctime, open, lseek, read, stat, time
87  *
88  *      closedir, opendir, readdir  (these are ifdef'd inside `__unix')
89  *
90  *      GetCurrentDirectory, SetCurrentDirectory,
91  *      FindFirstFile, FindNextFile, CloseHandle
92  *                              (these are ifdef'd inside `WIN32')
93  *
94  * Current set of (unusual?) data-structures/variables in end section:
95  *      stat structure, time_t, stdin, stdout, stderr
96  *      DIR, dirent structure  (these are ifdef'd inside `__unix')
97  *      WIN32_FIND_DATA, HANDLE   (these are ifdef'd inside `WIN32')
98  *
99  */
100 
101 #ifdef WIN32
102 
103 static char dirch = '\\';
104 typedef struct {
105   int init_flag;
106   WIN32_FIND_DATA dirinfo;
107   HANDLE handle;
108 } DIRSTRUCT, *DIRPTR;
109 
110 int open(), read(), close();
111 
112 #else
113 
114 static char dirch = '/';
115 typedef DIR *DIRPTR;
116 
117 #endif
118 
119 
120 typedef int FILEPTR;
121 
122 static int open_raw_file(char *filename, FILEPTR *ptr_out);
123 static int read_raw_file(FILEPTR ptr, char *buffer, int size);
124 static int seek_raw_file(FILEPTR, int pos);
125 static int close_raw_file(FILEPTR ptr);
126 static int open_raw_stdin(FILEPTR *ptr_out);
127 static int open_stdout(FILE **ptr_out);
128 static void puterror(char *s);
129 static char *read_small_file(char *filename);
130 
131 
132 static int open_directory(char *dirname, DIRPTR *dp_out);
133 static char *read_dirname(DIRPTR dp);
134 static void close_directory(DIRPTR dp);
135 static int isa_file(char *filename);
136 static int get_filesize(char *filename);
137 static char *get_truename(char *filename, char *fileend);
138 static int is_absolute(char *path);
139 static char *get_today();
140 
141 
142 
143 
144 /*
145  *
146  * Prototypes for external functions that are not declared in the include
147  * files, and replacement functions for system calls that don't exist on
148  * one or more machines.
149  */
150 
151 #if defined(__sun) && !defined(FILENAME_MAX)
152 #include <sys/param.h>
153 #define FILENAME_MAX MAXPATHLEN
154 #endif
155 
156 
static int ctype_initflag = 0;
static char tubuf[384], *tuary;


/*
 * init_ctype
 *
 * Build the upper-casing lookup table.  `tuary' points 128 bytes into
 * `tubuf', so it may be indexed by any value from -128 through 255 (that
 * is, by either a signed or an unsigned character value).  Every slot maps
 * to itself, except that 'a' through 'z' map to 'A' through 'Z'.
 *
 * Sets `ctype_initflag' once the table is complete.
 *
 * Parameters:  none.
 *
 * Returns:  nothing.
 */
static void init_ctype(void)
{
  int i;

  tuary = tubuf + 128;

  /*
   * Identity mapping over the whole index range.  The upper bound is
   * inclusive:  slot 255 is the last byte of `tubuf' and must be filled too.
   */
  for (i=-128; i <= 255; i++)
    tuary[i] = (char) i;

  /* Lower-case letters map to the matching upper-case letters. */
  for (i='a'; i <= 'z'; i++)
    tuary[i] = (char) (i - 'a' + 'A');

  ctype_initflag = 1;
}
175 
/*
 * mycasecmp
 *
 * Compare two strings, ignoring case.
 *
 * Parameters:  s, t  -  the NUL-terminated strings to compare.
 *
 * Returns:  zero if the strings are equal when case is ignored, otherwise
 *           the difference between the upper-cased characters at the first
 *           position where the strings differ.
 */
static int mycasecmp(char *s, char *t)
{
  int diff;

  for ( ; ; s++,t++) {
    /*
     * toupper() is only defined for values representable as an unsigned
     * char (or EOF), so the characters are converted before the call.
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      return diff;
  }
}
183 
/*
 * myncasecmp
 *
 * Compare at most the first `n' characters of two strings, ignoring case.
 *
 * Parameters:  s, t  -  the NUL-terminated strings to compare.
 *              n     -  maximum number of characters to examine.
 *
 * Returns:  zero if the examined characters are equal when case is ignored
 *           (always the case when n is zero or negative), otherwise the
 *           difference between the upper-cased characters at the first
 *           position where the strings differ.
 */
static int myncasecmp(char *s, char *t, int n)
{
  int diff, i;

  diff = 0;
  for (i=0; i < n; s++,t++,i++) {
    /*
     * toupper() is only defined for values representable as an unsigned
     * char (or EOF), so the characters are converted before the call.
     */
    diff = toupper((unsigned char) *s) - toupper((unsigned char) *t);
    if (diff != 0 || *s == '\0')
      break;
  }
  return diff;
}
192 
/*
 * mystrdup
 *
 * Make a freshly allocated copy of a string.
 *
 * Parameters:  s  -  the NUL-terminated string to copy.
 *
 * Returns:  the malloc'ed copy (which the caller must free), or NULL if the
 *           allocation failed.
 */
static char *mystrdup(char *s)
{
  char *copy = (char *) malloc(strlen(s)+1);

  if (copy == NULL)
    return NULL;

  strcpy(copy, s);
  return copy;
}
200 
/*
 * mystrdup2
 *
 * Make a freshly allocated, NUL-terminated copy of the characters lying
 * between two pointers.
 *
 * Parameters:  s  -  first character to copy.
 *              t  -  one past the last character to copy.
 *
 * Returns:  the malloc'ed copy (which the caller must free), or NULL if the
 *           allocation failed.
 */
static char *mystrdup2(char *s, char *t)
{
  size_t len = (size_t) (t - s);
  char *copy = (char *) malloc(len + 1);

  if (copy != NULL) {
    memcpy(copy, s, len);
    copy[len] = '\0';
  }
  return copy;
}
212 
/*
 * mystreq / mystreqfn
 *
 * Test whether the text at `s1' begins with the string `s2', ignoring the
 * case of `s1'.  The characters of `s2' are compared as they are, so `s2'
 * must already be in upper case.
 *
 * The mystreq macro checks the first character inline against `ch' (which
 * must be the first character of `s2') and only calls mystreqfn when that
 * matches.  Note that the macro evaluates `s1' twice.
 *
 * mystreqfn assumes the first characters have already been compared, and
 * so starts its comparison at the second character of each string.
 *
 * Parameters:  s1  -  the text to examine.
 *              ch  -  (macro only) the first character of `s2'.
 *              s2  -  the upper-case string to look for.
 *
 * Returns:  non-zero if `s2' is a prefix of `s1', zero otherwise.
 */
#define mystreq(s1,ch,s2) \
          (toupper((unsigned char) *(s1)) == (ch) && mystreqfn((s1),(s2)))
static int mystreqfn(char *s1, char *s2)
{
  /*
   * toupper() is only defined for values representable as an unsigned char
   * (or EOF), so the character is converted before the call.
   */
  do {
    s1++;
    s2++;
  } while (*s2 != '\0' &&
           toupper((unsigned char) *s1) == (unsigned char) *s2);

  return (*s2 == '\0');
}
220 
/*
 * myatoi
 *
 * Convert the leading digits of a string into an integer, using an
 * arbitrary base and an arbitrary character for the digit zero.  Leading
 * whitespace is skipped and a single '+' or '-' sign is accepted.  The
 * conversion stops at the first character that is not a digit, i.e. that
 * lies outside the range basechar .. basechar+base-1.
 *
 * No overflow checking is performed.
 *
 * Parameters:  s         -  the string to convert.
 *              base      -  the number base.
 *              basechar  -  the character representing the digit zero.
 *
 * Returns:  the converted value (zero if the string holds no digits).
 */
static int myatoi(char *s, int base, char basechar)
{
  int num, negative;

  /*
   * isspace() is only defined for values representable as an unsigned char
   * (or EOF), so the character is converted before the call.
   */
  while (isspace((unsigned char) *s)) s++;

  negative = 0;
  if (*s == '+' || *s == '-') {
    negative = (*s == '-');
    s++;
  }

  for (num=0; *s >= basechar && *s < basechar + base; s++)
    num = num * base + (*s - basechar);

  return (negative ? -num : num);
}
240 
/*
 * myitoa
 *
 * Write the text form of an integer into a character buffer, using an
 * arbitrary base and an arbitrary character for the digit zero.  A leading
 * '-' is written for negative numbers.
 *
 * The output is NOT NUL-terminated; the caller uses the returned pointer to
 * terminate the text or to continue writing after it.
 *
 * Parameters:  s         -  where to write the text.
 *              num       -  the number to convert.
 *              base      -  the number base (must be at least 2).
 *              basechar  -  the character representing the digit zero.
 *
 * Returns:  a pointer just past the last character written.  If `base' is
 *           less than 2 nothing is written and `s' itself is returned.
 */
char *myitoa(char *s, int num, int base, char basechar)
{
  int pos;
  unsigned int magnitude, ubase;
  char buffer[128];

  /*
   * A base below 2 would either divide by zero or never terminate (and
   * overrun `buffer'), so refuse it.
   */
  if (base < 2)
    return s;
  ubase = (unsigned int) base;

  /*
   * Work on the magnitude as an unsigned value.  Negating the most negative
   * int directly would overflow, whereas the unsigned subtraction is well
   * defined and yields the correct magnitude.
   */
  if (num < 0) {
    *s++ = '-';
    magnitude = 0U - (unsigned int) num;
  }
  else
    magnitude = (unsigned int) num;

  /* Generate the digits, least significant first. */
  pos = 0;
  do {
    buffer[pos++] = (char) (magnitude % ubase + basechar);
    magnitude /= ubase;
  } while (magnitude != 0);

  /* Copy them out in the proper order. */
  for (pos--; pos >= 0; pos--)
    *s++ = buffer[pos];

  return s;
}
263 
264 
265 
266 /*
267  *
268  * Includes and defines for the mmap operation.
269  *
270  */
271 #if defined(__sgi) || defined(__sun) || defined(__alpha)
272 #define ISMAPABLE 1
273 #endif
274 
275 #ifdef ISMAPABLE
276 
277 #include <sys/mman.h>
278 
279 #ifdef __sgi
280 void *mmap();
281 int munmap();
282 #endif
283 #ifdef __sun
284 int munmap();
285 #endif
286 
287 /*
288  * Largest number under 1 million which is a multiple of
289  * 512, 1024, 2048, 4096, 8192 and 16384, in order to
290  * guarantee that it matches the page size.
291  */
292 #define MYPAGESIZE 16384
293 #define MAXMAPSIZE 999424
294 
295 #endif
296 
297 
298 
299 
300 
301 /*
302  *
303  * The internal SEQFILE data structure.
304  *
305  */
/*
 * State enumerations used by INTSEQFILE.
 *
 *   OPTYPE      -  how the structure was opened.  seqfopen sets OP_READ for
 *                  mode "r" and OP_WRITE for modes "w" and "a".
 *   OPSTATUS    -  current condition of the structure.  seqfopen starts
 *                  every structure at OP_ACTIVE.
 *   INFOSTATUS  -  NOTE(review): presumably records how much of the `info'
 *                  record has been filled in for the current entry --
 *                  confirm against the getinfo code.
 */
typedef enum { OP_READ, OP_DB, OP_WRITE } OPTYPE;
typedef enum { OP_ACTIVE, OP_EOF, OP_ERROR, OP_TEMPERR, OP_FREED } OPSTATUS;
typedef enum { INFO_NONE, INFO_ANY, INFO_ALL, INFO_ALLINFO } INFOSTATUS;

/*
 * INTSEQFILE is the internal structure behind the public SEQFILE pointer:
 * seqfopen allocates one, fills it with zeros, and returns it cast to
 * (SEQFILE *).
 *
 * NOTE(review): the group labels inside the structure are inferred from the
 * field names only; verify them against the code that uses the fields.
 */
typedef struct {
  /* How the file was opened and its current condition (see above). */
  OPTYPE optype;
  OPSTATUS opstatus;

  /* Database-search state (presumably used when optype is OP_DB). */
  char *db_files, *db_currentfile;
  char *db_spec, *db_name, *db_format;
  char *db_alpha, *db_idprefix;

  /* The underlying file and its format (presumably a FORMAT_* value). */
  char *filename;
  FILEPTR input_fd;
  FILE *output_fp;
  int format, openflag, prettyflag;
  int autodetermined, initreadflag;
  int randaccessflag, *byteoffsets, currentoffset, num_offsets;

  /* Input buffer and positions within it. */
  char *fp_buffer;
  int fp_bufsize, fp_bytepos;
  char *fp_current, *fp_top;
  char *fp_entrystart, *fp_seqstart, *fp_entryend;
  char savech, *savech_loc;
  int isendtagged;

  /* Memory-mapped input state (presumably only used under ISMAPABLE). */
  int ismapped, mapsize, filepos, filesize, mapentflag, mapentsize;
  char *mapentry;

  /* Sequence buffer for the current entry. */
  char *seq;
  int seqlen, seqsize, isseqcurrent, rawseqflag;

  /* Counters and lengths for the current entry. */
  int entry_count, entry_seqlen, entry_seqno, entry_numseqs;
  int entry_truelen, entry_rawlen, iflag_truelen, iflag_rawlen;

  /* Information record for the current entry and its storage. */
  SEQINFO *info;
  int infosize, infobufsize;
  char *idbuffer;

  /* Flags recording which pieces of information have been gathered. */
  INFOSTATUS istatus;
  int iflag_date, iflag_idlist, iflag_description;
  int iflag_comment, iflag_organism, iflag_fragment;
  int iflag_circular, iflag_alphabet, iflag_fragstart;

  /* Format-specific state:  NBRF. */
  char *nbrf_header;

  /* Format-specific state:  FASTA program output. */
  int fout_mode, fout_markx, fout_len1, fout_alpha1, fout_len2, fout_alpha2;
  char *fout_id1, *fout_descr1, *fout_id2, *fout_descr2, fout_progname[64];

  /* Format-specific state:  multiple sequence alignments. */
  int malign_count, malign_size, malign_seqno;
  char **malign_seqs, **malign_ids;
  int *malign_seqlens;

  /* Format-specific state:  PHYLIP. */
  int phylip_origfmt;

  /* Format-specific state:  GCG. */
  int gcg_subformat;
  char *gcg_infoline;
} INTSEQFILE;
364 
365 #define INIT_BUFSIZE 65537
366 #define INIT_SEQSIZE 16384
367 #define CONCAT_READ_POINT 2048
368 
369 #define SEQINFO_ALL 0
370 #define SEQINFO_ALLINFO 1
371 #define SEQINFO_DATE 2
372 #define SEQINFO_IDLIST 3
373 #define SEQINFO_DESCRIPTION 4
374 #define SEQINFO_COMMENT 5
375 #define SEQINFO_ORGANISM 6
376 #define SEQINFO_HISTORY 7
377 #define SEQINFO_FRAGMENT 8
378 #define SEQINFO_CIRCULAR 9
379 #define SEQINFO_ALPHABET 10
380 #define SEQINFO_FILENAME 11
381 #define SEQINFO_DBNAME 12
382 #define SEQINFO_FORMAT 13
383 #define SEQINFO_ENTRYNO 14
384 #define SEQINFO_SEQNO 15
385 #define SEQINFO_NUMSEQS 16
386 #define SEQINFO_STARTPOS 17
387 #define SEQINFO_TRUELEN 18
388 #define SEQINFO_RAWLEN 19
389 #define SEQINFO_MAINID 20
390 #define SEQINFO_MAINACC 21
391 
392 #define GETSEQ_SEQUENCE 0
393 #define GETSEQ_RAWSEQ 1
394 #define GETSEQ_LENGTHS 2
395 
396 
397 
398 /*
 * The file table.  Gives the C functions which parse particular file
400  * formats.  Also gives the command line and pattern file option strings, and
401  * the determinant string used to determine the format of an unknown file.
402  *
403  * This determinant is matched against the first line in the sequence file
404  * whose first character is not a space (all of the formats supported so far
 * have a non-space character occurring in the first position of the first line
406  * of a file).
407  *
408  * In the determinant strings below, a question mark '?' denotes a wildcard
409  * character (to distinguish the nbrf and fasta formats).  The code in
410  * file.c must be changed to support a new file format which actually
 * uses a question mark in its determinant string.
412  *
413  * (NOTE:  The FORMAT defines must have values corresponding to the indices
 *         into the file_table.)
415  */
416 
417 static int databank_read(INTSEQFILE *, int), basic_read(INTSEQFILE *, int);
418 static int basic_getseq(INTSEQFILE *, int);
419 static int databank_fast_read(INTSEQFILE *, int);
420 static int databank_fast_getseq(INTSEQFILE *, int);
421 
422 static int raw_read(INTSEQFILE *, int), raw_getseq(INTSEQFILE *, int);
423 static int raw_getinfo(INTSEQFILE *, char *, int, int);
424 static int raw_putseq(INTSEQFILE *, char *, int, SEQINFO *);
425 
426 static int plain_putseq(INTSEQFILE *, char *, int, SEQINFO *);
427 
428 static int genbank_getinfo(INTSEQFILE *, char *, int, int);
429 static int genbank_putseq(INTSEQFILE *, char *, int, SEQINFO *);
430 static int genbank_annotate(FILE *, char *, int, char *, int);
431 
432 static int nbrf_getinfo(INTSEQFILE *, char *, int, int);
433 static int nbrf_putseq(INTSEQFILE *, char *, int, SEQINFO *);
434 static int nbrfold_putseq(INTSEQFILE *, char *, int, SEQINFO *);
435 static int nbrf_annotate(FILE *, char *, int, char *, int);
436 
437 static int fasta_getinfo(INTSEQFILE *, char *, int, int);
438 static int fasta_putseq(INTSEQFILE *, char *, int, SEQINFO *);
439 static int fastaold_putseq(INTSEQFILE *, char *, int, SEQINFO *);
440 static int fasta_annotate(FILE *, char *, int, char *, int);
441 
442 static int embl_getinfo(INTSEQFILE *, char *, int, int);
443 static int embl_putseq(INTSEQFILE *, char *, int, SEQINFO *);
444 static int sprot_putseq(INTSEQFILE *, char *, int, SEQINFO *);
445 static int embl_annotate(FILE *, char *, int, char *, int);
446 static int sprot_annotate(FILE *, char *, int, char *, int);
447 
448 static int pir_getinfo(INTSEQFILE *, char *, int, int);
449 static int pir_putseq(INTSEQFILE *, char *, int, SEQINFO *);
450 static int pir_annotate(FILE *, char *, int, char *, int);
451 
452 static int stanford_getinfo(INTSEQFILE *, char *, int, int);
453 static int stanford_putseq(INTSEQFILE *, char *, int, SEQINFO *);
454 static int stanfordold_putseq(INTSEQFILE *, char *, int, SEQINFO *);
455 static int stanford_annotate(FILE *, char *, int, char *, int);
456 
457 static int asn_read(INTSEQFILE *, int), asn_getseq(INTSEQFILE *, int);
458 static int asn_getinfo(INTSEQFILE *, char *, int, int);
459 static int asn_putseq(INTSEQFILE *, char *, int, SEQINFO *);
460 static int asn_putseqend(INTSEQFILE *);
461 static int asn_annotate(FILE *, char *, int, char *, int);
462 
463 static int fastaout_read(INTSEQFILE *, int);
464 static int fastaout_getseq(INTSEQFILE *, int);
465 static int fastaout_getinfo(INTSEQFILE *, char *, int, int);
466 
467 static int blastout_read(INTSEQFILE *, int);
468 static int blastout_getseq(INTSEQFILE *, int);
469 static int blastout_getinfo(INTSEQFILE *, char *, int, int);
470 
471 static int phylip_read(INTSEQFILE *, int);
472 static int phyint_getseq(INTSEQFILE *, int), physeq_getseq(INTSEQFILE *, int);
473 static int phyint_getinfo(INTSEQFILE *, char *, int, int);
474 static int physeq_getinfo(INTSEQFILE *, char *, int, int);
475 static int phylip_putseq(INTSEQFILE *, char *, int, SEQINFO *);
476 static int phyint_putseqend(INTSEQFILE *);
477 static int physeq_putseqend(INTSEQFILE *);
478 
479 static int clustal_read(INTSEQFILE *, int), clustal_getseq(INTSEQFILE *, int);
480 static int clustal_getinfo(INTSEQFILE *, char *, int, int);
481 static int clustal_putseq(INTSEQFILE *, char *, int, SEQINFO *);
482 static int clustal_putseqend(INTSEQFILE *);
483 
484 static int gcg_getseq(INTSEQFILE *, int);
485 static int gcg_getinfo(INTSEQFILE *, char *, int, int);
486 static int gcg_putseq(INTSEQFILE *, char *, int, SEQINFO *);
487 
488 static int msf_read(INTSEQFILE *, int), msf_getseq(INTSEQFILE *, int);
489 static int msf_getinfo(INTSEQFILE *, char *, int, int);
490 static int msf_putseq(INTSEQFILE *, char *, int, SEQINFO *);
491 static int msf_putseqend(INTSEQFILE *);
492 
493 
494 #define FORMAT_UNKNOWN -1
495 #define FORMAT_RAW 0
496 #define FORMAT_PLAIN 1
497 #define FORMAT_GENBANK 2
498 #define FORMAT_GBFAST 4
499 #define FORMAT_NBRF 5
500 #define FORMAT_NBRFOLD 6
501 #define FORMAT_FASTA 8
502 #define FORMAT_FASTAOLD 10
503 #define FORMAT_EMBL 12
504 #define FORMAT_EMBLFAST 13
505 #define FORMAT_SPROT 14
506 #define FORMAT_SPFAST 17
507 #define FORMAT_PIR 18
508 #define FORMAT_PIRFAST 20
509 #define FORMAT_STANFORD 21
510 #define FORMAT_STANFORDOLD 24
511 #define FORMAT_ASN 28
512 #define FORMAT_FOUT 30
513 #define FORMAT_PHYLIP 34
514 #define FORMAT_PHYSEQ 35
515 #define FORMAT_PHYINT 38
516 #define FORMAT_CLUSTAL 41
517 #define FORMAT_GCG 43
518 #define FORMAT_MSF 44
519 #define FORMAT_BOUT 45
520 
/*
 * One row of the file table.
 *
 *   ident        -  the name of the format.
 *   format       -  the FORMAT_* value for the format.
 *   type         -  a T_* classification for the format (0 in the rows
 *                   whose function pointers are all NULL).
 *   determinant  -  the determinant string used to recognize the format of
 *                   an unknown file, or NULL if there is none.
 *   read_fn, getseq_fn, getinfo_fn, putseq_fn, annotate_fn
 *                -  the functions implementing the format, any of which
 *                   may be NULL.
 */
typedef struct {
  char *ident;
  int format, type;
  char *determinant;
  int (*read_fn)(INTSEQFILE *, int);
  int (*getseq_fn)(INTSEQFILE *, int);
  int (*getinfo_fn)(INTSEQFILE *, char *, int, int);
  int (*putseq_fn)(INTSEQFILE *, char *, int, SEQINFO *);
  int (*annotate_fn)(FILE *, char *, int, char *, int);
} FILE_TABLE;
531 
532 
/*
 * The table itself.  file_table_size must equal the number of rows.
 *
 * Each row that supplies functions sits at the index equal to its FORMAT_*
 * value.  The rows whose type is 0 and whose function pointers are all NULL
 * carry only an identifier and the FORMAT_* value of an earlier row
 * (NOTE(review): presumably alternative names accepted for that format --
 * confirm against the lookup code).
 *
 * NOTE(review): the '|' characters in the determinant strings appear to
 * separate alternative determinants -- confirm against determine_format.
 */
#define file_table_size 49
static FILE_TABLE file_table[file_table_size] = {
  { "Raw",  FORMAT_RAW,  T_SEQONLY,
      NULL,
      raw_read,  raw_getseq,  raw_getinfo,  raw_putseq, NULL },
  { "Plain",  FORMAT_PLAIN,  T_SEQONLY,
      NULL,
      raw_read,  basic_getseq,  raw_getinfo,  plain_putseq, NULL },
  { "GenBank",  FORMAT_GENBANK,  T_DATABANK,
      "LOCUS |GB???.SEQ          Genetic Sequence Data Bank",
      databank_read,  basic_getseq,  genbank_getinfo,  genbank_putseq,
      genbank_annotate },
  { "gb",  FORMAT_GENBANK,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "gbfast",  FORMAT_GBFAST,  T_DATABANK,
      NULL,
      databank_fast_read,  databank_fast_getseq,  genbank_getinfo,
      genbank_putseq,  genbank_annotate },
  { "NBRF",  FORMAT_NBRF,  T_GENERAL,
      ">??;",
      basic_read,  basic_getseq,  nbrf_getinfo,  nbrf_putseq, nbrf_annotate },
  { "NBRF-old",  FORMAT_NBRFOLD,  T_LIMITED,
      NULL,
      basic_read,  basic_getseq,  nbrf_getinfo,  nbrfold_putseq,  NULL },
  { "NBRFold",  FORMAT_NBRFOLD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "FASTA",  FORMAT_FASTA,  T_GENERAL,
      ">",
      basic_read,  basic_getseq,  fasta_getinfo,  fasta_putseq,
      fasta_annotate },
  { "Pearson",  FORMAT_FASTA,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "FASTA-old",  FORMAT_FASTAOLD,  T_LIMITED,
      NULL,
      basic_read,  basic_getseq,  fasta_getinfo,  fastaold_putseq, NULL },
  { "FASTAold",  FORMAT_FASTAOLD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "EMBL",  FORMAT_EMBL,  T_DATABANK,
      "ID   |CC |XX ",
      databank_read,  basic_getseq,  embl_getinfo,  embl_putseq,
      embl_annotate },
  { "emblfast",  FORMAT_EMBLFAST,  T_DATABANK,
      NULL,
      databank_fast_read,  databank_fast_getseq,  embl_getinfo,  embl_putseq,
      embl_annotate },
  { "Swiss-Prot",  FORMAT_SPROT,  T_DATABANK,
      NULL,
      databank_read,  basic_getseq,  embl_getinfo,  sprot_putseq,
      sprot_annotate },
  { "swissprot",  FORMAT_SPROT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "sprot",  FORMAT_SPROT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "spfast",  FORMAT_SPFAST,  T_DATABANK,
      NULL,
      databank_fast_read,  databank_fast_getseq,  embl_getinfo,  sprot_putseq,
      sprot_annotate },
  { "PIR",  FORMAT_PIR,  T_DATABANK,
      "\\\\\\|ENTRY|P R O T E I N  S E Q U E N C E  D A T A B A S E",
      databank_read,  basic_getseq,  pir_getinfo,  pir_putseq, pir_annotate },
  { "CODATA",  FORMAT_PIR,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "pirfast",  FORMAT_PIRFAST,  T_DATABANK,
      NULL,
      databank_fast_read,  databank_fast_getseq,  pir_getinfo,  pir_putseq,
      pir_annotate },
  { "IG/Stanford",  FORMAT_STANFORD,  T_GENERAL,
      ";",
      basic_read,  basic_getseq,  stanford_getinfo,  stanford_putseq,
      stanford_annotate },
  { "IG",  FORMAT_STANFORD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "Stanford",  FORMAT_STANFORD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "Stanford-old",  FORMAT_STANFORDOLD,  T_LIMITED,
      NULL,
      basic_read,  basic_getseq,  stanford_getinfo,  stanfordold_putseq,
      NULL },
  { "Stanfordold",  FORMAT_STANFORDOLD, 0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "IG-old",  FORMAT_STANFORDOLD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "IGold",  FORMAT_STANFORDOLD,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "ASN.1",  FORMAT_ASN,  T_DATABANK,
      "Bioseq-set ::= {|Seq-set ::= {",
      asn_read,  asn_getseq,  asn_getinfo,  asn_putseq, asn_annotate },
  { "ASN",  FORMAT_ASN,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "FASTA-output",  FORMAT_FOUT,  T_OUTPUT,
      "FASTA|TFASTA|SSEARCH|LFASTA|LALIGN|ALIGN|FASTX",
      fastaout_read,  fastaout_getseq,  fastaout_getinfo,  NULL,  NULL },
  { "FASTA-out",  FORMAT_FOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "FASTAout",  FORMAT_FOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "Fout",  FORMAT_FOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "PHYLIP",  FORMAT_PHYLIP,  T_ALIGNMENT,
      "0|1|2|3|4|5|6|7|8|9",
      phylip_read,  phyint_getseq,  phyint_getinfo,  phylip_putseq,  NULL },
  { "PHYLIP-seq",  FORMAT_PHYSEQ,  T_ALIGNMENT,
      NULL,
      phylip_read,  physeq_getseq,  physeq_getinfo,  phylip_putseq,  NULL },
  { "PHYLIPseq",  FORMAT_PHYSEQ,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "PHYLIPs",  FORMAT_PHYSEQ,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "PHYLIP-int",  FORMAT_PHYINT,  T_ALIGNMENT,
      NULL,
      phylip_read,  phyint_getseq,  phyint_getinfo,  phylip_putseq,  NULL },
  { "PHYLIPint",  FORMAT_PHYINT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,
      NULL },
  { "PHYLIPi",  FORMAT_PHYINT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "Clustalw",  FORMAT_CLUSTAL,  T_ALIGNMENT,
      "CLUSTAL",
      clustal_read,  clustal_getseq,  clustal_getinfo,  clustal_putseq,
      NULL },
  { "Clustal",  FORMAT_CLUSTAL,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "GCG", FORMAT_GCG, T_GENERAL,
      NULL,
      raw_read,  gcg_getseq,  gcg_getinfo,  gcg_putseq,  NULL },
  { "MSF", FORMAT_MSF, T_ALIGNMENT,
      "PileUp",
      msf_read, msf_getseq, msf_getinfo, msf_putseq, NULL },
  { "BLAST-output",  FORMAT_BOUT,  T_OUTPUT,
      "BLASTN|BLASTP|BLASTX",
      blastout_read,  blastout_getseq,  blastout_getinfo,  NULL,  NULL },
  { "BLAST-out",  FORMAT_BOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "BLASTout",  FORMAT_BOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL },
  { "Bout",  FORMAT_BOUT,  0,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL }
};
653 
654 
/*
 * The GCG table.  Each row pairs an identifier beginning with "GCG-" with
 * the FORMAT_* value of a format from the file table.
 * (NOTE(review): presumably these name the GCG-wrapped variants of those
 * formats -- confirm against the code that searches this table.)
 *
 * gcg_table_size must equal the number of rows.
 */
typedef struct {
  char *ident;
  int format;
} GCG_TABLE;

#define gcg_table_size 23
static GCG_TABLE gcg_table[gcg_table_size] = {
  { "GCG-GenBank", FORMAT_GENBANK },
  { "GCG-gb", FORMAT_GENBANK },
  { "GCG-PIR",  FORMAT_PIR },
  { "GCG-CODATA",  FORMAT_PIR },
  { "GCG-EMBL",  FORMAT_EMBL },
  { "GCG-Swiss-Prot",  FORMAT_SPROT },
  { "GCG-swissprot",  FORMAT_SPROT },
  { "GCG-sprot",  FORMAT_SPROT },
  { "GCG-NBRF",  FORMAT_NBRF },
  { "GCG-NBRF-old",  FORMAT_NBRFOLD },
  { "GCG-NBRFold",  FORMAT_NBRFOLD },
  { "GCG-FASTA",  FORMAT_FASTA },
  { "GCG-Pearson",  FORMAT_FASTA },
  { "GCG-FASTA-old",  FORMAT_FASTAOLD },
  { "GCG-FASTAold",  FORMAT_FASTAOLD },
  { "GCG-IG/Stanford",  FORMAT_STANFORD },
  { "GCG-IG",  FORMAT_STANFORD },
  { "GCG-Stanford",  FORMAT_STANFORD },
  { "GCG-Stanford-old",  FORMAT_STANFORDOLD },
  { "GCG-Stanfordold",  FORMAT_STANFORDOLD },
  { "GCG-IG-old",  FORMAT_STANFORDOLD },
  { "GCG-IGold",  FORMAT_STANFORDOLD },
  { "GCG-MSF", FORMAT_MSF }
};
686 
687 
688 
689 /*
690  * The idprefix table.
691  */
692 
/*
 * Each row pairs an identifier prefix with the name of the database that
 * the prefix stands for.  idpref_table_size must equal the number of rows.
 *
 * Take care to keep the comma between the two strings of a row:  without
 * it the compiler silently joins the two literals into a single idprefix
 * and leaves dbname NULL.
 */
#define idpref_table_size 39
struct {
  char *idprefix, *dbname;
} idpref_table[idpref_table_size] = {
  { "acc", "Accession" },
  { "ag2d", "AARHUS/GHENT-2DPAGE" },
  { "agis", "AGIS" },
  { "bbs", "GIBBSQ" },
  { "bbm", "GIBBMT" },
  { "blks", "BLOCKS" },
  { "cpg", "CpGIsle" },
  { "ddb", "DICTYDB" },
  { "ddbj", "DDBJ" },
  { "ec", "ENZYME" },
  { "eco", "ECOGENE" },
  { "embl", "EMBL" },
  { "epd", "EPD" },
  { "est", "dbEST" },
  { "fly", "FlyBase" },
  { "gb", "GenBank" },
  { "gcr", "GCRDB" },
  { "gdb", "GDB" },
  { "gp", "GenPept" },
  { "gi", "GI" },
  { "giim", "GIIM" },
  { "hiv", "HIV" },
  { "imgt", "IMGT" },
  { "mdb", "MaizeDB" },
  { "muid", "MEDLINE" },
  { "nid", "NID" },
  { "omim", "OMIM" },
  { "pat", "Patent" },
  { "pdb", "PDB" },
  { "pir", "PIR" },
  { "prf", "PRF" },
  { "pros", "PROSITE" },
  { "reb", "REBASE" },
  { "rpb", "REPBASE" },
  { "sp", "SWISSPROT" },
  { "sts", "dbSTS" },
  { "tfd", "TRANSFAC" },
  { "wpep", "WORMPEP" },
  { "yepd", "YEPD" }
};
737 
738 /*
739  * The defines, constants and data structures used to handle error reporting.
740  */
741 #define STATUS_OK 0
742 #define STATUS_WARNING 1
743 #define STATUS_EOF 2
744 #define STATUS_ERROR 3
745 #define STATUS_FATAL 4
746 
747 
748 int seqferrno = E_NOERROR;
749 char seqferrstr[1024];
750 
751 static int pe_flag = PE_ALL;
752 static int err_batchmode = 0;
753 static void (*perror_fn)(char *) = puterror;
754 
/*
 * reset_errors clears the error number and the error string; set_error
 * records a new error number.
 *
 * Each macro expands to a single parenthesized expression, so it behaves as
 * exactly one statement wherever it is used (for example as the unbraced
 * body of an `if').
 */
#define reset_errors()  (seqferrno = E_NOERROR, seqferrstr[0] = '\0')
#define set_error(errnum)  (seqferrno = (errnum))
757 
758 static void print_fatal(char *format, ...);
759 static void print_error(char *format, ...);
760 static void print_warning(char *format, ...);
761 
/*
 * The error-raising macros.
 *
 * In all of them, `retcmd' is the statement to execute once the error has
 * been recorded (for example `return NULL'), and `printcmd' is a statement
 * which reports the error (typically a call to one of the print_*
 * functions declared above).  The macros taking `expr' do nothing unless
 * `expr' is true.
 *
 * Every macro expands to a braced block, not to an expression, so an
 * invocation followed by a semicolon cannot be used as the unbraced body of
 * an `if' which has an `else' branch.
 */

/* raise_error:  unconditionally record `errorval', report and leave. */
#define raise_error(errorval,retcmd,printcmd) \
          { \
            set_error(errorval); \
            printcmd; \
            retcmd; \
          }

/* error_test:  if `expr' holds, record `errorval', report and leave. */
#define error_test(expr,errorval,retcmd,printcmd) \
          { \
            if (expr) { \
              set_error(errorval); \
              printcmd; \
              retcmd; \
            } \
          }

/*
 * param_error:  if `expr' holds, record E_PARAMERROR, report the message
 * `string' on behalf of the function named `function', and leave.
 */
#define param_error(expr,retcmd,function,string) \
          { \
            if (expr) { \
              set_error(E_PARAMERROR); \
              print_error("Parameter Error in %s:  %s\n", function, string); \
              retcmd; \
            } \
          }

/*
 * status_error:  unconditionally record E_PROGRAMERROR and report that the
 * function named `function' saw an invalid status value.  err_batchmode is
 * set for the duration of the two print calls (NOTE(review): presumably so
 * that they are delivered as one message -- confirm in print_error and
 * print_fatal).
 */
#define status_error(retcmd,function) \
          { \
            set_error(E_PROGRAMERROR); \
            err_batchmode = 1; \
            print_error("SEQIO Program Error in %s:  " \
                        "Invalid status return value.\n", function); \
            print_fatal("\n   *** This is probably a bug in the SEQIO " \
                        "package, and not a user error." \
                        "\n   *** Please report the error to the authors " \
                        "of this software.\n\n"); \
            err_batchmode = 0; \
            retcmd; \
          }

/*
 * program_error:  if `expr' holds, record E_PROGRAMERROR and report an
 * internal error at the current source line, with `printcmd' supplying the
 * details.  err_batchmode is handled as in status_error.
 */
#define program_error(expr,retcmd,printcmd) \
          { \
            if (expr) { \
              set_error(E_PROGRAMERROR); \
              err_batchmode = 1; \
              print_error("SEQIO Program Error, line %d:\n", __LINE__); \
              printcmd; \
              print_fatal("\n   *** This is probably a bug in the SEQIO " \
                          "package, and not a user error." \
                          "\n   *** Please report the error to the authors " \
                          "of this software.\n\n"); \
              err_batchmode = 0; \
              retcmd; \
            } \
          }

/* memory_error:  if `expr' holds, record E_NOMEMORY, report and leave. */
#define memory_error(expr,retcmd) \
          { \
            if (expr) { \
              set_error(E_NOMEMORY); \
              print_fatal("Memory Error:  Ran out of memory.\n"); \
              retcmd; \
            } \
          }

/*
 * preverror_test:  if `expr' holds, record E_PREVERROR and leave, without
 * printing anything.
 */
#define preverror_test(expr,retcmd) \
          { \
            if (expr) { \
              set_error(E_PREVERROR); \
              retcmd; \
            } \
          }

/*
 * eof_test:  if `expr' holds, record E_EOF and leave, without printing
 * anything.
 */
#define eof_test(expr,retcmd) \
          { \
            if (expr) { \
              set_error(E_EOF); \
              retcmd; \
            } \
          }
841 
842 /*
843  * Internal Prototypes and miscellaneous variables.
844  */
845 static int intseqf_open(INTSEQFILE *, char *, char *);
846 static int intseqf_open_for_writing(INTSEQFILE *, char *, char *, char *);
847 static int determine_format(INTSEQFILE *);
848 static int resolve_offsets(INTSEQFILE *, char *, char *);
849 static void intseqf_close(INTSEQFILE *);
850 static int intseqf_read(INTSEQFILE *, int);
851 static char *intseqf_info(INTSEQFILE *, int, int);
852 
853 static int fp_get_line(INTSEQFILE *, char **, char **);
854 static int fp_read_more(INTSEQFILE *, char **, char **, char **);
855 static int fp_read_all(INTSEQFILE *);
856 
857 
/*
 * Month name tables:  three-letter upper-case abbreviations, full upper-case
 * names, and full capitalized names.  Slot 0 of each table is an empty
 * string, so that a month number from 1 through 12 can be used directly as
 * the index.
 */
static char *months[13] = { "", "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
                                "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
                          };
static char *full_months[13] = { "", "JANUARY", "FEBRUARY", "MARCH",
                                     "APRIL",   "MAY",      "JUNE",
                                     "JULY",    "AUGUST",   "SEPTEMBER",
                                     "OCTOBER", "NOVEMBER", "DECEMBER"
                               };
static char *gcg_full_months[13] = { "", "January", "February", "March",
                                         "April",   "May",      "June",
                                         "July",    "August",   "September",
                                         "October", "November", "December"
                                   };
871 
isamonth(char * s)872 static int isamonth(char *s)
873 {
874   int i;
875 
876   switch (toupper(*s)) {
877   case 'J': case 'F': case 'M': case 'A':
878   case 'S': case 'O': case 'N': case 'D':
879     for (i=1; i <= 12; i++)
880       if (mystreq(s, full_months[i][0], full_months[i]))
881         return i;
882 
883   default:
884     return 0;
885   }
886 }
887 
888 
889 
890 /*
891  *
892  *
893  * The File Input Procedures:
894  *     seqfopen, dbopen, seqfclose, seqfread, seqfgetseq, seqfgetent
895  *
896  * The File/Entry Access Procedures:
897  *     seqfformat, seqfsequence, seqfentry
898  *
899  *
900  *
901  */
902 
903 /*
904  * seqfopen
905  *
906  * Open the given file and return a structure which can be used to
907  * read or write sequence entries.  Similar to the fopen function,
908  * except that only the simple read, write and append modes are
909  * permitted, and an extra file format parameter is needed.
910  *
911  * Parameters:  filename  -  name of the file to be opened.
912  *              mode      -  opening mode, either "r", "w" or "a".
913  *              format    -  file format to use (could be NULL if mode is "r").
914  *
915  * Returns:  A SEQFILE structure.
916  */
seqfopen(char * filename,char * mode,char * format)917 SEQFILE *seqfopen(char *filename, char *mode, char *format)
918 {
919   int status;
920   INTSEQFILE *isfp;
921 
922   if (!ctype_initflag)
923     init_ctype();
924 
925   reset_errors();
926   param_error(filename == NULL, return NULL, "seqfopen", "arg 1 is NULL");
927   param_error(filename[0] == '\0', return NULL, "seqfopen",
928               "arg 1 is an empty string");
929   param_error(mode == NULL, return NULL, "seqfopen", "arg 2 is NULL");
930   param_error(mode[0] != 'r' && mode[0] != 'w' && mode[0] != 'a', return NULL,
931               "seqfopen", "arg 2 is not \"r\", \"w\" or \"a\"");
932   param_error((mode[0] == 'w' || mode[0] == 'a') && format == NULL,
933               return NULL, "seqfopen",
934               "arg 2 is \"w\" or \"a\", but no file format specified");
935   param_error(format != NULL && format[0] == '\0', return NULL, "seqfopen",
936               "arg 3 is an empty string");
937 
938   /*
939    * Allocate the sequence-file structure, and initialize all fields.
940    */
941   isfp = (INTSEQFILE *) malloc(sizeof(INTSEQFILE));
942   memory_error(isfp == NULL, return NULL);
943   memset(isfp, 0, sizeof(INTSEQFILE));
944   isfp->opstatus = OP_ACTIVE;
945 
946   /*
947    * Do the file opening and buffer allocation.
948    */
949   if (mode[0] == 'r') {
950     isfp->optype = OP_READ;
951     status = intseqf_open(isfp, filename, format);
952     if (status != STATUS_OK && status != STATUS_WARNING) {
953       intseqf_close(isfp);
954       return NULL;
955     }
956   }
957   else {
958     isfp->optype = OP_WRITE;
959     status = intseqf_open_for_writing(isfp, filename, format, mode);
960     if (status != STATUS_OK && status != STATUS_WARNING) {
961       intseqf_close(isfp);
962       return NULL;
963     }
964   }
965 
966   return (SEQFILE *) isfp;
967 }
968 
969 
970 /*
971  * seqfopendb
972  *
973  * Opens a whole database for reading, instead of just a single file.
974  * It uses the BIOSEQ procedures to get the database information.
975  *
976  * Parameters:   dbname   - name of the database (plus optional spec. string)
977  *
978  * Returns:  A SEQFILE structure.
979  */
seqfopendb(char * dbname)980 SEQFILE *seqfopendb(char *dbname)
981 {
982   int status, len;
983   char *s;
984   INTSEQFILE *isfp;
985 
986   if (!ctype_initflag)
987     init_ctype();
988 
989   reset_errors();
990   param_error(dbname == NULL, return NULL, "seqfopendb", "arg 1 is NULL");
991   param_error(dbname[0] == '\0', return NULL, "seqfopendb",
992               "arg 1 is an empty string");
993 
994   /*
995    * Allocate the sequence-file structure, and initialize all the fields.
996    */
997   isfp = (INTSEQFILE *) malloc(sizeof(INTSEQFILE));
998   memory_error(isfp == NULL, return NULL);
999   memset(isfp, 0, sizeof(INTSEQFILE));
1000   isfp->opstatus = OP_ACTIVE;
1001   isfp->optype = OP_DB;
1002 
1003   isfp->db_spec = mystrdup(dbname);
1004   if (isfp->db_spec == NULL) {
1005     free(isfp);
1006     memory_error(1, return NULL);
1007   }
1008 
1009   /*
1010    * Parse the database name (and specification), get the list of files,
1011    * the database name, the file format and the database title.
1012    */
1013   isfp->db_files = bioseq_parse(dbname);
1014   if (isfp->db_files == NULL) {
1015     free(isfp->db_spec);
1016     free(isfp);
1017     return NULL;
1018   }
1019   for (s=isfp->db_files; *s; s++)
1020     if (*s == '\n')
1021       *s = '\0';
1022 
1023   isfp->db_name = bioseq_info(dbname, "Name");
1024   isfp->db_format = bioseq_info(dbname, "Format");
1025   isfp->db_alpha = bioseq_info(dbname, "Alphabet");
1026   isfp->db_idprefix = bioseq_info(dbname, "IdPrefix");
1027 
1028   if (isfp->db_format != NULL && !seqfisaformat(isfp->db_format)) {
1029     set_error(E_INVFORMAT);
1030     print_error("`%s':  BIOSEQ Entry specifies invalid format `%s'.\n",
1031                 dbname, isfp->db_format);
1032     intseqf_close(isfp);
1033     return NULL;
1034   }
1035   if (isfp->db_idprefix != NULL) {
1036     for (s=isfp->db_idprefix,len=0; len < 6 && isalnum(*s); s++,len++)
1037       if (isupper(*s))
1038         *s = tolower(*s);
1039     if (len < 2 || len > 4 || (*s && !isalnum(*s))) {
1040       set_error(E_INVINFO);
1041       print_error("`%s':  BIOSEQ Entry specifies invalid id prefix `%s'.\n",
1042                   dbname, isfp->db_idprefix);
1043       intseqf_close(isfp);
1044       return NULL;
1045     }
1046   }
1047 
1048   /*
1049    * Open the first file, and allocate the buffers.
1050    */
1051   isfp->db_currentfile = isfp->db_files;
1052   status = intseqf_open(isfp, isfp->db_currentfile, isfp->db_format);
1053   switch (status) {
1054   case STATUS_OK:
1055   case STATUS_WARNING:
1056     return isfp;
1057 
1058   case STATUS_FATAL:
1059     intseqf_close(isfp);
1060     return NULL;
1061 
1062   case STATUS_EOF:
1063   case STATUS_ERROR:
1064     set_error(E_DBFILEERROR);
1065 
1066     /*
1067      * If searching a database, close the current file and goto the next
1068      * file in the list.  Return the eof signal if no more files appear
1069      * in the list.
1070      */
1071 #ifdef ISMAPABLE
1072     if (isfp->ismapped) {
1073       munmap(isfp->fp_buffer, isfp->mapsize);
1074       isfp->fp_buffer = NULL;
1075       isfp->fp_bufsize = 0;
1076       isfp->ismapped = 0;
1077     }
1078 #endif
1079     close_raw_file(isfp->input_fd);
1080     isfp->openflag = 0;
1081 
1082     /*
1083      * Construct the correct path to the next file, open the file and
1084      * recursively call seqfread to get the first entry in that file.
1085      */
1086     while (1) {
1087       for (s=isfp->db_currentfile; *s; s++) ;
1088       isfp->db_currentfile = ++s;
1089       if (*s == '\0') {
1090         intseqf_close(isfp);
1091         return NULL;
1092       }
1093 
1094       status = intseqf_open(isfp, isfp->db_currentfile, isfp->db_format);
1095       switch (status) {
1096       case STATUS_OK:
1097       case STATUS_WARNING:
1098         return isfp;
1099 
1100       case STATUS_FATAL:
1101         intseqf_close(isfp);
1102         return NULL;
1103 
1104       case STATUS_ERROR:
1105       case STATUS_EOF:
1106         set_error(E_DBFILEERROR);
1107         break;
1108 
1109       default:
1110         intseqf_close(isfp);
1111         status_error(return NULL, "seqfopendb");
1112       }
1113     }
1114 
1115   default:
1116     intseqf_close(isfp);
1117     status_error(return NULL, "seqfopendb");
1118   }
1119 }
1120 
1121 
1122 /*
1123  * seqfopen2
1124  *
1125  * A simple interface to open a file/database for reading.  A single
1126  * string is given as the argument.  If that string specifies a valid
1127  * file, then that file is opened.  Otherwise, the string is considered
1128  * a database name and the database is opened.
1129  *
1130  * Parameters:   str  -  either a filename or database string
1131  *
1132  * Returns:  A SEQFILE structure.
1133  */
seqfopen2(char * str)1134 SEQFILE *seqfopen2(char *str)
1135 {
1136   if (!ctype_initflag)
1137     init_ctype();
1138 
1139   if ((str[0] == '-' && str[1] == '\0') || seqfisafile(str))
1140     return seqfopen(str, "r", NULL);
1141   else if (bioseq_check(str))
1142     return seqfopendb(str);
1143   else {
1144     set_error(E_OPENFAILED);
1145     print_error("%s:  No such file or database exists.\n", str);
1146     return NULL;
1147   }
1148 }
1149 
1150 
1151 /*
1152  * seqfclose
1153  *
1154  * Close the open file pointer (if not stdin or stdout), and free the
1155  * SEQFILE structure (and its dynamically allocated elements).
1156  *
1157  * Parameters:   sfp  -  an opened SEQFILE structure
1158  *
1159  * Returns:  nothing
1160  */
seqfclose(SEQFILE * sfp)1161 void seqfclose(SEQFILE *sfp)
1162 {
1163   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1164 
1165   if (!ctype_initflag)
1166     init_ctype();
1167 
1168   reset_errors();
1169   param_error(isfp == NULL, return, "seqfclose", "arg 1 is NULL");
1170   param_error(isfp->opstatus == OP_FREED, return, "seqfclose",
1171               "arg 1 is already closed");
1172 
1173   intseqf_close(isfp);
1174 }
1175 
1176 
1177 /*
1178  * seqfread
1179  *
1180  * Read the next entry in the sequence file/database into memory.
1181  *
1182  * Parameters:   sfp  - an opened SEQFILE structure
1183  *               flag - non-zero says read the next entry, zero for just
1184  *                      the next sequence.
1185  *
1186  * Returns:  a 0 if the read was successful, a -1 on EOF or error.
1187  */
seqfread(SEQFILE * sfp,int flag)1188 int seqfread(SEQFILE *sfp, int flag)
1189 {
1190   int status, offset;
1191   char *s;
1192   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1193 
1194   if (!ctype_initflag)
1195     init_ctype();
1196 
1197   reset_errors();
1198   param_error(isfp == NULL, return -1, "seqfread", "arg 1 is NULL");
1199   param_error(isfp->opstatus == OP_FREED, return -1, "seqfread",
1200               "arg 1 is not an open SEQFILE");
1201   param_error(isfp->optype == OP_WRITE, return -1, "seqfread",
1202               "arg 1 is not open for reading");
1203 
1204   preverror_test(isfp->opstatus == OP_ERROR, return -1);
1205   eof_test(isfp->opstatus == OP_EOF, return -1);
1206 
1207   if (isfp->opstatus == OP_TEMPERR) {
1208     if (isfp->optype == OP_READ) {
1209       isfp->opstatus = OP_EOF;
1210       set_error(E_EOF);
1211       return -1;
1212     }
1213 
1214     while (1) {
1215       for (s=isfp->db_currentfile; *s; s++) ;
1216       isfp->db_currentfile = ++s;
1217       if (*s == '\0') {
1218         isfp->opstatus = OP_EOF;
1219         set_error(E_EOF);
1220         return -1;
1221       }
1222 
1223       status = intseqf_open(isfp, isfp->db_currentfile, isfp->db_format);
1224       switch (status) {
1225       case STATUS_OK:
1226       case STATUS_WARNING:
1227         isfp->opstatus = OP_ACTIVE;
1228         return 0;
1229 
1230       case STATUS_FATAL:
1231         isfp->opstatus = OP_ERROR;
1232         return -1;
1233 
1234       case STATUS_ERROR:
1235       case STATUS_EOF:
1236         set_error(E_DBFILEERROR);
1237         break;
1238 
1239       default:
1240         status_error(return -1, "seqfread");
1241       }
1242     }
1243   }
1244 
1245   /*
1246    * If we've already read the first entry (as part of the open), just return.
1247    */
1248   if (isfp->initreadflag) {
1249     isfp->initreadflag = 0;
1250     return 0;
1251   }
1252 
1253   /*
1254    * In the basic reading mode or if there are more sequences to read in the
1255    * current entry, just read the next sequence/entry.
1256    *
1257    * In the random access mode (accessing single entries in a file), first
1258    * get the byte offset of the next entry to read, seek to that offset,
1259    * reset all of the file pointers and then read that next entry.
1260    */
1261   if (!isfp->randaccessflag ||
1262       (!flag && isfp->entry_seqno < isfp->entry_numseqs))
1263     status = intseqf_read(isfp, flag);
1264   else {
1265     if (isfp->currentoffset == isfp->num_offsets)
1266       status = STATUS_EOF;
1267     else {
1268       offset = isfp->byteoffsets[isfp->currentoffset++];
1269       status = seek_raw_file(isfp->input_fd, offset);
1270       error_test(status != STATUS_OK, E_READFAILED, return -1,
1271                  print_error("%s:  %s\n", isfp->filename, sys_errlist[errno]));
1272 
1273       isfp->fp_bytepos = offset;
1274       isfp->fp_current = isfp->fp_top = isfp->fp_buffer;
1275       isfp->fp_buffer[0] = '\n';
1276       isfp->isendtagged = 1;
1277 
1278       isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
1279       isfp->entry_seqno = isfp->entry_numseqs = 0;
1280       status = intseqf_read(isfp, 1);
1281     }
1282   }
1283 
1284   switch (status) {
1285   case STATUS_OK:
1286   case STATUS_WARNING:
1287     return 0;
1288 
1289   case STATUS_EOF:
1290     if (isfp->optype == OP_READ) {
1291       set_error(E_EOF);
1292       isfp->opstatus = OP_EOF;
1293       return -1;
1294     }
1295 
1296     /*
1297      * If searching a database, close the current file and goto the next
1298      * file in the list.  Return the eof signal if no more files appear
1299      * in the list.
1300      */
1301 #ifdef ISMAPABLE
1302     if (isfp->ismapped) {
1303       munmap(isfp->fp_buffer, isfp->mapsize);
1304       isfp->fp_buffer = NULL;
1305       isfp->fp_bufsize = 0;
1306       isfp->ismapped = 0;
1307     }
1308 #endif
1309     close_raw_file(isfp->input_fd);
1310     isfp->openflag = 0;
1311 
1312     /*
1313      * Construct the correct path to the next file, open the file and
1314      * recursively call seqfread to get the first entry in that file.
1315      */
1316     while (1) {
1317       for (s=isfp->db_currentfile; *s; s++) ;
1318       isfp->db_currentfile = ++s;
1319       if (*s == '\0') {
1320         set_error(E_EOF);
1321         isfp->opstatus = OP_EOF;
1322         return -1;
1323       }
1324 
1325       status = intseqf_open(isfp, isfp->db_currentfile, isfp->db_format);
1326       isfp->initreadflag = 0;
1327       switch (status) {
1328       case STATUS_OK:
1329       case STATUS_WARNING:
1330         return 0;
1331 
1332       case STATUS_FATAL:
1333         isfp->opstatus = OP_ERROR;
1334         return -1;
1335 
1336       case STATUS_ERROR:
1337       case STATUS_EOF:
1338         set_error(E_DBFILEERROR);
1339         break;
1340 
1341       default:
1342         status_error(return -1, "seqfread");
1343       }
1344     }
1345     return -1;
1346 
1347   case STATUS_ERROR:
1348     isfp->opstatus = OP_TEMPERR;
1349     return -1;
1350 
1351   case STATUS_FATAL:
1352     isfp->opstatus = OP_ERROR;
1353     return -1;
1354 
1355   default:
1356     status_error(return -1, "seqfread");
1357   }
1358 }
1359 
1360 
1361 /*
1362  * seqfgetseq
1363  *
1364  * Read the next entry in the sequence file/database and then get it's
1365  * sequence.
1366  *
1367  * Parameters:  sfp         -  an opened SEQFILE structure
1368  *              length_out  -  the location to store the sequence length
1369  *                             (can be NULL)
1370  *              newbuffer   -  should a new, dynamically allocated buffer
1371  *                             be created to hold the sequence
1372  *
1373  * Returns:  the next entry's sequence.
1374  */
seqfgetseq(SEQFILE * sfp,int * length_out,int newbuffer)1375 char *seqfgetseq(SEQFILE *sfp, int *length_out, int newbuffer)
1376 {
1377   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1378 
1379   if (!ctype_initflag)
1380     init_ctype();
1381 
1382   reset_errors();
1383   param_error(isfp == NULL, return NULL, "seqfgetseq", "arg 1 is NULL");
1384   param_error(isfp->opstatus == OP_FREED, return NULL, "seqfgetseq",
1385               "arg 1 is not an open SEQFILE");
1386   param_error(isfp->optype == OP_WRITE, return NULL, "seqfgetseq",
1387               "arg 1 is not open for reading");
1388 
1389   if (seqfread(sfp, 0) == -1) {
1390     if (isfp->opstatus == OP_EOF || isfp->opstatus == OP_ERROR)
1391       return NULL;
1392     else {
1393       if (length_out != NULL)
1394         *length_out = 0;
1395       return "";
1396     }
1397   }
1398   return seqfsequence(sfp, length_out, newbuffer);
1399 }
1400 
1401 
1402 /*
1403  * seqfgetrawseq
1404  *
1405  * Read the next entry in the sequence file/database and then get it's
1406  * raw sequence.
1407  *
1408  * Parameters:  sfp         -  an opened SEQFILE structure
1409  *              length_out  -  the location to store the sequence length
1410  *                             (can be NULL)
1411  *              newbuffer   -  should a new, dynamically allocated buffer
1412  *                             be created to hold the sequence
1413  *
1414  * Returns:  the next entry's raw sequence.
1415  */
seqfgetrawseq(SEQFILE * sfp,int * length_out,int newbuffer)1416 char *seqfgetrawseq(SEQFILE *sfp, int *length_out, int newbuffer)
1417 {
1418   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1419 
1420   if (!ctype_initflag)
1421     init_ctype();
1422 
1423   reset_errors();
1424   param_error(isfp == NULL, return NULL, "seqfgetrawseq", "arg 1 is NULL");
1425   param_error(isfp->opstatus == OP_FREED, return NULL, "seqfgetrawseq",
1426               "arg 1 is not an open SEQFILE");
1427   param_error(isfp->optype == OP_WRITE, return NULL, "seqfgetrawseq",
1428               "arg 1 is not open for reading");
1429 
1430   if (seqfread(sfp, 0) == -1) {
1431     if (isfp->opstatus == OP_EOF || isfp->opstatus == OP_ERROR)
1432       return NULL;
1433     else {
1434       if (length_out != NULL)
1435         *length_out = 0;
1436       return "";
1437     }
1438   }
1439   return seqfrawseq(sfp, length_out, newbuffer);
1440 }
1441 
1442 
1443 /*
1444  * seqfgetentry
1445  *
1446  * Read the next entry in the sequence file/database and return it.
1447  *
1448  * Parameters:  sfp         -  an opened SEQFILE structure
1449  *              length_out  -  the location to store the entry's length
1450  *                             (can be NULL)
1451  *              newbuffer   -  should a new, dynamically allocated buffer
1452  *                             be created to hold the entry
1453  *
1454  * Returns:  the text of the next entry.
1455  */
seqfgetentry(SEQFILE * sfp,int * length_out,int newbuffer)1456 char *seqfgetentry(SEQFILE *sfp, int *length_out, int newbuffer)
1457 {
1458   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1459 
1460   if (!ctype_initflag)
1461     init_ctype();
1462 
1463   reset_errors();
1464   param_error(isfp == NULL, return NULL, "seqfgetentry", "arg 1 is NULL");
1465   param_error(isfp->opstatus == OP_FREED, return NULL, "seqfgetentry",
1466               "arg 1 is not an open SEQFILE");
1467   param_error(isfp->optype == OP_WRITE, return NULL, "seqfgetentry",
1468               "arg 1 is not open for reading");
1469 
1470   if (seqfread(sfp, 1) == -1) {
1471     if (isfp->opstatus == OP_EOF || isfp->opstatus == OP_ERROR)
1472       return NULL;
1473     else {
1474       if (length_out != NULL)
1475         *length_out = 0;
1476       return "";
1477     }
1478   }
1479   return seqfentry(sfp, length_out, newbuffer);
1480 }
1481 
1482 
1483 /*
1484  * seqfgetinfo
1485  *
1486  * Read the next entry in the sequence file/database, parse it and
1487  * return the information found in the entry.
1488  *
1489  * Parameters:  sfp         -  an opened SEQFILE structure
1490  *              newbuffer   -  should a new, dynamically allocated buffer
1491  *                             be created to hold the entry
1492  *
1493  * Returns:  the information about the next entry.
1494  */
seqfgetinfo(SEQFILE * sfp,int newbuffer)1495 SEQINFO *seqfgetinfo(SEQFILE *sfp, int newbuffer)
1496 {
1497   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1498 
1499   if (!ctype_initflag)
1500     init_ctype();
1501 
1502   reset_errors();
1503   param_error(isfp == NULL, return NULL, "seqfgetinfo", "arg 1 is NULL");
1504   param_error(isfp->opstatus == OP_FREED, return NULL, "seqfgetinfo",
1505               "arg 1 is not an open SEQFILE");
1506   param_error(isfp->optype == OP_WRITE, return NULL, "seqfgetinfo",
1507               "arg 1 is not open for reading");
1508 
1509   if (seqfread(sfp, 0) == -1)
1510     return NULL;
1511   return seqfinfo(sfp, newbuffer);
1512 }
1513 
1514 
1515 /*
1516  * seqfsequence & seqfrawseq
1517  *
1518  * Get the sequence for the current entry and return it.
1519  *
1520  * Parameters:  sfp         -  an opened SEQFILE structure
1521  *              length_out  -  the location to store the sequence length
1522  *                             (can be NULL)
1523  *              newbuffer   -  should a new, dynamically allocated buffer
1524  *                             be created to hold the sequence
1525  *              rawseqflag  -  GETSEQ_SEQUENCE for the basic sequence, or
1526  *                             GETSEQ_RAWSEQ for the raw seq.
1527  *                              (there is also GETSEQ_LENGTHS for the lengths
1528  *                               only, but that should not be used here)
1529  *              fnname      -  the name of the function.
1530  *
1531  * Returns:  the current entry's sequence.
1532  */
intseqf_seq(SEQFILE * sfp,int * length_out,int newbuffer,int rawseqflag,char * fnname)1533 char *intseqf_seq(SEQFILE *sfp, int *length_out, int newbuffer,
1534                   int rawseqflag, char *fnname)
1535 {
1536   int status;
1537   char *seq;
1538   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1539 
1540   if (!ctype_initflag)
1541     init_ctype();
1542 
1543   reset_errors();
1544   param_error(isfp == NULL, return NULL, fnname, "arg 1 is NULL");
1545   param_error(isfp->opstatus == OP_FREED, return NULL, fnname,
1546               "arg 1 is not an open SEQFILE");
1547   param_error(isfp->optype == OP_WRITE, return NULL, fnname,
1548               "arg 1 is not open for reading");
1549 
1550   preverror_test(isfp->opstatus == OP_ERROR || isfp->opstatus == OP_TEMPERR,
1551                  return NULL);
1552   eof_test(isfp->opstatus == OP_EOF, return NULL);
1553 
1554   /*
1555    * If the sequence has already been read and the sequence in the buffer
1556    * is the desired sequence, just return it (or a copy of it).
1557    */
1558   if (isfp->isseqcurrent && isfp->rawseqflag == rawseqflag) {
1559     if (length_out != NULL)
1560       *length_out = isfp->seqlen;
1561 
1562     if (!newbuffer)
1563       return isfp->seq;
1564     else {
1565       seq = (char *) malloc(isfp->seqlen + 1);
1566       memory_error(seq == NULL, return NULL);
1567       memcpy(seq, isfp->seq, isfp->seqlen + 1);
1568       return seq;
1569     }
1570   }
1571 
1572   /*
1573    * Otherwise, allocate space for the sequence, if necessary, and then
1574    * call the getseq function.
1575    */
1576   if (isfp->seq == NULL) {
1577     isfp->seq = (char *) malloc(INIT_SEQSIZE);
1578     memory_error(isfp->seq == NULL, return NULL);
1579     isfp->seqsize = INIT_SEQSIZE;
1580   }
1581   isfp->seq[0] = '\0';
1582   isfp->seqlen = 0;
1583 
1584   status = (*file_table[isfp->format].getseq_fn)(isfp, rawseqflag);
1585   switch (status) {
1586   case STATUS_OK:
1587   case STATUS_WARNING:
1588     isfp->isseqcurrent = 1;
1589     isfp->rawseqflag = rawseqflag;
1590     if (length_out != NULL)
1591       *length_out = isfp->seqlen;
1592     seq = isfp->seq;
1593     if (newbuffer) {
1594       isfp->seq = NULL;
1595       isfp->seqlen = isfp->seqsize = 0;
1596       isfp->isseqcurrent = 0;
1597     }
1598     return seq;
1599 
1600   case STATUS_ERROR:
1601     if (length_out != NULL)
1602       *length_out = 0;
1603     return "";
1604 
1605   case STATUS_FATAL:
1606     return NULL;
1607 
1608   default:
1609     status_error(return NULL, fnname);
1610   }
1611 }
1612 
/*
 * seqfsequence:  stub for intseqf_seq, requesting GETSEQ_SEQUENCE.
 */
char *seqfsequence(SEQFILE *sfp, int *length_out, int newbuffer)
{
  return intseqf_seq(sfp, length_out, newbuffer, GETSEQ_SEQUENCE,
                     "seqfsequence");
}
/*
 * seqfrawseq:  stub for intseqf_seq, requesting GETSEQ_RAWSEQ.
 */
char *seqfrawseq(SEQFILE *sfp, int *length_out, int newbuffer)
{
  return intseqf_seq(sfp, length_out, newbuffer, GETSEQ_RAWSEQ,
                     "seqfrawseq");
}
1619 
1620 
1621 /*
1622  * seqfentry
1623  *
1624  * Return the text for the current entry.
1625  *
1626  * Parameters:  sfp         -  an opened SEQFILE structure
1627  *              length_out  -  the location to store the sequence length
1628  *                             (can be NULL)
1629  *              newbuffer   -  should a new, dynamically allocated buffer
1630  *                             be created to hold the entry
1631  *
1632  * Returns:  the current entry's text.
1633  */
seqfentry(SEQFILE * sfp,int * length_out,int newbuffer)1634 char *seqfentry(SEQFILE *sfp, int *length_out, int newbuffer)
1635 {
1636   int len;
1637   char *buffer;
1638   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1639 
1640   if (!ctype_initflag)
1641     init_ctype();
1642 
1643   reset_errors();
1644   param_error(isfp == NULL, return NULL, "seqfentry", "arg 1 is NULL");
1645   param_error(isfp->opstatus == OP_FREED, return NULL, "seqfentry",
1646               "arg 1 is not an open SEQFILE");
1647   param_error(isfp->optype == OP_WRITE, return NULL, "seqfentry",
1648               "arg 1 is not open for reading");
1649 
1650   preverror_test(isfp->opstatus == OP_ERROR || isfp->opstatus == OP_TEMPERR,
1651                  return NULL);
1652   eof_test(isfp->opstatus == OP_EOF, return NULL);
1653 
1654   len = isfp->fp_entryend - isfp->fp_entrystart;
1655 
1656   /*
1657    * If the file is mapped and a copy of the current entry is
1658    * stored in the writable buffer "mapentflag", just use that copy.
1659    */
1660   if (isfp->ismapped && isfp->mapentflag) {
1661     buffer = isfp->mapentry;
1662     if (newbuffer) {
1663       isfp->mapentflag = 0;
1664       isfp->mapentsize = 0;
1665       isfp->mapentry = NULL;
1666     }
1667 
1668     if (length_out != NULL)
1669       *length_out = len;
1670     return buffer;
1671   }
1672 
1673   /*
1674    * If a new buffer is requested, malloc the space and copy the text.
1675    * Otherwise, if the file is mapped, copy it into a writable buffer,
1676    * If not, just return a pointer into the internal buffer after
1677    * '\0'-terminating the entry.
1678    */
1679   if (newbuffer) {
1680     buffer = (char *) malloc(len + 1);
1681     memory_error(buffer == NULL, return NULL);
1682     memcpy(buffer, isfp->fp_entrystart, len);
1683     buffer[len] = '\0';
1684 
1685     if (length_out != NULL)
1686       *length_out = len;
1687     return buffer;
1688   }
1689   else if (isfp->ismapped) {
1690     if (isfp->mapentsize < len + 1) {
1691       isfp->mapentsize += len + 1;
1692       if (isfp->mapentry == NULL)
1693         isfp->mapentry = (char *) malloc(isfp->mapentsize);
1694       else
1695         isfp->mapentry = (char *) realloc(isfp->mapentry, isfp->mapentsize);
1696 
1697       if (isfp->mapentry == NULL) {
1698         isfp->mapentsize = 0;
1699         memory_error(1, return NULL);
1700       }
1701     }
1702     memcpy(isfp->mapentry, isfp->fp_entrystart, len);
1703     isfp->mapentry[len] = '\0';
1704     isfp->mapentflag = 1;
1705 
1706     if (length_out != NULL)
1707       *length_out = len;
1708     return isfp->mapentry;
1709   }
1710   else {
1711     if (*isfp->fp_entryend != '\0') {
1712       if (isfp->savech_loc != NULL)
1713         *isfp->savech_loc = isfp->savech;
1714 
1715       isfp->savech_loc = isfp->fp_entryend;
1716       isfp->savech = *isfp->fp_entryend;
1717       *isfp->fp_entryend = '\0';
1718     }
1719 
1720     if (length_out != NULL)
1721       *length_out = len;
1722     return isfp->fp_entrystart;
1723   }
1724 }
1725 
1726 
1727 /*
1728  * intseqf_field1
1729  *
1730  * The implementation of seqfinfo, seqfallinfo, seqfdate, seqfmainid,
1731  * seqfmainacc, seqfidlist, seqfdescription, seqfcomment and seqforganism.
1732  * The stub functions for those functions occur after the code for
1733  * intseqf_field1.
1734  *
1735  * Parameters:  sfp         -  an opened SEQFILE structure
1736  *              newbuffer   -  should a new, dynamically allocated buffer
1737  *                             be created to hold the sequence
1738  *              fnname      -  the name of the stub function
1739  *              field       -  the requested information field
1740  *
1741  * Returns:  the requested string (or SEQINFO structure)
1742  */
intseqf_field1(SEQFILE * sfp,int newbuffer,char * fnname,int field)1743 static char *intseqf_field1(SEQFILE *sfp, int newbuffer, char *fnname,
1744                             int field)
1745 {
1746   char *s, *t, *id, *idend, *idlist, *temp;
1747   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1748 
1749   if (!ctype_initflag)
1750     init_ctype();
1751 
1752   reset_errors();
1753   param_error(isfp == NULL, return NULL, fnname, "arg 1 is NULL");
1754   param_error(isfp->opstatus == OP_FREED, return NULL, fnname,
1755               "arg 1 is not an open SEQFILE");
1756   param_error(isfp->optype == OP_WRITE, return NULL, fnname,
1757               "arg 1 is not open for reading");
1758 
1759   preverror_test(isfp->opstatus == OP_ERROR || isfp->opstatus == OP_TEMPERR,
1760                  return NULL);
1761   eof_test(isfp->opstatus == OP_EOF, return NULL);
1762 
1763   if (field != SEQINFO_MAINID && field != SEQINFO_MAINACC)
1764     return intseqf_info(isfp, newbuffer, field);
1765   else {
1766     idlist = intseqf_info(isfp, 0, SEQINFO_IDLIST);
1767     if (idlist == NULL)
1768       return NULL;
1769 
1770     id = idend = NULL;
1771     for (s=idlist; *s; ) {
1772       for (t=s; *s && *s != '|'; s++) ;
1773       if ((field == SEQINFO_MAINID && !mystreq(t, 'A', "ACC:")) ||
1774           (field == SEQINFO_MAINACC && mystreq(t, 'A', "ACC:"))) {
1775         id = t;
1776         idend = s;
1777         break;
1778       }
1779       if (*s) s++;
1780     }
1781     if (id == NULL && field == SEQINFO_MAINID)
1782       for (id=idend=idlist; *idend && *idend == '|'; idend++) ;
1783 
1784     if (id == NULL)
1785       return NULL;
1786 
1787     temp = mystrdup2(id, idend);
1788     memory_error(temp == NULL, return NULL);
1789 
1790     if (!newbuffer) {
1791       if (isfp->idbuffer != NULL)
1792         free(isfp->idbuffer);
1793       isfp->idbuffer = temp;
1794     }
1795 
1796     return temp;
1797   }
1798 }
1799 
seqfinfo(SEQFILE * sfp,int newbuffer)1800 SEQINFO *seqfinfo(SEQFILE *sfp, int newbuffer)
1801 {  return (SEQINFO *) intseqf_field1(sfp, newbuffer, "seqfinfo",
1802                                      SEQINFO_ALL); }
seqfallinfo(SEQFILE * sfp,int newbuffer)1803 SEQINFO *seqfallinfo(SEQFILE *sfp, int newbuffer)
1804 {  return (SEQINFO *) intseqf_field1(sfp, newbuffer, "seqfallinfo",
1805                                      SEQINFO_ALLINFO); }
/*
 * seqfdate:  stub for intseqf_field1, requesting SEQINFO_DATE.
 */
char *seqfdate(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfdate", SEQINFO_DATE);
}
/*
 * seqfmainid:  stub for intseqf_field1, requesting SEQINFO_MAINID.
 */
char *seqfmainid(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfmainid", SEQINFO_MAINID);
}
/*
 * seqfmainacc:  stub for intseqf_field1, requesting SEQINFO_MAINACC.
 */
char *seqfmainacc(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfmainacc", SEQINFO_MAINACC);
}
/*
 * seqfidlist:  stub for intseqf_field1, requesting SEQINFO_IDLIST.
 */
char *seqfidlist(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfidlist", SEQINFO_IDLIST);
}
/*
 * seqfdescription:  stub for intseqf_field1, requesting SEQINFO_DESCRIPTION.
 */
char *seqfdescription(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfdescription",
                        SEQINFO_DESCRIPTION);
}
/*
 * seqfcomment:  stub for intseqf_field1, requesting SEQINFO_COMMENT.
 */
char *seqfcomment(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqfcomment", SEQINFO_COMMENT);
}
/*
 * seqforganism:  stub for intseqf_field1, requesting SEQINFO_ORGANISM.
 */
char *seqforganism(SEQFILE *sfp, int newbuffer)
{
  return intseqf_field1(sfp, newbuffer, "seqforganism", SEQINFO_ORGANISM);
}
1821 
1822 
1823 /*
1824  * intseqf_field2
1825  *
1826  * The implementation of seqfdbname, filename and format.  The stub
1827  * functions for those functions occur after the code for intseqf_field2.
1828  *
1829  * Parameters:  sfp         -  an opened SEQFILE structure
1830  *              newbuffer   -  should a new, dynamically allocated buffer
1831  *                             be created to hold the sequence
1832  *              fnname      -  the name of the stub function
1833  *              field       -  the requested information field
1834  *
1835  * Returns:  the requested string
1836  */
intseqf_field2(SEQFILE * sfp,int newbuffer,char * fnname,int field)1837 static char *intseqf_field2(SEQFILE *sfp, int newbuffer, char *fnname,
1838                             int field)
1839 {
1840   int i;
1841   char *s;
1842   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1843 
1844   if (!ctype_initflag)
1845     init_ctype();
1846 
1847   reset_errors();
1848   param_error(isfp == NULL, return NULL, fnname, "arg 1 is NULL");
1849   param_error(isfp->opstatus == OP_FREED, return NULL, fnname,
1850               "arg 1 is not an open SEQFILE");
1851 
1852   preverror_test(isfp->opstatus == OP_ERROR || isfp->opstatus == OP_TEMPERR,
1853                  return NULL);
1854   eof_test(isfp->opstatus == OP_EOF, return NULL);
1855 
1856   s = NULL;
1857   switch (field) {
1858   case SEQINFO_DBNAME:    s = isfp->db_name;  break;
1859   case SEQINFO_FILENAME:  s = isfp->filename;  break;
1860   case SEQINFO_FORMAT:
1861     switch (isfp->format) {
1862     case FORMAT_GBFAST:  s = file_table[FORMAT_GENBANK].ident;  break;
1863     case FORMAT_PIRFAST:  s = file_table[FORMAT_PIR].ident;  break;
1864     case FORMAT_EMBLFAST:  s = file_table[FORMAT_EMBL].ident;  break;
1865     case FORMAT_SPFAST:  s = file_table[FORMAT_SPROT].ident;  break;
1866     case FORMAT_GCG:
1867       if (isfp->gcg_subformat == FORMAT_UNKNOWN)
1868         s = "GCG";
1869       else {
1870         for (i=0; i < gcg_table_size; i++)
1871           if (isfp->gcg_subformat == gcg_table[i].format)
1872             break;
1873         s = (i < gcg_table_size ? gcg_table[i].ident : "GCG");
1874       }
1875       break;
1876 
1877     default:
1878       s = file_table[isfp->format].ident;
1879       break;
1880     }
1881   }
1882 
1883   if (s != NULL && newbuffer) {
1884     s = mystrdup(s);
1885     memory_error(s == NULL, return NULL);
1886   }
1887 
1888   return s;
1889 }
1890 
/*
 * seqfdbname  -  public stub:  asks intseqf_field2 for the
 * SEQINFO_DBNAME field and returns that result.
 */
char *seqfdbname(SEQFILE *sfp, int newbuffer)
{
  char *value;

  value = intseqf_field2(sfp, newbuffer, "seqfdbname", SEQINFO_DBNAME);
  return value;
}
/*
 * seqffilename  -  public stub:  asks intseqf_field2 for the
 * SEQINFO_FILENAME field and returns that result.
 */
char *seqffilename(SEQFILE *sfp, int newbuffer)
{
  char *value;

  value = intseqf_field2(sfp, newbuffer, "seqffilename", SEQINFO_FILENAME);
  return value;
}
/*
 * seqfformat  -  public stub:  asks intseqf_field2 for the
 * SEQINFO_FORMAT field and returns that result.
 */
char *seqfformat(SEQFILE *sfp, int newbuffer)
{
  char *value;

  value = intseqf_field2(sfp, newbuffer, "seqfformat", SEQINFO_FORMAT);
  return value;
}
1897 
1898 
1899 /*
1900  * intseqf_field3
1901  *
1902  * The implementation of seqfisfragment, seqfiscircular, seqfalphabet,
1903  * seqftruelen, seqfentryno, seqfseqno and seqfnumseqs.  The stub functions
1904  * for those functions occur after the code for intseqf_field3.
1905  *
1906  * Parameters:  sfp         -  an opened SEQFILE structure
1907  *              fnname      -  the name of the stub function
1908  *              field       -  the requested information field
1909  *
1910  * Returns:  the requested string
1911  */
intseqf_field3(SEQFILE * sfp,char * fnname,int field)1912 static int intseqf_field3(SEQFILE *sfp, char *fnname, int field)
1913 {
1914   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
1915 
1916   if (!ctype_initflag)
1917     init_ctype();
1918 
1919   reset_errors();
1920   param_error(isfp == NULL, return 0, fnname, "arg 1 is NULL");
1921   param_error(isfp->opstatus == OP_FREED, return 0, fnname,
1922               "arg 1 is not an open SEQFILE");
1923   param_error(isfp->optype == OP_WRITE, return 0, fnname,
1924               "arg 1 is not open for reading");
1925 
1926   preverror_test(isfp->opstatus == OP_ERROR || isfp->opstatus == OP_TEMPERR,
1927                  return 0);
1928   eof_test(isfp->opstatus == OP_EOF, return 0);
1929 
1930   if (field == SEQINFO_ENTRYNO ||
1931       field == SEQINFO_SEQNO || field == SEQINFO_NUMSEQS ||
1932       (field == SEQINFO_TRUELEN && isfp->iflag_truelen) ||
1933       (field == SEQINFO_RAWLEN && isfp->iflag_rawlen))
1934     ;
1935   else if (intseqf_info(isfp, 0, field) == NULL)
1936     return 0;
1937 
1938   switch (field) {
1939   case SEQINFO_FRAGMENT:  return isfp->info->isfragment;
1940   case SEQINFO_CIRCULAR:  return isfp->info->iscircular;
1941   case SEQINFO_ALPHABET:  return isfp->info->alphabet;
1942   case SEQINFO_STARTPOS:  return isfp->info->fragstart;
1943   case SEQINFO_TRUELEN:   return isfp->entry_truelen;
1944   case SEQINFO_RAWLEN:    return isfp->entry_rawlen;
1945   case SEQINFO_ENTRYNO:   return isfp->entry_count;
1946   case SEQINFO_SEQNO:     return isfp->entry_seqno;
1947   case SEQINFO_NUMSEQS:   return isfp->entry_numseqs;
1948   default:                return 0;
1949   }
1950 }
1951 
/*
 * seqfisfragment  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_FRAGMENT field (0 is also what comes back on an error).
 */
int seqfisfragment(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfisfragment", SEQINFO_FRAGMENT);
  return value;
}
/*
 * seqfiscircular  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_CIRCULAR field (0 is also what comes back on an error).
 */
int seqfiscircular(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfiscircular", SEQINFO_CIRCULAR);
  return value;
}
/*
 * seqfalphabet  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_ALPHABET field (0 is also what comes back on an error).
 */
int seqfalphabet(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfalphabet", SEQINFO_ALPHABET);
  return value;
}
/*
 * seqffragstart  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_STARTPOS field (0 is also what comes back on an error).
 */
int seqffragstart(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqffragstart", SEQINFO_STARTPOS);
  return value;
}
/*
 * seqftruelen  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_TRUELEN field (0 is also what comes back on an error).
 */
int seqftruelen(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqftruelen", SEQINFO_TRUELEN);
  return value;
}
/*
 * seqfrawlen  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_RAWLEN field (0 is also what comes back on an error).
 */
int seqfrawlen(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfrawlen", SEQINFO_RAWLEN);
  return value;
}
/*
 * seqfentryno  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_ENTRYNO field (0 is also what comes back on an error).
 */
int seqfentryno(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfentryno", SEQINFO_ENTRYNO);
  return value;
}
/*
 * seqfseqno  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_SEQNO field (0 is also what comes back on an error).
 */
int seqfseqno(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfseqno", SEQINFO_SEQNO);
  return value;
}
/*
 * seqfnumseqs  -  public stub:  asks intseqf_field3 for the
 * SEQINFO_NUMSEQS field (0 is also what comes back on an error).
 */
int seqfnumseqs(SEQFILE *sfp)
{
  int value;

  value = intseqf_field3(sfp, "seqfnumseqs", SEQINFO_NUMSEQS);
  return value;
}
1970 
1971 
1972 /*
1973  * seqfoneline
1974  *
1975  * Constructs a one-line description of the information given
1976  * in the SEQINFO structure.  That description is stored in the
1977  * given buffer.
1978  *
1979  * The description will always be NULL-terminated, and is guaranteed
1980  * to fit within the buffer length (including the NULL character).
1981  *
1982  * Parameter:  info      -  A SEQINFO structure containing information
1983  *             buffer    -  A character buffer to store the description
1984  *             buflen    -  The length of the buffer
1985  *             idonly    -  just return an identifier for the string
1986  *
1987  * Returns:  The length of the constructed description, or -1 on error.
1988  */
seqfoneline(SEQINFO * info,char * buffer,int buflen,int idonly)1989 int seqfoneline(SEQINFO *info, char *buffer, int buflen, int idonly)
1990 {
1991   int len, idlen, descrlen, orglen, taillen, fraglen;
1992   int flag, totallen, trunclen, templen, descrflag, orgflag;
1993   int midlen, descrslack, orgslack;
1994   char *s, *t, *t2, *t3, *t4, tailbuf[32], fragbuf[64];
1995 
1996   if (!ctype_initflag)
1997     init_ctype();
1998 
1999   reset_errors();
2000   param_error(info == NULL, return -1, "seqfoneline", "arg 1 is NULL");
2001   param_error(buffer == NULL, return -1, "seqfoneline", "arg 2 is NULL");
2002   param_error(buflen <= 0, return -1, "seqfoneline",
2003               "arg 3 is not a positive length");
2004 
2005   /*
2006    * Take the case of just an identifier separately.  Return one of
2007    * the following (in order):  idlist, description, organism.
2008    */
2009   if (idonly) {
2010     if (info->idlist && info->idlist[0]) {
2011       s = info->idlist;
2012       for (t=s,len=0; *t && *t != '|'; t++,len++) ;
2013 
2014       if (buflen - 1 < len || mystreq(t, 'O', "OTH:")) {
2015         for ( ; s < t && *s != ':'; s++,len--) ;
2016         if (*s && s[1]) {
2017           s++;  len--;
2018         }
2019         else {
2020           s = info->idlist;
2021           len = t - s;
2022         }
2023       }
2024     }
2025     else if (((s = info->description) && info->description[0]) ||
2026              ((s = info->organism) && info->organism[0]))
2027       for (t=s,len=0; *t && !isspace(*t); t++,len++) ;
2028     else
2029       return 0;
2030 
2031     if (buflen - 1 < len)
2032       len = buflen - 1;
2033 
2034     memcpy(buffer, s, len);
2035     buffer[len] = '\0';
2036     return len;
2037   }
2038 
2039   /*
2040    * The general algorithm, first add one or two identifiers, compute the
2041    * lengths of the description, organism name, seqlen string and fragment
2042    * string.  Then, do one of the following things:
2043    *
2044    *    1) If only the identifiers fit, just add them.
2045    *    2) If everything fits, add everything.
2046    *    3) If 30 chars of the description, 20 chars of the organism and
2047    *       both the seqlen and fragment string fits, do a construction
2048    *       truncating the description and organism.
2049    *    4) Fill things in left to right until less than 15 chars available.
2050    */
2051   s = buffer;
2052   len = buflen - 1;
2053 
2054   /*
2055    * First, add one or two identifiers from idlist.
2056    */
2057   if (info->idlist && info->idlist[0]) {
2058     for (t2=t=info->idlist; *t2 && *t2 != '|'; t2++) ;
2059     idlen = t2 - t;
2060 
2061     if (len < idlen) {
2062       for ( ; t < t2 && *t != ':'; t++,idlen--) ;
2063       if (t + 1 < t2) {
2064         t++; idlen--;
2065       }
2066       else {
2067         t = info->idlist;
2068         idlen = t2 - t;
2069       }
2070 
2071       if (len < idlen)
2072         idlen = len;
2073 
2074       if (idlen < len && t != info->idlist &&
2075           mystreq(info->idlist, 'A', "ACC:")) {
2076         *s++ = '~';  len--;
2077       }
2078       memcpy(s, t, idlen);
2079       s[idlen] = '\0';
2080 
2081       s += idlen;
2082       len -= idlen;
2083     }
2084     else if (!*t2) {
2085       strcpy(s, t);
2086       s += idlen;
2087       len -= idlen;
2088     }
2089     else {
2090       for (t3=t4=t2+1; *t4 && *t4 != '|'; t4++) ;
2091       error_test(t3 == t4, E_INVINFO, return -1,
2092                  print_error("seqfoneline:  `%s':  Invalid identifier "
2093                              "format.\n", info->idlist));
2094 
2095       if (len < t4 - t || len < 128) {
2096         memcpy(s, t, t2 - t);
2097         s[t2 - t] = '\0';
2098         s += t2 - t;
2099         len -= t2 - t;
2100       }
2101       else {
2102         memcpy(s, t, t4 - t);
2103         s += t4 - t;
2104         len -= t4 - t;
2105       }
2106     }
2107   }
2108 
2109   /*
2110    * Construct the seqlen and fragment strings.
2111    */
2112   descrlen = orglen = 0;
2113   if (info->description && info->description[0])
2114     descrlen = strlen(info->description);
2115   if (info->organism && info->organism[0])
2116     orglen = strlen(info->organism);
2117 
2118   taillen = fraglen = 0;
2119   if (info->truelen > 0) {
2120     sprintf(tailbuf, ", %d %s", info->truelen,
2121             (info->alphabet == PROTEIN ? "aa"
2122                 : (info->alphabet == DNA || info->alphabet == RNA ? "bp"
2123                                                                   : "ch")));
2124     taillen = strlen(tailbuf);
2125 
2126     if (info->isfragment || info->iscircular ||
2127         info->alphabet == DNA || info->alphabet == RNA) {
2128       t = fragbuf;
2129       *t++ = '(';
2130       flag = 0;
2131       if (info->iscircular) {
2132         strcpy(t, "circular");
2133         t += 8;
2134         flag = 1;
2135       }
2136       if (info->alphabet == DNA || info->alphabet == RNA) {
2137         if (flag)
2138           *t++ = ' ';
2139         strcpy(t, (info->alphabet == DNA ? "DNA" : "RNA"));
2140         t += 3;
2141         flag = 1;
2142       }
2143       if (info->isfragment) {
2144         if (info->fragstart > 0) {
2145           if (flag) {
2146             *t++ = ',';
2147             *t++ = ' ';
2148           }
2149           sprintf(t, "f. %d-%d", info->fragstart,
2150                   info->fragstart + info->truelen - 1);
2151           while (*t) t++;
2152         }
2153         else {
2154           if (flag)
2155             *t++ = ' ';
2156           strcpy(t, "fragment");
2157           t += 8;
2158         }
2159       }
2160       *t++ = ')';
2161       *t = '\0';
2162       fraglen = t - fragbuf;
2163     }
2164   }
2165 
2166   /*
2167    * Decide whether to do the truncated construction or the left to
2168    * right construction.
2169    */
2170   totallen = (descrlen ? descrlen + 1 : 0) +
2171              (orglen ? orglen + 3 : 0) + taillen +
2172              (fraglen ? fraglen + 1 : 0) + 1;
2173   trunclen = (descrlen < 28 ? descrlen + 2 : 30) +
2174              (orglen < 16 ? orglen + 4 : 20) + taillen +
2175              (fraglen ? fraglen + 1 : 0) + 1;
2176   if (totallen > len && trunclen <= len) {
2177     templen = len - taillen - (fraglen ? fraglen + 1 : 0) - 1;
2178     descrflag = orgflag = 0;
2179     if (descrlen && orglen) {
2180       midlen = templen * 6 / 10;
2181       descrslack = (descrlen < midlen - 1 ? midlen - descrlen - 1 : 0);
2182       orgslack = (orglen < templen - midlen - 3
2183                     ? (templen - midlen) - orglen - 3 : 0);
2184 
2185       if (descrlen > midlen + orgslack - 1) {
2186         descrlen = midlen + orgslack - 2;
2187         descrflag = 1;
2188       }
2189       if (orglen > (templen - midlen) + descrslack - 3) {
2190         orglen = (templen - midlen) + descrslack - 4;
2191         orgflag = 1;
2192       }
2193     }
2194     else if (descrlen && descrlen > templen - 1) {
2195       descrlen = templen - 2;
2196       descrflag = 1;
2197     }
2198     else if (orglen && orglen > templen - 3) {
2199       orglen = templen - 4;
2200       orgflag = 1;
2201     }
2202 
2203     if (descrlen) {
2204       *s++ = ' ';  len--;
2205       memcpy(s, info->description, descrlen);
2206       s += descrlen;
2207       len -= descrlen;
2208       if (descrflag) {
2209         *s++ = '*';  len--;
2210       }
2211     }
2212     if (orglen) {
2213       *s++ = ' ';  len--;
2214       *s++ = '-';  len--;
2215       *s++ = ' ';  len--;
2216       memcpy(s, info->organism, orglen);
2217       s += orglen;
2218       len -= orglen;
2219       if (orgflag) {
2220         *s++ = '*';  len--;
2221       }
2222     }
2223 
2224   }
2225   else {
2226     if (len >= 15 && descrlen > 0) {
2227       if (descrlen + 1 > len)
2228         descrlen = len - 2;
2229       *s++ = ' ';  len--;
2230       memcpy(s, info->description, descrlen);
2231       s += descrlen;
2232       len -= descrlen;
2233     }
2234     if (len >= 15 && orglen > 0) {
2235       if (orglen + 3 > len)
2236         orglen = len - 4;
2237       *s++ = ' ';  len--;
2238       *s++ = '-';  len--;
2239       *s++ = ' ';  len--;
2240       memcpy(s, info->organism, orglen);
2241       s += orglen;
2242       len -= orglen;
2243     }
2244   }
2245 
2246   if (len >= taillen) {
2247     memcpy(s, tailbuf, taillen);
2248     s += taillen;
2249     len -= taillen;
2250 
2251     if (fraglen > 0 && len >= fraglen + 1) {
2252       *s++ = ' ';  len--;
2253       memcpy(s, fragbuf, fraglen);
2254       s += fraglen;
2255       len -= fraglen;
2256     }
2257     if (len > 0) {
2258       *s++ = '.';
2259       len--;
2260     }
2261   }
2262 
2263   *s = '\0';
2264   return s - buffer;
2265 }
2266 
2267 
2268 /*
2269  * seqfsetidpref
2270  *
2271  * Sets the identifier prefix used when reading entries.
2272  *
2273  * Parameters:  sfp         -  an opened SEQFILE structure
2274  *
2275  * Returns:  nothing
2276  */
seqfsetidpref(SEQFILE * sfp,char * idprefix)2277 void seqfsetidpref(SEQFILE *sfp, char *idprefix)
2278 {
2279   int len;
2280   char *s;
2281   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2282 
2283   if (!ctype_initflag)
2284     init_ctype();
2285 
2286   reset_errors();
2287   param_error(isfp == NULL, return, "seqfsetidpref", "arg 1 is NULL");
2288   param_error(isfp->opstatus == OP_FREED, return, "seqfsetidpref",
2289               "arg 1 is not an open SEQFILE");
2290   param_error(isfp->optype == OP_WRITE, return, "seqfsetidpref",
2291               "arg 1 is not open for reading");
2292   if (idprefix != NULL) {
2293     for (len=0,s=idprefix; len < 6 && *s; s++) ;
2294     param_error(len == 1, return, "seqfsetidpref",
2295                 "arg 2 is too short to be a valid identifier prefix");
2296     param_error(len > 4, return, "seqfsetidpref",
2297                 "arg 2 is too long to be a valid identifier prefix");
2298   }
2299 
2300   preverror_test(isfp->opstatus == OP_ERROR, return);
2301   eof_test(isfp->opstatus == OP_EOF, return);
2302 
2303   /*
2304    * Free an old idprefix, and then make a copy of the new one.
2305    */
2306   if (isfp->db_idprefix != NULL)
2307     free(isfp->db_idprefix);
2308   isfp->db_idprefix = NULL;
2309 
2310   if (idprefix != NULL && idprefix[0] != '\0') {
2311     isfp->db_idprefix = mystrdup(idprefix);
2312     for (s=isfp->db_idprefix; *s; s++)
2313       *s = tolower(*s);
2314     memory_error(isfp->db_idprefix == NULL, return);
2315   }
2316 }
2317 
2318 
2319 /*
2320  * seqfsetdbname
2321  *
2322  * Sets the database name used when reading entries.
2323  *
2324  * Parameters:  sfp         -  an opened SEQFILE structure
2325  *
2326  * Returns:  nothing
2327  */
seqfsetdbname(SEQFILE * sfp,char * dbname)2328 void seqfsetdbname(SEQFILE *sfp, char *dbname)
2329 {
2330   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2331 
2332   if (!ctype_initflag)
2333     init_ctype();
2334 
2335   reset_errors();
2336   param_error(isfp == NULL, return, "seqfsetdbname", "arg 1 is NULL");
2337   param_error(isfp->opstatus == OP_FREED, return, "seqfsetdbname",
2338               "arg 1 is not an open SEQFILE");
2339   param_error(isfp->optype == OP_WRITE, return, "seqfsetdbname",
2340               "arg 1 is not open for reading");
2341 
2342   preverror_test(isfp->opstatus == OP_ERROR, return);
2343   eof_test(isfp->opstatus == OP_EOF, return);
2344 
2345   /*
2346    * Free an old dbname, and then make a copy of the new one.
2347    */
2348   if (isfp->db_name != NULL)
2349     free(isfp->db_name);
2350   isfp->db_name = NULL;
2351 
2352   if (dbname != NULL && dbname[0] != '\0') {
2353     isfp->db_name = mystrdup(dbname);
2354     memory_error(isfp->db_name == NULL, return);
2355   }
2356 }
2357 
2358 
2359 /*
2360  * seqfsetalpha
2361  *
2362  * Sets the alphabet used when reading entries.
2363  *
2364  * Parameters:  sfp         -  an opened SEQFILE structure
2365  *
2366  * Returns:  nothing
2367  */
seqfsetalpha(SEQFILE * sfp,char * alphabet)2368 void seqfsetalpha(SEQFILE *sfp, char *alphabet)
2369 {
2370   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2371 
2372   if (!ctype_initflag)
2373     init_ctype();
2374 
2375   reset_errors();
2376   param_error(isfp == NULL, return, "seqfsetalpha", "arg 1 is NULL");
2377   param_error(isfp->opstatus == OP_FREED, return, "seqfsetalpha",
2378               "arg 1 is not an open SEQFILE");
2379   param_error(isfp->optype == OP_WRITE, return, "seqfsetalpha",
2380               "arg 1 is not open for reading");
2381 
2382   preverror_test(isfp->opstatus == OP_ERROR, return);
2383   eof_test(isfp->opstatus == OP_EOF, return);
2384 
2385   /*
2386    * Free an old alphabet, and then make a copy of the new one.
2387    */
2388   if (isfp->db_alpha != NULL)
2389     free(isfp->db_alpha);
2390   isfp->db_alpha = NULL;
2391 
2392   if (alphabet != NULL && alphabet[0] != '\0') {
2393     isfp->db_alpha = mystrdup(alphabet);
2394     memory_error(isfp->db_alpha == NULL, return);
2395   }
2396 }
2397 
2398 
2399 /*
2400  *
2401  * seqfwrite
2402  *
2403  * Writes the given sequence and information to the output file
2404  * (in the specified format).
2405  *
2406  * Parameters:   sfp     - a SEQFILE structure opened for writing
2407  *               seq     - a sequence
2408  *               seqlen  - the sequence's length
2409  *               info    - information about the sequence
2410  *
2411  * Returns:  a 0 if the write was successful, a -1 on an error.
2412  *                                             (seqferrno is set on an error)
2413  */
seqfwrite(SEQFILE * sfp,char * seq,int seqlen,SEQINFO * info)2414 int seqfwrite(SEQFILE *sfp, char *seq, int seqlen, SEQINFO *info)
2415 {
2416   int status;
2417   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2418 
2419   if (!ctype_initflag)
2420     init_ctype();
2421 
2422   reset_errors();
2423   param_error(isfp == NULL, return -1, "seqfwrite", "arg 1 is NULL");
2424   param_error(isfp->opstatus == OP_FREED, return -1, "seqfwrite",
2425               "arg 1 is not an open SEQFILE");
2426   param_error(isfp->optype != OP_WRITE, return -1, "seqfwrite",
2427               "arg 1 is not open for writing");
2428   param_error(seq == NULL, return -1, "seqfwrite", "arg 2 is NULL");
2429   param_error(seqlen <= 0, return -1, "seqfwrite",
2430               "arg 3 is less than or equal to 0");
2431   param_error(info == NULL, return -1, "seqfwrite", "arg 4 is NULL");
2432 
2433   preverror_test(isfp->opstatus == OP_ERROR, return -1);
2434   eof_test(isfp->opstatus == OP_EOF, return -1);
2435 
2436   status = (*file_table[isfp->format].putseq_fn)(isfp, seq, seqlen, info);
2437   switch (status) {
2438   case STATUS_OK:
2439   case STATUS_WARNING:
2440     return 0;
2441 
2442   case STATUS_EOF:
2443   case STATUS_ERROR:
2444   case STATUS_FATAL:
2445     isfp->opstatus = OP_ERROR;
2446     return -1;
2447 
2448   default:
2449     status_error(return -1, "seqfwrite");
2450   }
2451 }
2452 
2453 
2454 /*
2455  *
2456  * seqfconvert
2457  *
2458  * Write the sequence and entry information of the current entry to
2459  * the output file (in the specified format).
2460  *
2461  * Parameters:   sfpin  - a SEQFILE structure opened for reading
2462  *               sfpout - a SEQFILE structure opened for writing
2463  *
2464  * Returns:  a 0 if the read was successful, a -1 on an error.
2465  *                                            (seqferrno is set on an error)
2466  */
seqfconvert(SEQFILE * sfpin,SEQFILE * sfpout)2467 int seqfconvert(SEQFILE *sfpin, SEQFILE *sfpout)
2468 {
2469   int seqlen, status;
2470   char *seq;
2471   SEQINFO *info;
2472   INTSEQFILE *isfpin, *isfpout;
2473 
2474   isfpin = (INTSEQFILE *) sfpin;
2475   isfpout = (INTSEQFILE *) sfpout;
2476 
2477   if (!ctype_initflag)
2478     init_ctype();
2479 
2480   reset_errors();
2481   param_error(isfpin == NULL, return -1, "seqfconvert", "arg 1 is NULL");
2482   param_error(isfpin->opstatus == OP_FREED, return -1, "seqfconvert",
2483               "arg 1 is not an open SEQFILE");
2484   param_error(isfpin->optype == OP_WRITE, return -1, "seqfconvert",
2485               "arg 1 is not open for reading");
2486   param_error(isfpout == NULL, return -1, "seqfconvert", "arg 2 is NULL");
2487   param_error(isfpout->opstatus == OP_FREED, return -1, "seqfconvert",
2488               "arg 2 is not an open SEQFILE");
2489   param_error(isfpout->optype != OP_WRITE, return -1, "seqfconvert",
2490               "arg 1 is not open for writing");
2491 
2492   preverror_test(isfpin->opstatus == OP_ERROR ||
2493                  isfpin->opstatus == OP_TEMPERR, return -1);
2494   eof_test(isfpin->opstatus == OP_EOF, return -1);
2495 
2496   /*
2497    * Get the current sequence and info, then call the putseq function.
2498    */
2499   if (isfpin->isseqcurrent && !isfpin->rawseqflag) {
2500     seq = isfpin->seq;
2501     seqlen = isfpin->seqlen;
2502   }
2503   else {
2504     if ((seq = seqfsequence(sfpin, &seqlen, 0)) == NULL || seqlen == 0)
2505       return -1;
2506   }
2507 
2508   if (isfpin->istatus == INFO_ALL)
2509     info = isfpin->info;
2510   else {
2511     if ((info = seqfinfo(sfpin, 0)) == NULL)
2512       return -1;
2513   }
2514 
2515   status = (*file_table[isfpout->format].putseq_fn)(isfpout, seq, seqlen,
2516                                                     info);
2517   switch (status) {
2518   case STATUS_OK:
2519   case STATUS_WARNING:
2520     return 0;
2521 
2522   case STATUS_EOF:
2523   case STATUS_ERROR:
2524   case STATUS_FATAL:
2525     isfpout->opstatus = OP_ERROR;
2526     return -1;
2527 
2528   default:
2529     status_error(return -1, "seqfconvert");
2530   }
2531 }
2532 
2533 
2534 /*
2535  *
2536  * seqfputs
2537  *
2538  * Writes the given string to the output file
2539  *
2540  * Parameters:   sfp  - a SEQFILE structure opened for writing
2541  *               s    - a sequence
2542  *               len  - the sequence's length
2543  *
2544  * Returns:  a 0 if the write was successful, a -1 on an error.
2545  *                                             (seqferrno is set on an error)
2546  */
seqfputs(SEQFILE * sfp,char * s,int len)2547 int seqfputs(SEQFILE *sfp, char *s, int len)
2548 {
2549   int status;
2550   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2551 
2552   if (!ctype_initflag)
2553     init_ctype();
2554 
2555   reset_errors();
2556   param_error(isfp == NULL, return -1, "seqfputs", "arg 1 is NULL");
2557   param_error(isfp->opstatus == OP_FREED, return -1, "seqfputs",
2558               "arg 1 is not an open SEQFILE");
2559   param_error(isfp->optype != OP_WRITE, return -1, "seqfputs",
2560               "arg 1 is not open for writing");
2561   param_error(s == NULL, return -1, "seqfputs", "arg 2 is NULL");
2562   param_error(len < 0, return -1, "seqfputs", "arg 3 is less than 0");
2563 
2564   preverror_test(isfp->opstatus == OP_ERROR, return -1);
2565   eof_test(isfp->opstatus == OP_EOF, return -1);
2566 
2567   if (len == 0) {
2568     fputs(s, isfp->output_fp);
2569     return 0;
2570   }
2571   else {
2572     status = fwrite(s, len, 1, isfp->output_fp);
2573     return (status == 1 ? 0 : -1);
2574   }
2575 }
2576 
2577 
2578 /*
2579  * seqfannotate
2580  *
2581  * This function adds extra comment text to an entry, as it's outputting that
2582  * entry.
2583  *
2584  * Parameters:      sfp        -  a SEQFILE structure open for writing
2585  *                  entry      -  an entry
2586  *                  entrylen   -  the length of the entry
2587  *                  newcomment -  the new comment to add to the entry
2588  *                  flag       -  should an existing comment be retained
2589  *
2590  * Returns:  0 on success and -1 on failure
2591  */
seqfannotate(SEQFILE * sfp,char * entry,int entrylen,char * newcomment,int flag)2592 int seqfannotate(SEQFILE *sfp, char *entry, int entrylen, char *newcomment,
2593                  int flag)
2594 {
2595   int status;
2596   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
2597 
2598   if (!ctype_initflag)
2599     init_ctype();
2600 
2601   reset_errors();
2602   param_error(isfp == NULL, return -1, "seqfannotate", "arg 1 is NULL");
2603   param_error(isfp->opstatus == OP_FREED, return -1, "seqfannotate",
2604               "arg 1 is not an open SEQFILE");
2605   param_error(isfp->optype != OP_WRITE, return -1, "seqfannotate",
2606               "arg 1 is not open for writing");
2607   param_error(entry == NULL, return -1, "seqfannotate", "arg 2 is NULL");
2608   param_error(entrylen <= 0, return -1, "seqfannotate",
2609               "arg 3 is less than or equal to 0");
2610   param_error(newcomment == NULL, return -1, "seqfannotate", "arg 4 is NULL");
2611   param_error(newcomment[0] == '\0', return -1, "seqfannotate",
2612               "arg 4 is an empty string");
2613 
2614   error_test(file_table[isfp->format].annotate_fn == NULL,
2615              E_PARAMERROR, return -1,
2616              print_error("%s:  Cannot annotate entries in this file format.\n",
2617                          file_table[isfp->format].ident));
2618 
2619   /*
2620    * Call the annotate function.
2621    */
2622   status = (*file_table[isfp->format].annotate_fn)(isfp->output_fp, entry,
2623                                                    entrylen, newcomment, flag);
2624   switch (status) {
2625   case STATUS_OK:
2626   case STATUS_WARNING:
2627     return 0;
2628 
2629   case STATUS_ERROR:
2630   case STATUS_FATAL:
2631     return -1;
2632 
2633   default:
2634     status_error(return -1, "seqfannotate");
2635   }
2636 }
2637 
2638 
2639 /*
2640  * seqfgcgify
2641  *
2642  * Convert an entry into a GCG entry and output the converted text to
2643  * the given file.
2644  *
2645  * Parameters:  sfp       - A SEQFILE pointer open for writing
2646  *              entry     - The entry
2647  *              entrylen  - The entry's length
2648  *
2649  * Returns:  0 on success, -1 on EOF or error.
2650  */
seqfgcgify(SEQFILE * sfpout,char * entry,int entrylen)2651 int seqfgcgify(SEQFILE *sfpout, char *entry, int entrylen)
2652 {
2653   int i, j, k, count, seqlen, checksum, alpha;
2654   int fmt, dna, others, oldpe, status;
2655   char ch, *s, *seq, *start, *end, *mainid, *date, buffer[128];
2656   FILE *fp;
2657   INTSEQFILE isfbuffer, *isfp;
2658   INTSEQFILE *isfpout = (INTSEQFILE *) sfpout;
2659 
2660   if (!ctype_initflag)
2661     init_ctype();
2662 
2663   reset_errors();
2664   param_error(isfpout == NULL, return -1, "seqfgcgify", "arg 1 is NULL");
2665   param_error(isfpout->opstatus == OP_FREED, return -1, "seqfgcgify",
2666               "arg 1 is not an open SEQFILE");
2667   param_error(isfpout->optype != OP_WRITE, return -1, "seqfgcgify",
2668               "arg 1 is not open for writing");
2669   param_error(entry == NULL, return -1, "seqfgcgify", "arg 2 is NULL");
2670   param_error(entrylen <= 0, return -1, "seqfgcgify",
2671               "arg 3 is less than or equal to 0");
2672 
2673   error_test(isfpout->format != FORMAT_GCG, E_INVFORMAT, return -1,
2674              print_error("%s:  seqfgcgify:  Cannot write GCG entry when "
2675                          "output format specified as %s.\n",
2676                          isfpout->filename, seqfformat(isfpout, 0)));
2677 
2678   if (isfpout->entry_count != 0)
2679     return -1;
2680 
2681   /*
2682    * Construct an INTSEQFILE structure for the "read" procedure to
2683    * use while parsing the entry.
2684    */
2685   oldpe = pe_flag;
2686   pe_flag = PE_NONE;
2687 
2688   isfp = &isfbuffer;
2689   memset(isfp, 0, sizeof(INTSEQFILE));
2690   isfp->optype = OP_READ;
2691   isfp->opstatus = OP_ACTIVE;
2692   isfp->filename = "";
2693   isfp->fp_buffer = isfp->fp_current = entry;
2694   isfp->fp_top = entry + entrylen;
2695 
2696   if (isfpout->gcg_subformat != FORMAT_UNKNOWN)
2697     isfp->format = isfpout->gcg_subformat;
2698   else {
2699     status = determine_format(isfp);
2700     if (status != STATUS_OK && status != STATUS_WARNING) {
2701       pe_flag = oldpe;
2702       raise_error(E_PARSEERROR, return -1,
2703                   print_error("%s:  seqfgcgify:  Parse error while scanning "
2704                               "entry to output.\n", isfpout->filename));
2705     }
2706   }
2707 
2708   /*
2709    * Reset the error mode to not perform any error actions and then
2710    * call the read function to parse the entry.
2711    */
2712   status = (*file_table[isfp->format].read_fn)(isfp, 0);
2713   pe_flag = oldpe;
2714 
2715   error_test(status != STATUS_OK && status != STATUS_WARNING,
2716              E_PARSEERROR, return -1,
2717              print_error("%s:  seqfgcgify:  Parse error while scanning "
2718                          "entry to output.\n", isfpout->filename));
2719 
2720   error_test(isfp->fp_seqstart == NULL, E_PARSEERROR, return -1,
2721              print_error("%s:  seqfgcgify:  Parse error while scanning "
2722                          "entry to output.\n", isfpout->filename));
2723 
2724   /*
2725    * Output the header lines.
2726    */
2727   isfpout->entry_count++;
2728   fp = isfpout->output_fp;
2729   fmt = isfp->format;
2730 
2731   if (isfp->format == FORMAT_GCG) {
2732     fwrite(entry, entrylen, 1, fp);
2733     return STATUS_OK;
2734   }
2735 
2736   fwrite(isfp->fp_entrystart, isfp->fp_seqstart - isfp->fp_entrystart, 1, fp);
2737   if ((fmt == FORMAT_NBRF || fmt == FORMAT_NBRFOLD) && isfp->nbrf_header) {
2738     fwrite(isfp->nbrf_header, isfp->fp_entryend - isfp->nbrf_header, 1, fp);
2739     if (*(isfp->fp_entryend - 1) != '\n')
2740       fputc('\n', fp);
2741   }
2742 
2743   /*
2744    * Find the sequence length, the alphabet type and the checksum.
2745    */
2746   start = isfp->fp_seqstart;
2747   end = isfp->fp_entryend;
2748   if (fmt == FORMAT_GENBANK || fmt == FORMAT_PIR ||
2749       fmt == FORMAT_SPROT || fmt == FORMAT_EMBL) {
2750     if (*(end - 1) == '\n')
2751       end--;
2752     while (*(end - 1) != '\n') end--;
2753   }
2754   else if (fmt == FORMAT_NBRF && isfp->nbrf_header &&
2755            isfp->nbrf_header < isfp->fp_entryend)
2756     end = isfp->nbrf_header;
2757 
2758   checksum = seqlen = 0;
2759   dna = others = 0;
2760   for (s=start; s < end; s++) {
2761     if (!isspace(*s) && !isdigit(*s)) {
2762       if (((fmt == FORMAT_NBRF || fmt == FORMAT_NBRFOLD) && *s == '*') ||
2763           ((fmt == FORMAT_STANFORD || fmt == FORMAT_STANFORDOLD) &&
2764            (*s == '1' || *s == '2')))
2765         break;
2766       else if (fmt == FORMAT_PIR && !isalpha(*s))
2767         continue;
2768 
2769       ch = (*s == '-' ? '.' : toupper(*s));
2770       checksum += (seqlen % 57 + 1) * ch;
2771       seqlen++;
2772 
2773       if (ch == 'A' || ch == 'G' || ch == 'C' || ch == 'T' || ch == 'U')
2774         dna++;
2775       else if (!isalpha(*s))
2776         others++;
2777     }
2778   }
2779   checksum %= 10000;
2780 
2781   if (((float) dna / (float) (seqlen - others)) >= 0.85)
2782     alpha = DNA;
2783   else
2784     alpha = PROTEIN;
2785 
2786   /*
2787    * Output the gcg info line.
2788    */
2789   fputc('\n', fp);
2790   fputs("  ", fp);
2791   if ((mainid = seqfmainid((SEQFILE *) isfp, 0)) != NULL)
2792     fputs(mainid, fp);
2793   fputs("  ", fp);
2794 
2795   fprintf(fp, "Length: %d  ", seqlen);
2796 
2797   date = get_today();
2798   for (i=1; i <= 12; i++)
2799     if (myncasecmp(date+3, months[i], 3) == 0)
2800       break;
2801   if (i <= 12)
2802     fprintf(fp, "%s %c%c, %s %s  ", gcg_full_months[i], date[0], date[1],
2803             date+7, date+12);
2804 
2805   if (alpha == DNA)
2806     fputs("Type: N  ", fp);
2807   else if (alpha == PROTEIN)
2808     fputs("Type: P  ", fp);
2809 
2810   fprintf(fp, "Check: %d  ..\n\n", checksum);
2811 
2812   /*
2813    * Print the gcg sequence lines.
2814    */
2815   s = NULL;
2816   j = k = 0;
2817   count = 1;
2818   for (seq=start; seq < end; seq++) {
2819     if (!isspace(*seq) && !isdigit(*seq)) {
2820       if (((fmt == FORMAT_NBRF || fmt == FORMAT_NBRFOLD) && *seq == '*') ||
2821           ((fmt == FORMAT_STANFORD || fmt == FORMAT_STANFORDOLD) &&
2822            (*seq == '1' || *seq == '2')))
2823         break;
2824       else if (fmt == FORMAT_PIR && !isalpha(*seq))
2825         continue;
2826 
2827       if (j == 0 && k == 0) {
2828         sprintf(buffer, "%8d  ", count);
2829         s = buffer + 9;
2830       }
2831 
2832       *s++ = (*seq == '-' ? '.' : *seq);
2833       count++;
2834 
2835       if (++k == 10) {
2836         *s++ = ' ';
2837         k = 0;
2838         if (++j == 5) {
2839           *s++ = '\n';
2840           *s++ = '\n';
2841           *s = '\0';
2842           fputs(buffer, fp);
2843           j = 0;
2844         }
2845       }
2846     }
2847   }
2848   if (j != 0 || k != 0) {
2849     *s++ = '\n';
2850     *s++ = '\n';
2851     *s = '\0';
2852     fputs(buffer, fp);
2853   }
2854 
2855   /*
2856    * Free up the SEQINFO structure that may have been allocated when
2857    * the main identifier was extracted.
2858    */
2859   if (isfp->info != NULL)
2860     free(isfp->info);
2861 
2862   return 0;
2863 }
2864 
2865 
2866 /*
2867  * seqfungcgify
2868  *
2869  * Convert an entry from a GCG entry and output the converted text to
2870  * the given file.
2871  *
2872  * Parameters:  sfp       - A SEQFILE structure open for writing
2873  *              entry     - The entry
2874  *              entrylen  - The entry's length
2875  *
2876  * Returns:  0 on success, -1 on error.
2877  */
seqfungcgify(SEQFILE * sfpout,char * entry,int entrylen)2878 int seqfungcgify(SEQFILE *sfpout, char *entry, int entrylen)
2879 {
2880   int i, j, k, count, fmt, status, oldpe;
2881   char ch, *s, *seq, *end;
2882   FILE *fp;
2883   INTSEQFILE isfbuffer, *isfp;
2884   INTSEQFILE *isfpout = (INTSEQFILE *) sfpout;
2885 
2886   if (!ctype_initflag)
2887     init_ctype();
2888 
2889   reset_errors();
2890   param_error(isfpout == NULL, return -1, "seqfungcgify", "arg 1 is NULL");
2891   param_error(isfpout->opstatus == OP_FREED, return -1, "seqfungcgify",
2892               "arg 1 is not an open SEQFILE");
2893   param_error(isfpout->optype != OP_WRITE, return -1, "seqfungcgify",
2894               "arg 1 is not open for writing");
2895   param_error(entry == NULL, return -1, "seqfungcgify", "arg 2 is NULL");
2896   param_error(entrylen <= 0, return -1, "seqfungcgify",
2897               "arg 3 is less than or equal to 0");
2898 
2899   for (i=0; i < gcg_table_size; i++)
2900     if (isfpout->format == gcg_table[i].format &&
2901         gcg_table[i].format != FORMAT_MSF)
2902       break;
2903   error_test(i == gcg_table_size && isfpout->format != FORMAT_RAW,
2904              E_INVFORMAT, return -1,
2905              print_error("%s:  seqfungcgify:  Cannot convert GCG entry to "
2906                          "output format %s.\n", isfpout->filename,
2907                          seqfformat(isfpout, 0)));
2908 
2909   /*
2910    * Construct an INTSEQFILE structure for the "read" procedure to
2911    * use while parsing the entry.
2912    */
2913   isfp = &isfbuffer;
2914   memset(isfp, 0, sizeof(INTSEQFILE));
2915   isfp->optype = OP_READ;
2916   isfp->opstatus = OP_ACTIVE;
2917   isfp->filename = "";
2918 
2919   isfp->format = FORMAT_GCG;
2920   isfp->gcg_subformat = isfpout->format;
2921   isfp->fp_buffer = isfp->fp_current = entry;
2922   isfp->fp_top = entry + entrylen;
2923 
2924   /*
2925    * Reset the error mode to not perform any error actions and then
2926    * call the read function to parse the entry.
2927    */
2928   oldpe = pe_flag;
2929   pe_flag = PE_NONE;
2930   status = (*file_table[isfpout->format].read_fn)(isfp, 0);
2931   pe_flag = oldpe;
2932 
2933   error_test(status != STATUS_OK && status != STATUS_WARNING,
2934              E_PARSEERROR, return -1,
2935              print_error("%s:  seqfungcgify:  Parse error while scanning "
2936                          "entry to output.\n", isfpout->filename));
2937 
2938   error_test(isfp->fp_seqstart == NULL || isfp->gcg_infoline == NULL,
2939              E_PARSEERROR, return -1,
2940              print_error("%s:  seqfungcgify:  Parse error while scanning "
2941                          "entry to output.\n", isfpout->filename));
2942 
2943   /*
2944    * Output the header lines.
2945    */
2946   fp = isfpout->output_fp;
2947   fmt = isfpout->format;
2948   isfpout->entry_count++;
2949 
2950   if (fmt != FORMAT_RAW && fmt != FORMAT_PLAIN) {
2951     if (fmt != FORMAT_NBRF || isfp->nbrf_header == NULL)
2952       for (s=isfp->gcg_infoline; *(s-2) == '\n'; s--) ;
2953     else
2954       s = isfp->nbrf_header;
2955 
2956     fwrite(isfp->fp_entrystart, s - isfp->fp_entrystart, 1, fp);
2957   }
2958 
2959   /*
2960    * Output the sequence.
2961    */
2962   seq = isfp->fp_seqstart;
2963   end = isfp->fp_entryend;
2964 
2965   switch (fmt) {
2966   case FORMAT_RAW:
2967     for ( ; seq < end; seq++) {
2968       if (*seq == '>' || *seq == '<' || *seq == '$') {
2969         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
2970       }
2971       else if (!isspace(*seq) && !isdigit(*seq)) {
2972         ch = (*seq == '.' ? '-' : *seq);
2973         fputc(ch, fp);
2974       }
2975     }
2976     break;
2977 
2978   case FORMAT_PLAIN:
2979   case FORMAT_FASTA:
2980   case FORMAT_FASTAOLD:
2981   case FORMAT_STANFORD:
2982   case FORMAT_STANFORDOLD:
2983   case FORMAT_NBRF:
2984   case FORMAT_NBRFOLD:
2985     count = 0;
2986     for ( ; seq < end; seq++) {
2987       if (*seq == '>' || *seq == '<' || *seq == '$') {
2988         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
2989       }
2990       else if (!isspace(*seq) && !isdigit(*seq)) {
2991         ch = (*seq == '.' ? '-' : *seq);
2992 
2993         if (count == 60) {
2994           fputc('\n', fp);
2995           count = 0;
2996         }
2997         fputc(ch, fp);
2998         count++;
2999       }
3000     }
3001 
3002     if (fmt == FORMAT_STANFORD || fmt == FORMAT_STANFORDOLD)
3003       fputc('1', fp);
3004     else if (fmt == FORMAT_NBRF || fmt == FORMAT_NBRFOLD)
3005       fputc('*', fp);
3006 
3007     if (count != 0)
3008       fputc('\n', fp);
3009 
3010     if ((fmt == FORMAT_NBRF || fmt == FORMAT_NBRFOLD) &&
3011         isfp->nbrf_header != NULL) {
3012       for (s=isfp->gcg_infoline; *(s-2) == '\n'; s--) ;
3013       fwrite(isfp->nbrf_header, s - isfp->nbrf_header, 1, fp);
3014     }
3015     break;
3016 
3017   case FORMAT_GENBANK:
3018     j = k = count = 0;
3019     for ( ; seq < end; seq++) {
3020       if (*seq == '>' || *seq == '<' || *seq == '$') {
3021         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
3022       }
3023       else if (!isspace(*seq) && !isdigit(*seq)) {
3024         ch = (*seq == '.' ? '-' : *seq);
3025 
3026         if (j == 0 && k == 0)
3027           fprintf(fp, "   %6d", count+1);
3028         if (k == 0)
3029           fputc(' ', fp);
3030 
3031         fputc(ch, fp);
3032         count++;
3033 
3034         if (++k == 10) {
3035           k = 0;
3036           if (++j == 6) {
3037             fputc('\n', fp);
3038             j = 0;
3039           }
3040         }
3041       }
3042     }
3043     if (j != 0 || k != 0)
3044       fputc('\n', fp);
3045     fputs("//\n", fp);
3046     break;
3047 
3048   case FORMAT_PIR:
3049     fputs("                5        10        15"
3050           "        20        25        30\n", fp);
3051     j = count = 0;
3052     for ( ; seq < end; seq++) {
3053       if (*seq == '>' || *seq == '<' || *seq == '$') {
3054         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
3055       }
3056       else if (!isspace(*seq) && !isdigit(*seq)) {
3057         ch = (*seq == '.' ? '-' : *seq);
3058 
3059         if (j == 0)
3060           fprintf(fp, "%7d", count + 1);
3061 
3062         fputc(' ', fp);
3063         fputc(ch, fp);
3064         count++;
3065 
3066         if (++j == 30) {
3067           j = 0;
3068           fputc('\n', fp);
3069         }
3070       }
3071     }
3072     if (j != 0)
3073       fputc('\n', fp);
3074     fputs("///\n", fp);
3075     break;
3076 
3077   case FORMAT_EMBL:
3078     j = k = count = 0;
3079     for ( ; seq < end; seq++) {
3080       if (*seq == '>' || *seq == '<' || *seq == '$') {
3081         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
3082       }
3083       else if (!isspace(*seq) && !isdigit(*seq)) {
3084         ch = (*seq == '.' ? '-' : *seq);
3085 
3086         if (j == 0 && k == 0)
3087           fputs("    ", fp);
3088         if (k == 0)
3089           fputc(' ', fp);
3090 
3091         fputc(ch, fp);
3092         count++;
3093 
3094         if (++k == 10) {
3095           k = 0;
3096           if (++j == 6) {
3097             fprintf(fp, "%10d\n", count);
3098             j = 0;
3099           }
3100         }
3101       }
3102     }
3103     if (j != 0 || k != 0) {
3104       while (j != 0 && k != 0) {
3105         if (k == 0)
3106           fputc(' ', fp);
3107         fputc(' ', fp);
3108         if (++k == 10) {
3109           k = 0;
3110           if (++j == 6) {
3111             fprintf(fp, "%10d\n", count);
3112             j = 0;
3113           }
3114         }
3115       }
3116     }
3117     fputs("//\n", fp);
3118     break;
3119 
3120   case FORMAT_SPROT:
3121     j = k = 0;
3122     for ( ; seq < end; seq++) {
3123       if (*seq == '>' || *seq == '<' || *seq == '$') {
3124         for (ch=*seq++; seq < end && *seq != ch; seq++) ;
3125       }
3126       else if (!isspace(*seq) && !isdigit(*seq)) {
3127         ch = (*seq == '.' ? '-' : *seq);
3128 
3129         if (j == 0 && k == 0)
3130           fputs("    ", fp);
3131         if (k == 0)
3132           fputc(' ', fp);
3133 
3134         fputc(ch, fp);
3135 
3136         if (++k == 10) {
3137           k = 0;
3138           if (++j == 6) {
3139             fputc('\n', fp);
3140             j = 0;
3141           }
3142         }
3143       }
3144     }
3145     if (j != 0 || k != 0)
3146       fputc('\n', fp);
3147     fputs("//\n", fp);
3148     break;
3149 
3150   default:
3151     status_error(return -1, "seqfungcgify");
3152   }
3153 
3154   return 0;
3155 }
3156 
3157 
3158 /*
3159  * seqfparseent
3160  *
3161  * This function parses a given entry and constructs a SEQINFO structure
3162  * containing the information about that entry.
3163  *
3164  * Parameters:      entry     -  an entry
3165  *                  entrylen  -  the length of the entry
3166  *                  format    -  the file format for the entry
3167  *
3168  * Returns:  a SEQINFO structure, or NULL on an error
3169  */
seqfparseent(char * entry,int entrylen,char * format)3170 SEQINFO *seqfparseent(char *entry, int entrylen, char *format)
3171 {
3172   int i, status, gcgflag;
3173   INTSEQFILE istruct;
3174   INTSEQFILE *isfp;
3175 
3176   if (!ctype_initflag)
3177     init_ctype();
3178 
3179   reset_errors();
3180   param_error(entry == NULL, return NULL, "seqfparseent", "arg 1 is NULL");
3181   param_error(entrylen <= 0, return NULL, "seqfparseent",
3182               "arg 2 is less than or equal to 0");
3183   param_error(format == NULL, return NULL, "seqfparseent", "arg 3 is NULL");
3184   param_error(format[0] == '\0', return NULL, "seqfparseent",
3185               "arg 3 is an empty string");
3186 
3187   error_test(!seqfcanparseent(format), E_PARAMERROR, return NULL,
3188              print_error("%s:  Cannot parse entries in this file format.\n",
3189                          format));
3190 
3191   /*
3192    * Figure out the specified format.
3193    */
3194   if (myncasecmp(format, "GCG-", 4) == 0) {
3195     for (i=0; i < gcg_table_size; i++)
3196       if (mycasecmp(format+4, gcg_table[i].ident+4) == 0)
3197         break;
3198     i = gcg_table[i].format;
3199     gcgflag = 1;
3200   }
3201   else {
3202     for (i=0; i < file_table_size; i++)
3203       if (mycasecmp(format, file_table[i].ident) == 0)
3204         break;
3205     i = file_table[i].format;
3206     gcgflag = 0;
3207   }
3208 
3209   /*
3210    * Create a dummy INTSEQFILE struct to give to the getinfo function.
3211    */
3212   isfp = &istruct;
3213   memset(isfp, 0, sizeof(INTSEQFILE));
3214 
3215   isfp->optype = OP_READ;
3216   isfp->filename = "";
3217   if (!gcgflag) {
3218     isfp->format = i;
3219     isfp->gcg_subformat = FORMAT_UNKNOWN;
3220   }
3221   else {
3222     isfp->format = FORMAT_GCG;
3223     isfp->gcg_subformat = i;
3224   }
3225 
3226   isfp->infosize = isfp->infobufsize = sizeof(SEQINFO);
3227   isfp->info = (SEQINFO *) malloc(isfp->infobufsize);
3228   memory_error(isfp->info == NULL, return NULL);
3229   memset(isfp->info, isfp->infosize, 0);
3230 
3231   isfp->iflag_rawlen = isfp->iflag_truelen = 1;
3232 
3233   /*
3234    * Call the getinfo function.
3235    */
3236   status = (*file_table[i].getinfo_fn)(isfp, entry, entrylen, SEQINFO_ALL);
3237   switch (status) {
3238   case STATUS_OK:
3239   case STATUS_WARNING:
3240     return isfp->info;
3241 
3242   case STATUS_ERROR:
3243   case STATUS_FATAL:
3244     if (isfp->info != NULL)
3245       free(isfp->info);
3246     return NULL;
3247 
3248   default:
3249     status_error(return NULL, "seqfparseent");
3250   }
3251 }
3252 
3253 
3254 /*
3255  * seqfsetpretty
3256  *
3257  * This function sets the pretty flag used by the Plain, FASTA, NBRF and
3258  * IG/Stanford putseq functions.
3259  *
3260  * Parameters:      sfp    -  a SEQFILE structure open for writing
3261  *                  value  -  value to set the pretty flag to
3262  *
3263  * Returns:  nothing.
3264  */
seqfsetpretty(SEQFILE * sfp,int value)3265 void seqfsetpretty(SEQFILE *sfp, int value)
3266 {
3267   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
3268 
3269   if (!ctype_initflag)
3270     init_ctype();
3271 
3272   reset_errors();
3273   param_error(isfp == NULL, return, "seqfsetpretty", "arg 1 is NULL");
3274   param_error(isfp->opstatus == OP_FREED, return, "seqfsetpretty",
3275               "arg 1 is not an open SEQFILE");
3276   param_error(isfp->optype != OP_WRITE, return, "seqfsetpretty",
3277               "arg 1 is not open for writing");
3278 
3279   isfp->prettyflag = (value != 0 ? 2 : 1);
3280 }
3281 
3282 
3283 /*
3284  * seqfisaformat
3285  *
3286  * This function tests whether the given string is a valid file format.
3287  *
3288  * Parameters:      format  -  a file format string
3289  *
3290  * Returns:  non-zero if the string is a file format, zero otherwise.
3291  */
seqfisaformat(char * format)3292 int seqfisaformat(char *format)
3293 {
3294   int i;
3295 
3296   if (!ctype_initflag)
3297     init_ctype();
3298 
3299   reset_errors();
3300   param_error(format == NULL, return 0, "seqfisaformat", "arg 1 is NULL");
3301   param_error(format[0] == '\0', return 0, "seqfisaformat",
3302               "arg 1 is an empty string");
3303 
3304   /*
3305    * Test the format string.
3306    */
3307   if (myncasecmp(format, "GCG-", 4) == 0) {
3308     for (i=0; i < gcg_table_size; i++)
3309       if (mycasecmp(format + 4, gcg_table[i].ident + 4) == 0)
3310         break;
3311 
3312     return (i < gcg_table_size);
3313   }
3314   else {
3315     for (i=0; i < file_table_size; i++)
3316       if (mycasecmp(format, file_table[i].ident) == 0)
3317         break;
3318 
3319     return (i < file_table_size);
3320   }
3321 }
3322 
3323 
3324 /*
3325  * seqffmttype
3326  *
3327  * This function returns some type information about a format.
3328  *
3329  * Parameters:      format  -  a file format string
3330  *
3331  * Returns:  a type define value if the string is a file format,
3332  *           zero otherwise.
3333  */
seqffmttype(char * format)3334 int seqffmttype(char *format)
3335 {
3336   int i;
3337 
3338   if (!ctype_initflag)
3339     init_ctype();
3340 
3341   reset_errors();
3342   param_error(format == NULL, return T_INVFORMAT, "seqffmttype",
3343               "arg 1 is NULL");
3344   param_error(format[0] == '\0', return T_INVFORMAT, "seqffmttype",
3345               "arg 1 is an empty string");
3346 
3347   /*
3348    * Test the format string.
3349    */
3350   if (myncasecmp(format, "GCG-", 4) == 0) {
3351     for (i=0; i < gcg_table_size; i++)
3352       if (mycasecmp(format + 4, gcg_table[i].ident + 4) == 0)
3353         break;
3354 
3355     if (i < gcg_table_size)
3356       return file_table[gcg_table[i].format].type;
3357   }
3358   else {
3359     for (i=0; i < file_table_size; i++)
3360       if (mycasecmp(format, file_table[i].ident) == 0)
3361         break;
3362 
3363     if (i < file_table_size)
3364       return file_table[file_table[i].format].type;
3365   }
3366 
3367   set_error(E_INVFORMAT);
3368   return T_INVFORMAT;
3369 }
3370 
3371 
3372 /*
3373  * seqfcanwrite
3374  *
3375  * This function tests whether the format has a putseq function.
3376  *
3377  * Parameters:      format  -  a file format string
3378  *
3379  * Returns:  non-zero if the string is a file format with a putseq
3380              function, zero otherwise.
3381  */
seqfcanwrite(char * format)3382 int seqfcanwrite(char *format)
3383 {
3384   int i;
3385 
3386   if (!ctype_initflag)
3387     init_ctype();
3388 
3389   reset_errors();
3390   param_error(format == NULL, return 0, "seqfcanwrite", "arg 1 is NULL");
3391   param_error(format[0] == '\0', return 0, "seqfcanwrite",
3392               "arg 1 is an empty string");
3393 
3394   /*
3395    * Test the format string.
3396    */
3397   if (myncasecmp(format, "GCG-", 4) == 0) {
3398     for (i=0; i < gcg_table_size; i++)
3399       if (mycasecmp(format + 4, gcg_table[i].ident + 4) == 0)
3400         break;
3401 
3402     if (i < gcg_table_size)
3403       return (file_table[gcg_table[i].format].putseq_fn != NULL);
3404   }
3405   else {
3406     for (i=0; i < file_table_size; i++)
3407       if (mycasecmp(format, file_table[i].ident) == 0)
3408         break;
3409 
3410     if (i < file_table_size)
3411       return (file_table[file_table[i].format].putseq_fn != NULL);
3412   }
3413 
3414   set_error(E_INVFORMAT);
3415   return 0;
3416 }
3417 
3418 
3419 /*
3420  * seqfcanannotate
3421  *
3422  * This function tests whether the format has an annotate function.
3423  *
3424  * Parameters:      format  -  a file format string
3425  *
3426  * Returns:  non-zero if the string is a file format with an annotate
3427              function, zero otherwise.
3428  */
seqfcanannotate(char * format)3429 int seqfcanannotate(char *format)
3430 {
3431   int i;
3432 
3433   if (!ctype_initflag)
3434     init_ctype();
3435 
3436   reset_errors();
3437   param_error(format == NULL, return 0, "seqfcanannotate", "arg 1 is NULL");
3438   param_error(format[0] == '\0', return 0, "seqfcanannotate",
3439               "arg 1 is an empty string");
3440 
3441   /*
3442    * Test the format string.
3443    */
3444   if (myncasecmp(format, "GCG-", 4) == 0) {
3445     for (i=0; i < gcg_table_size; i++)
3446       if (mycasecmp(format + 4, gcg_table[i].ident + 4) == 0)
3447         break;
3448 
3449     if (i < gcg_table_size)
3450       return (file_table[gcg_table[i].format].annotate_fn != NULL);
3451   }
3452   else {
3453     for (i=0; i < file_table_size; i++)
3454       if (mycasecmp(format, file_table[i].ident) == 0)
3455         break;
3456 
3457     if (i < file_table_size)
3458       return (file_table[file_table[i].format].annotate_fn != NULL);
3459   }
3460 
3461   set_error(E_INVFORMAT);
3462   return 0;
3463 }
3464 
3465 
3466 /*
3467  * seqfcanparseent
3468  *
3469  * This function tests whether the format is parsable from the raw entry
3470  * text.
3471  *
3472  * Parameters:      format  -  a file format string
3473  *
3474  * Returns:  non-zero if the string is parseable, zero otherwise.
3475  */
seqfcanparseent(char * format)3476 int seqfcanparseent(char *format)
3477 {
3478   int i;
3479 
3480   if (!ctype_initflag)
3481     init_ctype();
3482 
3483   reset_errors();
3484   param_error(format == NULL, return 0, "seqfcanparseent", "arg 1 is NULL");
3485   param_error(format[0] == '\0', return 0, "seqfcanparseent",
3486               "arg 1 is an empty string");
3487 
3488   /*
3489    * Test the format string.
3490    */
3491   if (myncasecmp(format, "GCG-", 4) == 0) {
3492     for (i=0; i < gcg_table_size; i++)
3493       if (mycasecmp(format + 4, gcg_table[i].ident + 4) == 0)
3494         break;
3495 
3496     if (i < gcg_table_size) {
3497       if (gcg_table[i].format == FORMAT_MSF)
3498         return 0;
3499       else
3500         return 1;
3501     }
3502   }
3503   else {
3504     for (i=0; i < file_table_size; i++)
3505       if (mycasecmp(format, file_table[i].ident) == 0)
3506         break;
3507 
3508     if (i < file_table_size) {
3509       if (i == FORMAT_FOUT || i == FORMAT_PHYLIP || i == FORMAT_PHYSEQ ||
3510           i == FORMAT_PHYINT || i == FORMAT_CLUSTAL || i == FORMAT_MSF)
3511         return 0;
3512       else
3513         return 1;
3514     }
3515   }
3516 
3517   set_error(E_INVFORMAT);
3518   return 0;
3519 }
3520 
3521 
3522 /*
3523  * seqfcangcgify
3524  *
3525  * This function tests whether the seqfgcgify and seqfungcgify can be
3526  * used with entries in the given format.
3527  *
3528  * Parameters:      format  -  a file format string
3529  *
3530  * Returns:  non-zero if the format is gcgify'able, zero otherwise.
3531  */
seqfcangcgify(char * format)3532 int seqfcangcgify(char *format)
3533 {
3534   int i;
3535 
3536   if (!ctype_initflag)
3537     init_ctype();
3538 
3539   reset_errors();
3540   param_error(format == NULL, return 0, "seqfcangcgify", "arg 1 is NULL");
3541   param_error(format[0] == '\0', return 0, "seqfcangcgify",
3542               "arg 1 is an empty string");
3543 
3544   /*
3545    * Test the format string.
3546    */
3547   if (myncasecmp(format, "GCG-", 4) == 0)
3548     format += 4;
3549 
3550   for (i=0; i < gcg_table_size; i++)
3551     if (mycasecmp(format, gcg_table[i].ident + 4) == 0)
3552       break;
3553 
3554   if (i == gcg_table_size || gcg_table[i].format == FORMAT_MSF)
3555     return 0;
3556   else
3557     return 1;
3558 }
3559 
3560 
3561 /*
3562  * seqfbytepos
3563  *
3564  * This function returns the byte position of the current entry when reading.
3565  *
3566  * Parameters:      sfp  -  An open SEQFILE structure
3567  *
3568  * Returns:  The byte position, or a -1 on an error or if there is no
3569  *           current entry.
3570  */
seqfbytepos(SEQFILE * sfp)3571 int seqfbytepos(SEQFILE *sfp)
3572 {
3573   INTSEQFILE *isfp = (INTSEQFILE *) sfp;
3574 
3575   if (!ctype_initflag)
3576     init_ctype();
3577 
3578   reset_errors();
3579   param_error(isfp == NULL, return -1, "seqfbytepos", "arg 1 is NULL");
3580   param_error(isfp->opstatus == OP_FREED, return -1, "seqfbytepos",
3581               "arg 1 is not an open SEQFILE");
3582   param_error(isfp->optype == OP_WRITE, return -1, "seqfbytepos",
3583               "arg 1 is not open for reading");
3584 
3585   preverror_test(isfp->opstatus == OP_ERROR, return -1);
3586   preverror_test(isfp->opstatus == OP_TEMPERR, return -1);
3587   eof_test(isfp->opstatus == OP_EOF, return -1);
3588 
3589   if (isfp->fp_entrystart == NULL)
3590     return -1;
3591   else
3592     return isfp->fp_bytepos + (isfp->fp_entrystart - isfp->fp_buffer);
3593 }
3594 
3595 
3596 /*
3597  * seqfisafile
3598  *
3599  * This function checks an input string to see if it's an existing file.
3600  * This function is needed because a valid filename may contain a '@'
3601  * suffix specifying the entries in the file to get.  Thus, the file check
3602  * has to strip off that suffix before calling stat.
3603  *
3604  * Parameters:  file  -  A filename
3605  *
3606  * Returns:  Non-zero if the file specifies a valid and existing file,
3607  *           zero otherwise.
3608  */
seqfisafile(char * file)3609 int seqfisafile(char *file)
3610 {
3611   char *s;
3612 
3613   if (!ctype_initflag)
3614     init_ctype();
3615 
3616   reset_errors();
3617   param_error(file == NULL, return 0, "seqfisafile", "arg 1 is NULL");
3618   for (s=file; *s && *s != '@'; s++) ;
3619 
3620   return isa_file(get_truename(file, s));
3621 }
3622 
3623 
3624 
3625 
3626 /*
3627  *
3628  *
3629  * Internal procedures to do some of the grunge work of the interface
3630  * procedures.
3631  *
3632  *    intseqf_open, intseqf_open_for_writing, determine_format,
3633  *    intseqf_close, intseqf_read, intseqf_info
3634  *
3635  *
3636  *
3637  */
3638 
3639 /*
3640  * intseqf_open
3641  *
3642  * Opens the specified file for reading, determines the file format (if
3643  * not specified), sets the appropriate INTSEQFILE fields and reads the
3644  * first entry.
3645  *
3646  * If the filename parameter is the string "-", then standard input
3647  * is used.
3648  *
3649  * If the format parameter is NULL, the file format is determined from
3650  * the first line of the file that begins with a non-whitespace character.
3651  *
3652  * Returns STATUS_ERROR when the file can't be opened or read, but
3653  * returns STATUS_FATAL on a malloc error or if "format" isn't valid.
3654  *
3655  * Parameters:      isfp      -  an INTSEQFILE structure
3656  *                  filename  -  the name of the file to open for reading
3657  *                  format    -  the file format to use (may be NULL)
3658  *
3659  * Returns:  a STATUS value
3660  */
intseqf_open(INTSEQFILE * isfp,char * filename,char * format)3661 static int intseqf_open(INTSEQFILE *isfp, char *filename, char *format)
3662 {
3663   int id, status, subid, offset;
3664   char *s, *offset_string;
3665 
3666   id = subid = FORMAT_UNKNOWN;
3667 
3668   /*
3669    * Figure out the format, if given.
3670    */
3671   if (format != NULL) {
3672     if (myncasecmp(format, "GCG-", 4) == 0) {
3673       for (id=0; id < gcg_table_size; id++)
3674         if (mycasecmp(format + 4, gcg_table[id].ident + 4) == 0)
3675           break;
3676 
3677       error_test(id == gcg_table_size, E_INVFORMAT, return STATUS_FATAL,
3678                  print_error("Read Error:  `%s' is not a valid file format.\n",
3679                              format));
3680 
3681       subid = gcg_table[id].format;
3682       id = (subid == FORMAT_MSF ? FORMAT_MSF : FORMAT_GCG);
3683     }
3684     else {
3685       for (id=0; id < file_table_size; id++)
3686         if (mycasecmp(format, file_table[id].ident) == 0)
3687           break;
3688 
3689       error_test(id == file_table_size, E_INVFORMAT, return STATUS_FATAL,
3690                  print_error("Read Error:  `%s' is not a valid file format.\n",
3691                              format));
3692 
3693       subid = FORMAT_UNKNOWN;
3694       id = file_table[id].format;
3695     }
3696   }
3697 
3698   /*
3699    * Open the specified file or standard input, check for random access
3700    * mode and make a local copy of the filename.
3701    */
3702   if (isfp->filename != NULL)
3703     free(isfp->filename);
3704 
3705   if (filename[0] == '-' && filename[1] == '\0') {
3706     status = open_raw_stdin(&isfp->input_fd);
3707     error_test(status != STATUS_OK, E_OPENFAILED, return STATUS_ERROR,
3708                print_error("Read Error:  Standard input is not available.\n"));
3709 
3710     isfp->filename = mystrdup("(stdin)");
3711     memory_error(isfp->filename == NULL, return STATUS_FATAL);
3712 
3713     isfp->openflag = 0;
3714     isfp->randaccessflag = 0;
3715     offset_string = NULL;
3716   }
3717   else {
3718     for (s=filename; *s && *s != '@'; s++) ;
3719     isfp->randaccessflag = (*s == '@');
3720     offset_string = (*s == '@' ? s + 1 : NULL);
3721 
3722     isfp->filename = mystrdup2(filename, s);
3723     memory_error(isfp->filename == NULL, return STATUS_FATAL);
3724 
3725     status = open_raw_file(get_truename(isfp->filename, NULL),
3726                            &isfp->input_fd);
3727     error_test(status != STATUS_OK, E_OPENFAILED, return STATUS_ERROR,
3728                print_error("%s:  %s\n", filename, sys_errlist[errno]));
3729     isfp->openflag = 1;
3730 
3731 #ifdef ISMAPABLE
3732     if (!isfp->randaccessflag) {
3733       caddr_t addr;
3734 
3735       isfp->filesize = get_filesize(get_truename(isfp->filename, NULL));
3736       isfp->filepos = 0;
3737       isfp->mapsize = (isfp->filesize < MAXMAPSIZE
3738                          ? isfp->filesize : MAXMAPSIZE);
3739       addr = mmap(0, isfp->mapsize, PROT_READ, MAP_SHARED, isfp->input_fd, 0);
3740       if (addr != (caddr_t) -1) {
3741         if (isfp->fp_buffer != NULL)
3742           free(isfp->fp_buffer);
3743         isfp->fp_buffer = addr;
3744         isfp->fp_bufsize = isfp->mapsize;
3745         isfp->ismapped = 1;
3746       }
3747     }
3748 #endif
3749   }
3750   isfp->entry_count = 0;
3751 
3752   /*
3753    * Allocate the space for the internal file buffer and
3754    * initialize the entry markers, if necessary.
3755    */
3756   if (isfp->ismapped) {
3757     isfp->fp_current = isfp->fp_buffer;
3758     isfp->fp_top = isfp->fp_buffer + isfp->fp_bufsize;
3759     isfp->isendtagged = 0;
3760   }
3761   else {
3762     if (isfp->fp_buffer == NULL) {
3763       isfp->fp_buffer = (char *) malloc(INIT_BUFSIZE);
3764       memory_error(isfp->fp_buffer == NULL, return STATUS_FATAL);
3765       isfp->fp_bufsize = INIT_BUFSIZE;
3766     }
3767     isfp->fp_current = isfp->fp_top = isfp->fp_buffer;
3768     isfp->fp_buffer[0] = '\n';
3769     isfp->isendtagged = 1;
3770   }
3771   isfp->fp_bytepos = 0;
3772 
3773   isfp->fp_entrystart = isfp->fp_entryend = isfp->fp_seqstart = NULL;
3774 
3775   /*
3776    * Set the file format for the sequence file, either by the passed
3777    * in argument or by looking at the first line of the file beginning
3778    * with a non-space character.
3779    */
3780   if (format != NULL) {
3781     isfp->format = id;
3782     isfp->gcg_subformat = subid;
3783   }
3784   else {
3785     isfp->autodetermined = 1;
3786     status = determine_format(isfp);
3787     if (status != STATUS_OK && status != STATUS_WARNING)
3788       return status;
3789   }
3790 
3791   /*
3792    * If the filename specifies the entries to read, then scan that
3793    * list to resolve the entries.
3794    */
3795   if (isfp->randaccessflag) {
3796     if (isfp->format == FORMAT_ASN || isfp->format == FORMAT_CLUSTAL ||
3797         isfp->format == FORMAT_FOUT || isfp->format == FORMAT_BOUT) {
3798       raise_error(E_FILEERROR, return STATUS_ERROR,
3799                   print_error("%s:  Cannot access single entries of %s "
3800                               "formatted files.\n", filename,
3801                               file_table[isfp->format].ident));
3802     }
3803 
3804     status = resolve_offsets(isfp, filename, offset_string);
3805     if (status != STATUS_OK && status != STATUS_WARNING)
3806       return status;
3807 
3808     /*
3809      * Seek to the first specified entry, and reset all of the
3810      * appropriate fields.
3811      */
3812     isfp->currentoffset = 0;
3813     offset = isfp->byteoffsets[isfp->currentoffset++];
3814     status = seek_raw_file(isfp->input_fd, offset);
3815     error_test(status != STATUS_OK, E_READFAILED, return -1,
3816                print_error("%s:  %s\n", isfp->filename, sys_errlist[errno]));
3817 
3818     isfp->fp_bytepos = offset;
3819     isfp->entry_count = 0;
3820 
3821     isfp->fp_current = isfp->fp_top = isfp->fp_buffer;
3822     isfp->fp_buffer[0] = '\n';
3823     isfp->isendtagged = 1;
3824 
3825     isfp->savech_loc = NULL;
3826     isfp->savech = '\0';
3827   }
3828 
3829   isfp->initreadflag = 1;
3830   return intseqf_read(isfp, 0);
3831 }
3832 
3833 
3834 /*
3835  * intseqf_open_for_writing
3836  *
3837  * Opens the specified file for writing, and sets the appropriate
3838  * INTSEQFILE fields.
3839  *
3840  * If the filename parameter is the string "-", then standard output
3841  * is used.
3842  *
3843  * Parameters:      isfp      -  an INTSEQFILE structure
3844  *                  filename  -  the name of the file to open for reading
3845  *                  format    -  the file format to use
3846  *                  mode      -  whether to write or append
3847  *
3848  * Returns:  a STATUS value
3849  */
intseqf_open_for_writing(INTSEQFILE * isfp,char * filename,char * format,char * mode)3850 static int intseqf_open_for_writing(INTSEQFILE *isfp, char *filename,
3851                                     char *format, char *mode)
3852 {
3853   int id, status, subid;
3854 
3855   /*
3856    * Figure out the format.
3857    */
3858   if (myncasecmp(format, "GCG-", 4) == 0) {
3859     for (id=0; id < gcg_table_size; id++)
3860       if (mycasecmp(format + 4, gcg_table[id].ident + 4) == 0)
3861         break;
3862 
3863     error_test(id == gcg_table_size, E_INVFORMAT, return STATUS_FATAL,
3864                print_error("Write Error:  `%s' is not a valid file format.\n",
3865                            format));
3866 
3867     subid = gcg_table[id].format;
3868     id = (subid == FORMAT_MSF ? FORMAT_MSF : FORMAT_GCG);
3869   }
3870   else {
3871     for (id=0; id < file_table_size; id++)
3872       if (mycasecmp(format, file_table[id].ident) == 0)
3873         break;
3874 
3875     error_test(id == file_table_size, E_INVFORMAT, return STATUS_FATAL,
3876                print_error("Write Error:  `%s' is not a valid file format.\n",
3877                            format));
3878 
3879     subid = FORMAT_UNKNOWN;
3880     id = file_table[id].format;
3881   }
3882 
3883   error_test(file_table[id].putseq_fn == NULL,
3884              E_INVFORMAT, return STATUS_FATAL,
3885              print_error("Write Error:  Format `%s' is a read-only format.\n",
3886                          format));
3887 
3888   /*
3889    * Open the specified file or standard output.
3890    */
3891   if (filename[0] == '-' && filename[1] == '\0') {
3892     status = open_stdout(&isfp->output_fp);
3893     error_test(status != STATUS_OK, E_OPENFAILED, return STATUS_ERROR,
3894                print_error("Write Error:  Standard output is not "
3895                            "available.\n"));
3896     filename = "(stdout)";
3897     isfp->openflag = 0;
3898   }
3899   else {
3900     isfp->output_fp = fopen(get_truename(filename, NULL), mode);
3901     error_test(isfp->output_fp == NULL, E_OPENFAILED, return STATUS_ERROR,
3902                print_error("%s:  %s\n", filename, sys_errlist[errno]));
3903     isfp->openflag = 1;
3904   }
3905 
3906   isfp->entry_count = 0;
3907   isfp->format = id;
3908   isfp->gcg_subformat = subid;
3909 
3910   /*
3911    * Make a local copy of the filename.
3912    */
3913   if (isfp->filename != NULL)
3914     free(isfp->filename);
3915 
3916   isfp->filename = mystrdup(filename);
3917   memory_error(isfp->filename == NULL, return STATUS_FATAL);
3918 
3919   return STATUS_OK;
3920 }
3921 
3922 
3923 /*
3924  * determine_format
3925  *
3926  * Look for the first line in the file which does not begin with a
3927  * space, tab or newline character.  Check the beginning of that line
3928  * against the determinants listed in the file table (where a '?' in
3929  * a determinant is a wildcard match to any character in the line).
3930  * If a match occurs, set the file format.  Otherwise, return an error.
3931  *
3932  * This procedure will also skip over an e-mail header, if it exists, before
3933  * determining the file format.
3934  *
3935  * Parameters:  isfp  -  an INTSEQFILE structure
3936  *
3937  * Returns: a STATUS value
3938  */
determine_format(INTSEQFILE * isfp)3939 static int determine_format(INTSEQFILE *isfp)
3940 {
3941   int i, status;
3942   char *s, *line, *start, *end, *text, *det;
3943 
3944   start = isfp->fp_current;
3945 
3946   /*
3947    * Skip the e-mail header if it exists.
3948    */
3949   status = fp_get_line(isfp, &line, &end);
3950   if (status == STATUS_OK && strncmp(line, "From ", 5) == 0) {
3951     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
3952       if (line == end)
3953         break;
3954   }
3955 
3956   /*
3957    * Skip any blank lines (or lines of whitespace).
3958    */
3959   s = NULL;
3960   while (status == STATUS_OK) {
3961     for (s=line; s < end && isspace(*s); s++) ;
3962     if (s < end)
3963       break;
3964 
3965     status = fp_get_line(isfp, &line, &end);
3966   }
3967 
3968   /*
3969    * Test to see if we've hit an error or reached the end of the file.
3970    */
3971   switch (status) {
3972   case STATUS_OK:     break;
3973   case STATUS_EOF:    raise_error(E_DETFAILED, return STATUS_ERROR,
3974                                   print_error("%s:  File contains no "
3975                                               "sequence entries.\n",
3976                                               isfp->filename));
3977   case STATUS_ERROR:  return STATUS_ERROR;
3978   case STATUS_FATAL:  return STATUS_FATAL;
3979   default:            status_error(return STATUS_ERROR, "determine_format");
3980   }
3981 
3982   /*
3983    * Search through the list of determinants to find one which
3984    * matches the beginning of the non-whitespace characters.
3985    * If none match, then assume the file contains plaintext.
3986    */
3987   text = s;
3988   for (i=0; i < file_table_size; i++) {
3989     det = file_table[i].determinant;
3990     if (det == NULL)
3991       continue;
3992 
3993     while (*det != '\0') {
3994       /*
3995        * Try to match the determinant, where '?' in the determinant
3996        * matches any character and '|' divides alternative determinants.
3997        */
3998       for (s=text; *s && *det && *det != '|'; s++,det++)
3999         if (*det != '?' && toupper(*s) != toupper(*det))
4000           break;
4001 
4002       /*
4003        * If a match did occur, then set the file format accordingly and
4004        * set the current file pointer to point to the beginning of the line.
4005        *
4006        * We have to special case the EMBL/Swiss-Prot determination here
4007        * because it depends on the structure of the first line.
4008        */
4009       if (!*det || *det == '|') {
4010         isfp->format = file_table[i].format;
4011         isfp->fp_current = line;
4012         return STATUS_OK;
4013       }
4014       else {
4015         while (*det && *det != '|')
4016           det++;
4017         if (*det)
4018           det++;
4019       }
4020     }
4021   }
4022 
4023   /*
4024    * Otherwise, if the line matches none of the determinants,
4025    * assume that the file contains only the sequence, and
4026    * reset the current file pointer to the beginning of the file.
4027    */
4028   set_error(E_DETFAILED);
4029   isfp->format = FORMAT_PLAIN;
4030   isfp->fp_current = start;
4031 
4032   return STATUS_WARNING;
4033 }
4034 
4035 
4036 /*
4037  * resolve_offsets
4038  *
4039  * When a filename is specified in "random access" mode, i.e. with one or
4040  * more specific entries or byte offsets, setup the INTSEQFILE structure
4041  * to read in random access mode and resolve all of the entries/offsets
4042  * specified to byte offsets (this may require reading an initial portion
4043  * of the file in order to convert an identifier or entry number to a
4044  * byte offset).
4045  *
4046  * Parameters:  isfp      -  An INTSEQFILE structure being opened
4047  *              filename  -  The complete filename (with entry/offset specs)
4048  *              offsets   -  The part of `filename' with the entries/offsets
4049  *
4050  * Returns:  a STATUS value.
4051  */
resolve_offsets(INTSEQFILE * isfp,char * filename,char * offset_string)4052 static int resolve_offsets(INTSEQFILE *isfp, char *filename,
4053                            char *offset_string)
4054 {
4055   int i, size, offset, count, scan_mode, oldpe, minentry, mincell;
4056   int status, flag, isacolon, match, len;
4057   char *s, *t, *s2, *t2, *t3, *idlist, buffer[1024];
4058   SEQFILE *sfp = (SEQFILE *) isfp;
4059 
4060   size = get_filesize(get_truename(isfp->filename, NULL));
4061 
4062   /*
4063    * Make an initial count of the number of offsets to find, and malloc
4064    * the array that will hold the offsets.
4065    */
4066   for (s=offset_string,count=1; *s; s++)
4067     if (*s == ',')
4068       count++;
4069 
4070   isfp->byteoffsets = (int *) malloc(count * sizeof(int));
4071   memory_error(isfp->byteoffsets == NULL, return STATUS_FATAL);
4072 
4073   /*
4074    * Make a more complete parse of the offset string, converting any
4075    * byte offsets found (of the form "#%d"), marking any identifiers
4076    * found using a -1 in the offsets array, and semi-converting any
4077    * entry position numbers found (i.e., if the 5th entry is specified,
4078    * put  -5 - 1 = -6 in the byte array, so that the negative values
4079    * from -2 downward specify entry numbers from 1 upward).
4080    *
4081    * Also, set the `scan_mode' flag if any entry numbers or identifiers
4082    * are found.  That will mean that we will have to read the file
4083    * to convert them into byte offsets.
4084    */
4085   count = 0;
4086   scan_mode = 0;
4087   for (s=offset_string; *s; ) {
4088     error_test(*s == ',', E_FILEERROR, return STATUS_ERROR,
4089                print_error("%s:  Parse error in filename at `%.10s'.\n",
4090                            filename, s));
4091 
4092     if (*s == '#') {
4093       for (t=++s; *s && *s != ','; s++) {
4094         error_test(!isdigit(*s), E_FILEERROR, return STATUS_ERROR,
4095                    print_error("%s:  Parse error in filename at `%.10s'.\n",
4096                                filename, t));
4097       }
4098       error_test(t == s, E_FILEERROR, return STATUS_ERROR,
4099                  print_error("%s:  Parse error at end of filename.\n",
4100                              filename, t));
4101 
4102       offset = myatoi(t, 10, '0');
4103       if (offset < 0) {
4104         memcpy(buffer, t-1, s - t + 1);
4105         buffer[s - t + 1] = '\0';
4106         raise_error(E_FILEERROR, return STATUS_ERROR,
4107                     print_error("%s:  Invalid byte offset `%s'.\n", filename,
4108                                 buffer));
4109       }
4110       if (offset >= size) {
4111         memcpy(buffer, t-1, s - t + 1);
4112         buffer[s - t + 1] = '\0';
4113         raise_error(E_FILEERROR, return STATUS_ERROR,
4114                     print_error("%s:  Byte offset `%s' larger than file "
4115                                 "size.\n", filename, buffer));
4116       }
4117 
4118       isfp->byteoffsets[count++] = offset;
4119     }
4120     else if (isdigit(*s)) {
4121       for (t=s; *s && *s != ','; s++) {
4122         error_test(!isdigit(*s), E_FILEERROR, return STATUS_ERROR,
4123                    print_error("%s:  Parse error in filename at `%.10s'.\n",
4124                                filename, t));
4125       }
4126 
4127       offset = myatoi(t, 10, '0');
4128       if (offset <= 0) {
4129         memcpy(buffer, t-1, s - t + 1);
4130         buffer[s - t + 1] = '\0';
4131         raise_error(E_FILEERROR, return STATUS_ERROR,
4132                     print_error("%s:  Invalid entry number `%s'.\n", filename,
4133                                 buffer));
4134       }
4135 
4136       isfp->byteoffsets[count++] = - offset - 1;
4137       if (!scan_mode)
4138         scan_mode = 1;
4139     }
4140     else {
4141       while (*s && *s != ',') s++;
4142       isfp->byteoffsets[count++] = -1;
4143       scan_mode = 2;
4144     }
4145 
4146     if (*s) s++;
4147   }
4148   error_test(count == 0, E_FILEERROR, return STATUS_ERROR,
4149              print_error("%s:  Invalid entry specifiers:  `%s'.\n",
4150                          filename, offset_string-1));
4151 
4152   isfp->num_offsets = count;
4153 
4154   if (!scan_mode)
4155     return STATUS_OK;
4156 
4157   /*
4158    * Start reading the file, and trying to convert the entry numbers
4159    * and entry identifiers into byte offsets.
4160    */
4161   oldpe = pe_flag;
4162   pe_flag = PE_NONE;
4163 
4164   /*
4165    * Use minentry and mincell to always keep track of the smallest
4166    * entry number that has not been resolved.
4167    */
4168   minentry = -1;
4169   mincell = 0;
4170   for (i=0,count=0; i < isfp->num_offsets; i++) {
4171     if (isfp->byteoffsets[i] < 0)
4172       count++;
4173     if (isfp->byteoffsets[i] <= -2 &&
4174         (minentry == -1 || minentry > - (isfp->byteoffsets[i] + 1))) {
4175       minentry = - (isfp->byteoffsets[i] + 1);
4176       mincell = i;
4177     }
4178   }
4179 
4180   /*
4181    * The main reading loop.
4182    */
4183   status = STATUS_OK;
4184   while (count > 0) {
4185     status = intseqf_read(isfp, (scan_mode == 1 ? 0 : 1));
4186     if (status != STATUS_OK && status != STATUS_WARNING)
4187       break;
4188 
4189     while (count && minentry != -1 && isfp->entry_count == minentry) {
4190       isfp->byteoffsets[mincell] = seqfbytepos(sfp);
4191       count--;
4192       minentry = -1;
4193       for (i=0; i < isfp->num_offsets; i++) {
4194         if (isfp->byteoffsets[i] <= -2 &&
4195             (minentry == -1 || minentry >= - (isfp->byteoffsets[i] + 1))) {
4196           minentry = - (isfp->byteoffsets[i] + 1);
4197           mincell = i;
4198         }
4199       }
4200     }
4201 
4202     /*
4203      * Check all of the identifiers that can be found in the entry to
4204      * to see if they match any of the identifier that have yet to be
4205      * resolved.  If an outstanding identifier does not have an
4206      * identifier prefix, then check it against all of the identifiers
4207      * in each read-in entry (regardless of their identifier prefix).
4208      */
4209     if (scan_mode == 2) {
4210       idlist = seqfidlist(sfp, 0);
4211       flag = 0;
4212       for (i=0,s=offset_string; count > 0 && i < isfp->num_offsets; i++,s++) {
4213         isacolon = 0;
4214         for (t=s; *s && *s != ','; s++)
4215           if (!isacolon && *s == ':')
4216             isacolon = 1;
4217 
4218         len = s - t;
4219         if (isfp->byteoffsets[i] == -1) {
4220           flag++;
4221 
4222           match = 0;
4223           if (idlist != NULL) {
4224             for (s2=idlist; *s2; ) {
4225               for (t2=s2; *s2 && *s2 != '|'; s2++) ;
4226               if (!isacolon) {
4227                 for (t3=t2; t2 < s2 && *t2 != ':'; t2++) ;
4228                 if (t2 + 1 < s2)
4229                   t2++;
4230                 else
4231                   t2 = t3;
4232               }
4233 
4234               if (len == s2 - t2 && myncasecmp(t2, t, len) == 0) {
4235                 match = 1;
4236                 break;
4237               }
4238 
4239               if (*s2) s2++;
4240             }
4241           }
4242 
4243           if (match) {
4244             isfp->byteoffsets[i] = seqfbytepos(sfp);
4245             count--;
4246             flag--;
4247           }
4248         }
4249       }
4250 
4251       if (flag == 0)
4252         scan_mode = 1;
4253     }
4254   }
4255   pe_flag = oldpe;
4256 
4257   /*
4258    * Check for unresolved entry numbers or identifiers.
4259    */
4260   if (count > 0) {
4261     if (status == STATUS_ERROR || status == STATUS_FATAL) {
4262       print_error("%s", seqferrstr);
4263       return status;
4264     }
4265 
4266     error_test(minentry > -1, E_FILEERROR, return STATUS_ERROR,
4267                print_error("%s:  File only contains %d entries, not %d.\n",
4268                            filename, isfp->entry_count, minentry));
4269 
4270     t = NULL;
4271     for (i=0,s=offset_string; i < isfp->num_offsets; i++,s++) {
4272       for (t=s; *s && *s != ','; s++) ;
4273       if (isfp->byteoffsets[i] < 0)
4274         break;
4275     }
4276     error_test(i == isfp->num_offsets, E_FILEERROR, return STATUS_ERROR,
4277                print_error("%s:  Unable to resolve entry names/offsets in "
4278                            "filename.\n", filename));
4279 
4280     memcpy(buffer, t, s - t);
4281     buffer[s - t] = '\0';
4282     raise_error(E_FILEERROR, return STATUS_ERROR,
4283                 print_error("%s:  Unable to resolve entry for `%s'.\n",
4284                             filename, buffer));
4285   }
4286 
4287   return status;
4288 }
4289 
4290 
4291 /*
4292  * intseqf_close
4293  *
4294  * Performs the actual closing of the file.  Used both by the interface
4295  * procedure seqfclose and the file/database opening procedures when an
4296  * error occurs (so not all fields of the structure are guaranteed to
4297  * be allocated).
4298  *
4299  * Parameters:  isfp  - an INTSEQFILE structure
4300  *
4301  * Returns:  nothing
4302  */
intseqf_close(INTSEQFILE * isfp)4303 static void intseqf_close(INTSEQFILE *isfp)
4304 {
4305   int i;
4306 
4307   if (isfp->optype != OP_WRITE && isfp->openflag) {
4308 
4309 #ifdef ISMAPABLE
4310     if (isfp->ismapped) {
4311       munmap(isfp->fp_buffer, isfp->mapsize);
4312       isfp->fp_buffer = NULL;
4313     }
4314 #endif
4315 
4316     close_raw_file(isfp->input_fd);
4317     isfp->input_fd = 0;
4318     isfp->openflag = 0;
4319   }
4320 
4321   if (isfp->optype == OP_WRITE) {
4322     if (isfp->opstatus == OP_ACTIVE) {
4323       if (isfp->format == FORMAT_ASN)
4324         asn_putseqend(isfp);
4325       else if (isfp->format == FORMAT_PHYSEQ)
4326         physeq_putseqend(isfp);
4327       else if (isfp->format == FORMAT_PHYINT || isfp->format == FORMAT_PHYLIP)
4328         phyint_putseqend(isfp);
4329       else if (isfp->format == FORMAT_CLUSTAL)
4330         clustal_putseqend(isfp);
4331       else if (isfp->format == FORMAT_MSF)
4332         msf_putseqend(isfp);
4333     }
4334 
4335     if (isfp->openflag)
4336       fclose(isfp->output_fp);
4337     isfp->output_fp = NULL;
4338     isfp->openflag = 0;
4339   }
4340 
4341   if (isfp->filename != NULL) {
4342     free(isfp->filename);
4343     isfp->filename = NULL;
4344   }
4345   if (isfp->fp_buffer != NULL) {
4346     free(isfp->fp_buffer);
4347     isfp->fp_buffer = NULL;
4348   }
4349   if (isfp->seq != NULL) {
4350     free(isfp->seq);
4351     isfp->seq = NULL;
4352   }
4353   if (isfp->info != NULL) {
4354     free(isfp->info);
4355     isfp->info = NULL;
4356   }
4357   if (isfp->idbuffer != NULL) {
4358     free(isfp->idbuffer);
4359     isfp->idbuffer = NULL;
4360   }
4361   if (isfp->db_files != NULL) {
4362     free(isfp->db_files);
4363     isfp->db_files = NULL;
4364   }
4365   if (isfp->db_spec != NULL) {
4366     free(isfp->db_spec);
4367     isfp->db_spec = NULL;
4368   }
4369   if (isfp->db_name != NULL) {
4370     free(isfp->db_name);
4371     isfp->db_name = NULL;
4372   }
4373   if (isfp->db_format != NULL) {
4374     free(isfp->db_format);
4375     isfp->db_format = NULL;
4376   }
4377   if (isfp->db_alpha != NULL) {
4378     free(isfp->db_alpha);
4379     isfp->db_alpha = NULL;
4380   }
4381   if (isfp->db_idprefix != NULL) {
4382     free(isfp->db_idprefix);
4383     isfp->db_idprefix = NULL;
4384   }
4385   if (isfp->fout_id1 != NULL) {
4386     free(isfp->fout_id1);
4387     isfp->fout_id1 = NULL;
4388   }
4389   if (isfp->fout_descr1 != NULL) {
4390     free(isfp->fout_descr1);
4391     isfp->fout_descr1 = NULL;
4392   }
4393   if (isfp->fout_id2 != NULL) {
4394     free(isfp->fout_id2);
4395     isfp->fout_id2 = NULL;
4396   }
4397   if (isfp->fout_descr2 != NULL) {
4398     free(isfp->fout_descr2);
4399     isfp->fout_descr2 = NULL;
4400   }
4401   if (isfp->malign_size > 0) {
4402     if (isfp->malign_seqs != NULL) {
4403       for (i=0; i < isfp->malign_count; i++)
4404         if (isfp->malign_seqs[i] != NULL)
4405           free(isfp->malign_seqs[i]);
4406       free(isfp->malign_seqs);
4407       isfp->malign_seqs = NULL;
4408     }
4409     if (isfp->malign_ids != NULL) {
4410       for (i=0; i < isfp->malign_count; i++)
4411         if (isfp->malign_ids[i] != NULL)
4412           free(isfp->malign_ids[i]);
4413       free(isfp->malign_ids);
4414       isfp->malign_ids = NULL;
4415     }
4416     if (isfp->malign_seqlens != NULL) {
4417       free(isfp->malign_seqlens);
4418       isfp->malign_seqlens = NULL;
4419     }
4420     isfp->malign_count = isfp->malign_size = 0;
4421   }
4422   if (isfp->byteoffsets != NULL) {
4423     free(isfp->byteoffsets);
4424     isfp->byteoffsets = NULL;
4425   }
4426   isfp->opstatus = OP_FREED;
4427   free(isfp);
4428 }
4429 
4430 
4431 /*
4432  * intseqf_read
4433  *
4434  * Read the next entry or sequence in the file.
4435  *
4436  * Parameters:  isfp  -  An INTSEQFILE structure open for reading
4437  *              flag  -  Read the next sequence or entry.
4438  *
4439  * Returns:  a status value
4440  */
intseqf_read(INTSEQFILE * isfp,int flag)4441 static int intseqf_read(INTSEQFILE *isfp, int flag)
4442 {
4443   /*
4444    * If a character was replaced with a '\0' to NULL terminate an entry,
4445    * restore the original character.
4446    */
4447   if (isfp->savech_loc != NULL) {
4448     *isfp->savech_loc = isfp->savech;
4449     isfp->savech_loc = NULL;
4450   }
4451 
4452   /*
4453    * Reset all of the flags marking valid information about the current
4454    * sequence to false.
4455    */
4456   isfp->mapentflag = isfp->isseqcurrent = isfp->rawseqflag = 0;
4457   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
4458   if (isfp->istatus != INFO_NONE) {
4459     isfp->istatus = INFO_NONE;
4460     isfp->iflag_date = isfp->iflag_idlist = isfp->iflag_description = 0;
4461     isfp->iflag_comment = isfp->iflag_organism = isfp->iflag_fragment = 0;
4462     isfp->iflag_circular = isfp->iflag_alphabet = isfp->iflag_fragstart = 0;
4463     if (isfp->info != NULL) {
4464       memset(isfp->info, 0, sizeof(SEQINFO));
4465       isfp->infosize = sizeof(SEQINFO);
4466     }
4467   }
4468 
4469   /*
4470    * Call the "read_fn" function to read the next sequence.
4471    */
4472   return (*file_table[isfp->format].read_fn)(isfp, flag);
4473 }
4474 
4475 
4476 /*
4477  * intseqf_info
4478  *
4479  * The internal procedure to get either one piece of or all of the current
4480  * entry's information.  This procedure also handles the buffering of
4481  * that information (i.e., whether it's stored in the SEQFILE's info
4482  * structure or if a copy is made).
4483  *
4484  * Parameters:  isfp       - an open INTSEQFILE structure
4485  *              newbuffer  - flag telling whether to make a copy of the info
4486  *              flag       - flag telling what information to retrieve
4487  *
4488  * Returns:  a pointer to the information, or NULL on an error
4489  *                                              (seqferrno set on error)
4490  *           (Special return values:
4491  *              1) When computing all of the information from an entry,
4492  *                 the pointer to the SEQINFO structure is cast and returned.
4493  *              2) When computing the `iscircular', `isfragment' or
4494  *                 `alphabet' information, the return value only signals
4495  *                 whether the information has been stored in the SEQINFO
4496  *                 structure.)
4497  */
intseqf_info(INTSEQFILE * isfp,int newbuffer,int flag)4498 static char *intseqf_info(INTSEQFILE *isfp, int newbuffer, int flag)
4499 {
4500   int status, exists, len;
4501   char *s, *buf;
4502   SEQINFO *newinfo;
4503 
4504   if ((isfp->istatus == INFO_ALL && flag != SEQINFO_ALLINFO) ||
4505       (isfp->istatus == INFO_ALLINFO &&
4506        flag != SEQINFO_ALL && flag != SEQINFO_COMMENT) ||
4507       (isfp->istatus == INFO_ANY &&
4508        ((flag == SEQINFO_DATE && isfp->iflag_date) ||
4509         (flag == SEQINFO_IDLIST && isfp->iflag_idlist) ||
4510         (flag == SEQINFO_DESCRIPTION && isfp->iflag_description) ||
4511         (flag == SEQINFO_COMMENT && isfp->iflag_comment) ||
4512         (flag == SEQINFO_ORGANISM && isfp->iflag_organism) ||
4513         (flag == SEQINFO_FRAGMENT && isfp->iflag_fragment) ||
4514         (flag == SEQINFO_CIRCULAR && isfp->iflag_circular) ||
4515         (flag == SEQINFO_ALPHABET && isfp->iflag_alphabet) ||
4516         (flag == SEQINFO_STARTPOS && isfp->iflag_fragstart) ||
4517         (flag == SEQINFO_TRUELEN && isfp->iflag_truelen) ||
4518         (flag == SEQINFO_RAWLEN && isfp->iflag_rawlen))))
4519     exists = 1;
4520   else
4521     exists = 0;
4522 
4523   if (!exists) {
4524     if (isfp->info == NULL) {
4525       isfp->info = (SEQINFO *) malloc(sizeof(SEQINFO));
4526       memory_error(isfp->info == NULL, return NULL);
4527       isfp->infosize = isfp->infobufsize = sizeof(SEQINFO);
4528       memset(isfp->info, 0, isfp->infosize);
4529     }
4530 
4531     if (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO) {
4532       memset(isfp->info, 0, sizeof(SEQINFO));
4533       isfp->infosize = sizeof(SEQINFO);
4534       isfp->istatus = INFO_NONE;
4535       isfp->iflag_date = isfp->iflag_idlist = isfp->iflag_description = 0;
4536       isfp->iflag_comment = isfp->iflag_organism = isfp->iflag_fragment = 0;
4537       isfp->iflag_circular = isfp->iflag_alphabet = isfp->iflag_fragstart = 0;
4538     }
4539     else {
4540       if (isfp->istatus == INFO_ALLINFO) {
4541         isfp->info->comment = NULL;
4542         isfp->iflag_comment = 0;
4543         isfp->istatus = INFO_ANY;
4544       }
4545 
4546       if (flag == SEQINFO_IDLIST) {
4547         isfp->info->idlist = NULL;
4548         isfp->iflag_idlist = 0;
4549         isfp->istatus = INFO_ANY;
4550       }
4551     }
4552 
4553     /*
4554      * Call the format specific getinfo procedure to get the information.
4555      */
4556     len = isfp->fp_entryend - isfp->fp_entrystart;
4557     status = (*file_table[isfp->format].getinfo_fn)(isfp, isfp->fp_entrystart,
4558                                                     len, flag);
4559     switch (status) {
4560     case STATUS_OK:
4561     case STATUS_WARNING:
4562       break;
4563 
4564     case STATUS_ERROR:
4565     case STATUS_FATAL:
4566       return NULL;
4567 
4568     default:
4569       status_error(return NULL, "intseqf_info");
4570     }
4571   }
4572 
4573   if (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO) {
4574     isfp->istatus = (flag == SEQINFO_ALL ? INFO_ALL : INFO_ALLINFO);
4575     if (!newbuffer)
4576       return (char *) isfp->info;
4577 
4578     s = (char *) isfp->info;
4579     len = isfp->infosize;
4580     buf = (char *) malloc(len);
4581     memory_error(buf == NULL, return NULL);
4582     memcpy(buf, s, len);
4583 
4584     newinfo = (SEQINFO *) buf;
4585     if (isfp->info->dbname)
4586       newinfo->dbname = buf + (isfp->info->dbname - s);
4587     if (isfp->info->filename)
4588       newinfo->filename = buf + (isfp->info->filename - s);
4589     if (isfp->info->format)
4590       newinfo->format = buf + (isfp->info->format - s);
4591     if (isfp->info->date)
4592       newinfo->date = buf + (isfp->info->date - s);
4593     if (isfp->info->idlist)
4594       newinfo->idlist = buf + (isfp->info->idlist - s);
4595     if (isfp->info->description)
4596       newinfo->description = buf + (isfp->info->description - s);
4597     if (isfp->info->comment)
4598       newinfo->comment = buf + (isfp->info->comment - s);
4599     if (isfp->info->organism)
4600       newinfo->organism = buf + (isfp->info->organism - s);
4601     if (isfp->info->history)
4602       newinfo->history = buf + (isfp->info->history - s);
4603 
4604     return buf;
4605   }
4606   else {
4607     s = NULL;
4608     isfp->istatus = INFO_ANY;
4609     switch (flag) {
4610     case SEQINFO_DATE:  isfp->iflag_date = 1;
4611                         s = isfp->info->date;
4612                         break;
4613     case SEQINFO_IDLIST:  isfp->iflag_idlist = 1;
4614                           s = isfp->info->idlist;
4615                           break;
4616     case SEQINFO_DESCRIPTION:  isfp->iflag_description = 1;
4617                                s = isfp->info->description;
4618                                break;
4619     case SEQINFO_COMMENT: isfp->iflag_comment = 1;
4620                           s = isfp->info->comment;
4621                           break;
4622     case SEQINFO_ORGANISM:  isfp->iflag_organism = 1;
4623                             s = isfp->info->organism;
4624                             break;
4625     case SEQINFO_FRAGMENT:  isfp->iflag_fragment = 1;
4626                             return "";
4627     case SEQINFO_CIRCULAR:  isfp->iflag_circular = 1;
4628                             return "";
4629     case SEQINFO_ALPHABET:  isfp->iflag_alphabet = 1;
4630                             return "";
4631     case SEQINFO_STARTPOS:  isfp->iflag_fragstart = 1;
4632                             return "";
4633     case SEQINFO_TRUELEN:
4634     case SEQINFO_RAWLEN:    return "";
4635     default:
4636       program_error(1, return NULL,
4637                     print_error("   Illegal flag value %d in intseqf_info\n",
4638                                 flag));
4639     }
4640     if (s == NULL || !newbuffer)
4641       return s;
4642 
4643     len = strlen(s) + 1;
4644     buf = (char *) malloc(len);
4645     memory_error(buf == NULL, return NULL);
4646     memcpy(buf, s, len);
4647 
4648     return buf;
4649   }
4650 }
4651 
4652 
4653 
4654 
4655 /*
4656  *
4657  *
4658  * Section for error reporting procedures, both interface procedures
4659  * and internal procedures.
4660  *
4661  *
4662  *
4663  */
4664 
4665 
4666 /*
4667  * seqfperror
4668  *
4669  * Prints the error string for the last error that occurred.
4670  * Similar to the `perror' function.
4671  *
4672  * Parameters:  s  - a character string (could be NULL)
4673  *
4674  * Returns:  nothing
4675  */
seqfperror(char * s)4676 void seqfperror(char *s)
4677 {
4678   if (!ctype_initflag)
4679     init_ctype();
4680 
4681   if (s != NULL) {
4682     (*perror_fn)(s);
4683     (*perror_fn)(":  ");
4684   }
4685 
4686   if (seqferrno != E_NOERROR)
4687     (*perror_fn)(seqferrstr);
4688 }
4689 
4690 
4691 /*
4692  * seqferrpolicy
4693  *
4694  * Sets the error policy for the SEQIO package.  When the package
4695  * hits either an error or a warning condition, it does up to three
4696  * things:
4697  *    1) set seqferrno to the appropriate value
4698  *    2) print a descriptive error message to stderr
4699  *    3) if an error, either call exit or return an error value
4700  *
4701  * The different error policies allow the user to specify whether
4702  * a descriptive message is printed and whether `exit' is called
4703  * on an error.
4704  *
4705  * Parameters:  pe  - an integer describing the new error policy
4706  *
4707  * Returns:  the old error policy
4708  */
seqferrpolicy(int pe)4709 int seqferrpolicy(int pe)
4710 {
4711   int oldpe;
4712 
4713   if (!ctype_initflag)
4714     init_ctype();
4715 
4716   param_error(pe < PE_NONE || pe > PE_ALL, return -1, "seqferrpolicy",
4717               "arg 1 is an invalid error policy");
4718 
4719   oldpe = pe_flag;
4720   pe_flag = pe;
4721 
4722   return oldpe;
4723 }
4724 
4725 
4726 /*
4727  * seqfsetperror
4728  *
4729  * Sets the function called to output all of the error messages.
4730  *
4731  * Parameters:  perror_fn  - a pointer to an error function (or NULL
4732  *                           to use the default printing)
4733  *
4734  * Returns:  nothing
4735  */
seqfsetperror(void (* perr_fn)(char *))4736 void seqfsetperror(void (*perr_fn)(char *))
4737 {
4738   if (!ctype_initflag)
4739     init_ctype();
4740 
4741   param_error(perr_fn == NULL, return, "seqfsetperror", "arg 1 is NULL");
4742 
4743   if (perr_fn == NULL)
4744     perror_fn = puterror;
4745   else
4746     perror_fn = perr_fn;
4747 }
4748 
4749 
4750 /*
4751  * print_fatal
4752  *
4753  * An internal procedure which does the printing and exiting on
4754  * an error.  Its arguments are similar to that of printf.
4755  *
4756  * Parameters:  format, ...   -  similar to printf
4757  *
4758  * Returns:  nothing
4759  */
print_fatal(char * format,...)4760 static void print_fatal(char *format, ...)
4761 {
4762   char *s;
4763   va_list ap;
4764 
4765   for (s=seqferrstr; err_batchmode && *s; s++) ;
4766   va_start(ap, format);
4767   vsprintf(s, format, ap);
4768   va_end(ap);
4769 
4770   if (pe_flag == PE_ALL || pe_flag == PE_NOEXIT ||
4771       pe_flag == PE_NOWARN || pe_flag == PE_ERRONLY) {
4772     (*perror_fn)(seqferrstr);
4773   }
4774   if (pe_flag == PE_ALL || pe_flag == PE_NOWARN)
4775     exit(1);
4776 }
4777 
4778 
4779 /*
4780  * print_error
4781  *
4782  * An internal procedure which does the printing on an error.
4783  * Its arguments are similar to that of printf.
4784  *
4785  * Parameters:  format, ...   -  similar to printf
4786  *
4787  * Returns:  nothing
4788  */
print_error(char * format,...)4789 static void print_error(char *format, ...)
4790 {
4791   char *s;
4792   va_list ap;
4793 
4794   for (s=seqferrstr; err_batchmode && *s; s++) ;
4795   va_start(ap, format);
4796   vsprintf(s, format, ap);
4797   va_end(ap);
4798 
4799   if (pe_flag == PE_ALL || pe_flag == PE_NOEXIT ||
4800       pe_flag == PE_NOWARN || pe_flag == PE_ERRONLY) {
4801     (*perror_fn)(seqferrstr);
4802   }
4803 }
4804 
4805 
4806 /*
4807  * print_warning
4808  *
4809  * An internal procedure which does the printing of a descriptive
4810  * message about a warning.  Its arguments are similar to that of printf.
4811  *
4812  * Parameters:  format, ...   -  similar to printf
4813  *
4814  * Returns:  nothing
4815  */
print_warning(char * format,...)4816 static void print_warning(char *format, ...)
4817 {
4818   char *s;
4819   va_list ap;
4820 
4821   for (s=seqferrstr; err_batchmode && *s; s++) ;
4822   va_start(ap, format);
4823   vsprintf(s, format, ap);
4824   va_end(ap);
4825 
4826   if (pe_flag == PE_ALL || pe_flag == PE_NOEXIT || pe_flag == PE_WARNONLY) {
4827     (*perror_fn)(seqferrstr);
4828   }
4829 }
4830 
4831 
4832 
4833 /*
4834  *
4835  *  Section for procedures that read the various file formats.
4836  *
4837  *
4838  */
4839 
4840 
4841 /*
4842  * raw_read
4843  *
4844  * The input file consists of a single entry (i.e., the characters of
4845  * the file are the characters of the entry).  This read function is
4846  * used for the "Raw", "Plain", "GCG" and "MSF" file formats.
4847  *
4848  * Parameters:  isfp  -  an opened INTSEQFILE structure
4849  *
4850  * Return: a STATUS value
4851  */
raw_read(INTSEQFILE * isfp,int flag)4852 static int raw_read(INTSEQFILE *isfp, int flag)
4853 {
4854   int status, gcglen, msfflag;
4855   char *s, *t, *end, *line;
4856 
4857   if (isfp->entry_count != 0)
4858     return STATUS_EOF;
4859 
4860   isfp->entry_count++;
4861   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
4862   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
4863   isfp->entry_seqno = isfp->entry_numseqs = 1;
4864 
4865   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_current;
4866   isfp->fp_entryend = NULL;
4867 
4868   status = fp_read_all(isfp);
4869   switch (status) {
4870   case STATUS_OK:    break;
4871   case STATUS_EOF:   raise_error(E_PARSEERROR, return STATUS_ERROR,
4872                             print_error("%s:  Empty file.\n", isfp->filename));
4873   case STATUS_ERROR: return STATUS_ERROR;
4874   case STATUS_FATAL: return STATUS_FATAL;
4875   default:           status_error(return STATUS_ERROR, "raw_read");
4876   }
4877 
4878   isfp->fp_entryend = isfp->fp_top;
4879 
4880   if (isfp->format == FORMAT_RAW) {
4881     isfp->entry_truelen = isfp->entry_rawlen = isfp->fp_entryend -
4882                                                isfp->fp_entrystart;
4883     isfp->iflag_truelen = isfp->iflag_rawlen = 1;
4884 
4885     return STATUS_OK;
4886   }
4887 
4888   /*
4889    * Check to see if the file is a GCG or MSF file by looking for a line
4890    * ending with "..".
4891    */
4892   for (s=isfp->fp_entrystart; s < isfp->fp_entryend; s++) {
4893     if (*s == '.' && s[1] == '.') {
4894       for (s+=2; s < isfp->fp_entryend && *s != '\n'; s++)
4895         if (!isspace(*s))
4896           break;
4897       if (s < isfp->fp_entryend && *s == '\n')
4898         break;
4899     }
4900   }
4901 
4902   if (s >= isfp->fp_entryend ||
4903       (!isfp->autodetermined && isfp->format == FORMAT_PLAIN)) {
4904     if (isfp->format == FORMAT_PLAIN) {
4905       if (!isfp->autodetermined)
4906         return STATUS_OK;
4907       else {
4908         set_error(E_DETFAILED);
4909         print_warning("%s:  Cannot determine file format.  Using `plain'.\n",
4910                       isfp->filename);
4911         return STATUS_WARNING;
4912       }
4913     }
4914 
4915     raise_error(E_PARSEERROR, return STATUS_ERROR,
4916                 print_error("%s, entry 1:  Parse error in GCG entry:  "
4917                             "no `..' dividing line.\n", isfp->filename));
4918   }
4919 
4920   /*
4921    * If it is a GCG or MSF file, determine which one by looking for the
4922    * information line preceeding the `..' and looking for "MSF: ".
4923    */
4924   gcglen = msfflag = 0;
4925   for (t=s-3; t >= isfp->fp_entrystart && *t != '\n'; t--) {
4926     if (gcglen == 0 && mystreq(t, 'M', "MSF: ")) {
4927       gcglen = myatoi(t + 4, 10, '0');
4928       msfflag = (toupper(*t) == 'M');
4929     }
4930     else if (gcglen == 0 && mystreq(t, 'L', "LENGTH: ")) {
4931       gcglen = myatoi(t + 8, 10, '0');
4932       msfflag = (toupper(*t) == 'M');
4933     }
4934   }
4935   isfp->gcg_infoline = t+1;
4936 
4937   if (!msfflag || isfp->format == FORMAT_GCG) {
4938     isfp->format = FORMAT_GCG;
4939     if (isfp->autodetermined)
4940       isfp->gcg_subformat = FORMAT_UNKNOWN;
4941     isfp->fp_seqstart = s + 1;
4942     if (gcglen > 0) {
4943       isfp->entry_rawlen = isfp->entry_seqlen = gcglen;
4944       isfp->iflag_rawlen = 1;
4945     }
4946 
4947     return STATUS_OK;
4948   }
4949 
4950   /*
4951    * Find the number of sequences in the MSF entry and the length
4952    * of the first sequence.
4953    */
4954   isfp->format = FORMAT_MSF;
4955   isfp->fp_seqstart = NULL;
4956   isfp->entry_numseqs = 0;
4957   isfp->malign_seqno = 1;
4958 
4959   s++;
4960   end = isfp->fp_entryend;
4961   while (s < end) {
4962     for (line=s; s < end && *s != '\n' && isspace(*s); s++) ;
4963     if (s >= end)
4964       break;
4965 
4966     if (*s != '\n') {
4967       if (*s == '/' && s[1] == '/')
4968         break;
4969 
4970       error_test(!mystreq(s, 'N', "NAME: "), E_PARSEERROR, return STATUS_ERROR,
4971                  print_error("%s, entry 1:  Parse error in MSF sequence "
4972                              "header lines.\n", isfp->filename));
4973 
4974       isfp->entry_numseqs++;
4975       if (isfp->fp_seqstart == NULL) {
4976         isfp->fp_seqstart = line;
4977 
4978         /*
4979          * Find the length of the first sequence.
4980          */
4981         for ( ; s + 5 < end && *s != '\n'; s++) {
4982           if (mystreq(s, 'L', "LEN: ")) {
4983             isfp->entry_rawlen = isfp->entry_seqlen = myatoi(s + 4, 10, '0');
4984             isfp->iflag_rawlen = 1;
4985             break;
4986           }
4987         }
4988         error_test(isfp->entry_seqlen <= 0, E_PARSEERROR, return STATUS_ERROR,
4989                    print_error("%s, entry 1:  Invalid format of MSF sequence "
4990                                "header lines.\n", isfp->filename));
4991       }
4992 
4993       while (s < end && *s != '\n') s++;
4994       if (s >= end)
4995         break;
4996     }
4997     s++;
4998   }
4999   error_test(s >= end, E_PARSEERROR, return STATUS_ERROR,
5000              print_error("%s, entry 1:  Parse error in MSF entry:  "
5001                          "no `//' dividing line.\n", isfp->filename));
5002   error_test(isfp->entry_numseqs == 0, E_PARSEERROR, return STATUS_ERROR,
5003              print_error("%s, entry 1:  Parse error in MSF entry:  "
5004                          "no sequence headers between `..' and `//'.\n",
5005                          isfp->filename));
5006   return STATUS_OK;
5007 }
5008 
5009 
5010 /*
5011  * raw_getseq  (Raw file-format)
5012  *
5013  * In the raw file format, the "entry" contains only the sequence.
5014  *
5015  * Parameters:  isfp        -  an INTSEQFILE structure that has read an
5016  *                             entry from a sequence file.
5017  *              rawseqflag  -  should the actual sequence (when 0) or
5018  *                             the raw sequence text be extracted (when 1),
5019  *                             or should just the lengths be set (when 2).
5020  *
5021  * Return: a STATUS value
5022  */
raw_getseq(INTSEQFILE * isfp,int rawseqflag)5023 static int raw_getseq(INTSEQFILE *isfp, int rawseqflag)
5024 {
5025   int len;
5026 
5027   program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
5028                 print_error("   fp_entryend not set by %s's read function\n",
5029                             file_table[isfp->format].ident));
5030 
5031   if (rawseqflag == GETSEQ_LENGTHS)
5032     return STATUS_OK;
5033 
5034   if (isfp->fp_seqstart == NULL) {
5035     isfp->seqlen = 0;
5036     isfp->seq[0] = '\0';
5037     set_error(E_NOSEQ);
5038     print_error("%s, entry %d:  Entry contains no sequence.\n",
5039                 isfp->filename, isfp->entry_count);
5040     return STATUS_ERROR;
5041   }
5042 
5043   len = isfp->fp_entryend - isfp->fp_seqstart;
5044   if (len + 1 >= isfp->seqsize) {
5045     isfp->seqsize += len + 1;
5046     isfp->seq = (char *) realloc(isfp->seq, isfp->seqsize);
5047     if (isfp->seq == NULL) {
5048       isfp->seqsize = 0;
5049       memory_error(1, return STATUS_FATAL);
5050     }
5051   }
5052 
5053   memcpy(isfp->seq, isfp->fp_seqstart, len);
5054   isfp->seq[len] = '\0';
5055   isfp->seqlen = len;
5056 
5057   return STATUS_OK;
5058 }
5059 
5060 
5061 /*
5062  * databank_read
5063  *
5064  * Parameters:  isfp  -  an opened INTSEQFILE structure
5065  *
5066  * Return: a STATUS value
5067  */
databank_read(INTSEQFILE * isfp,int flag)5068 static int databank_read(INTSEQFILE *isfp, int flag)
5069 {
5070   int status, format, period, count, sqflag;
5071   char *s, *t, *line, *end, *keyword;
5072 
5073   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
5074   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
5075   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
5076   isfp->entry_count++;
5077   isfp->entry_seqno = isfp->entry_numseqs = 1;
5078   isfp->gcg_infoline = NULL;
5079 
5080   format = (isfp->format == FORMAT_GCG ? isfp->gcg_subformat : isfp->format);
5081 
5082   /*
5083    * Scan to the first line.
5084    */
5085   switch (format) {
5086   case FORMAT_GENBANK:  keyword = "LOCUS";  break;
5087   case FORMAT_PIR:      keyword = "ENTRY";  break;
5088   case FORMAT_EMBL:     keyword = "ID   ";  break;
5089   case FORMAT_SPROT:    keyword = "ID   ";  break;
5090   default:
5091     status_error(return STATUS_ERROR, "databank_read");
5092   }
5093 
5094   while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
5095     if (mystreq(line, *keyword, keyword))
5096       break;
5097 
5098   if (status == STATUS_EOF) {
5099     error_test(isfp->entry_count == 1,
5100                E_PARSEERROR, return STATUS_ERROR,
5101                print_error("%s:  File contains no %s entries.\n",
5102                            isfp->filename, file_table[format].ident));
5103     return STATUS_EOF;
5104   }
5105 
5106   /*
5107    * Read the rest of the entry, if no error has occurred.  While reading
5108    * the entry, look for a line containing the sequence's length, as well
5109    * as the first line containing the sequence.
5110    */
5111   if (status == STATUS_OK) {
5112     isfp->fp_entrystart = line;
5113 
5114     switch (format) {
5115     case FORMAT_GENBANK:
5116       isfp->entry_truelen = isfp->entry_seqlen = myatoi(line + 22, 10, '0');
5117       isfp->iflag_truelen = 1;
5118 
5119       while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5120         if (line[0] == '/' && line[1] == '/')
5121           break;
5122         else if (mystreq(line, 'O', "ORIGIN"))
5123           isfp->fp_seqstart = end + 1;
5124         else if (end - line >= 2 && end[-1] == '.' && end[-2] == '.') {
5125           if (isfp->fp_seqstart && !isfp->gcg_infoline &&
5126               (isfp->format == FORMAT_GCG || isfp->autodetermined)) {
5127             error_test(isfp->entry_count != 1,
5128                        E_PARSEERROR, return STATUS_ERROR,
5129                        print_error("%s, entry %d:  GCG entry found, but not "
5130                                    "as only entry in file.\n", isfp->filename,
5131                                    isfp->entry_count));
5132 
5133             if (isfp->format != FORMAT_GCG) {
5134               isfp->gcg_subformat = isfp->format;
5135               isfp->format = FORMAT_GCG;
5136             }
5137             isfp->gcg_infoline = line;
5138             isfp->fp_seqstart = end + 1;
5139           }
5140         }
5141       }
5142       break;
5143 
5144     case FORMAT_PIR:
5145       while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5146         if (line[0] == '/' && line[1] == '/' && line[2] == '/')
5147           break;
5148         else if (mystreq(line, 'S', "SEQUENCE"))
5149           isfp->fp_seqstart = end + 1;
5150         else if (mystreq(line, 'S', "SUMMARY")) {
5151           for (t=line+7; t < end && !mystreq(t, '#', "#LENGTH"); t++) ;
5152           isfp->entry_truelen = isfp->entry_seqlen = myatoi(t + 7, 10, '0');
5153           isfp->iflag_truelen = 1;
5154         }
5155         else if (end - line >= 2 && end[-1] == '.' && end[-2] == '.') {
5156           if (isfp->fp_seqstart && !isfp->gcg_infoline &&
5157               (isfp->format == FORMAT_GCG || isfp->autodetermined)) {
5158             error_test(isfp->entry_count != 1,
5159                        E_PARSEERROR, return STATUS_ERROR,
5160                        print_error("%s, entry %d:  GCG entry found, but not "
5161                                    "as only entry in file.\n", isfp->filename,
5162                                    isfp->entry_count));
5163 
5164             if (isfp->format != FORMAT_GCG) {
5165               isfp->gcg_subformat = isfp->format;
5166               isfp->format = FORMAT_GCG;
5167             }
5168             isfp->gcg_infoline = line;
5169             isfp->fp_seqstart = end + 1;
5170           }
5171         }
5172       }
5173       break;
5174 
5175     case FORMAT_EMBL:
5176     case FORMAT_SPROT:
5177       /*
5178        * If this is the first entry read in, and the format was
5179        * specified as EMBL, check that entry to see if it is in fact
5180        * a Swiss-Prot entry.
5181        */
5182       if (isfp->entry_count == 1 && isfp->format == FORMAT_EMBL) {
5183         period = count = 0;
5184         t = NULL;
5185         for (s=line+5; s < end; s++) {
5186           if (*s == ';') {
5187             count++;
5188             t = s;
5189           }
5190           else if (*s == '.')
5191             period = 1;
5192         }
5193 
5194         if (count == 2 && period && mystreq(t-3, 'P', "PRT;"))
5195           isfp->format = FORMAT_SPROT;
5196       }
5197 
5198       sqflag = 0;
5199       while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5200         if (line[0] == '/' && line[1] == '/')
5201           break;
5202         else if (isfp->fp_seqstart == NULL && mystreq(line, ' ', "     "))
5203           isfp->fp_seqstart = line;
5204         else if (mystreq(line, 'S', "SQ")) {
5205           if (end - line > 13) {
5206             isfp->entry_truelen =
5207               isfp->entry_seqlen = myatoi(line+13, 10, '0');
5208             isfp->iflag_truelen = 1;
5209           }
5210           sqflag = 1;
5211         }
5212         else if (end - line >= 2 && end[-1] == '.' && end[-2] == '.') {
5213           if ((isfp->fp_seqstart || sqflag) && !isfp->gcg_infoline &&
5214               (isfp->format == FORMAT_GCG || isfp->autodetermined)) {
5215             error_test(isfp->entry_count != 1,
5216                        E_PARSEERROR, return STATUS_ERROR,
5217                        print_error("%s, entry %d:  GCG entry found, but not "
5218                                    "as only entry in file.\n", isfp->filename,
5219                                    isfp->entry_count));
5220 
5221             if (isfp->format != FORMAT_GCG) {
5222               isfp->gcg_subformat = isfp->format;
5223               isfp->format = FORMAT_GCG;
5224             }
5225             isfp->gcg_infoline = line;
5226             isfp->fp_seqstart = end + 1;
5227           }
5228         }
5229       }
5230       break;
5231 
5232     default:
5233       status_error(return STATUS_ERROR, "databank_read");
5234     }
5235   }
5236 
5237   /*
5238    * Check for errors during the read.
5239    */
5240   switch (status) {
5241   case STATUS_OK:    break;
5242   case STATUS_EOF:   error_test(isfp->format != FORMAT_GCG,
5243                                 E_PARSEERROR, return STATUS_ERROR,
5244                                 print_error("%s, entry %d:  Premature EOF "
5245                                             "reached.\n", isfp->filename,
5246                                             isfp->entry_count));
5247                      break;
5248   case STATUS_ERROR: return STATUS_ERROR;
5249   case STATUS_FATAL: return STATUS_FATAL;
5250   default:           status_error(return STATUS_ERROR, "databank_read");
5251   }
5252 
5253   /*
5254    * Set the final values for the entry.
5255    */
5256   if (isfp->format == FORMAT_GCG) {
5257     error_test(isfp->gcg_infoline == NULL, E_PARSEERROR, return STATUS_ERROR,
5258                print_error("%s, entry %d:  No `..' dividing line in GCG "
5259                            "entry.\n", isfp->filename, isfp->entry_count));
5260 
5261     isfp->fp_entryend = isfp->fp_top;
5262     for (s=isfp->gcg_infoline; *s != '\n'; s++) {
5263       if (mystreq(s, 'L', "LENGTH: ")) {
5264         isfp->entry_rawlen = myatoi(s + 8, 10, '0');
5265         isfp->iflag_rawlen = 1;
5266         if (isfp->entry_seqlen <= 0)
5267           isfp->entry_seqlen = isfp->entry_rawlen;
5268         break;
5269       }
5270     }
5271   }
5272   else {
5273     isfp->fp_entryend = end + 1;
5274     if (isfp->fp_seqstart == NULL)
5275       isfp->entry_seqno = isfp->entry_numseqs = 0;
5276   }
5277 
5278   return STATUS_OK;
5279 }
5280 
5281 
5282 /*
5283  * basic_read
5284  *
5285  * Parameters:  isfp  -  an opened INTSEQFILE structure
5286  *
5287  * Return: a STATUS value
5288  */
basic_read(INTSEQFILE * isfp,int flag)5289 static int basic_read(INTSEQFILE *isfp, int flag)
5290 {
5291   int status, format, header_offset, descr_line, descr_end;
5292   char keych, *s, *line, *end;
5293 
5294   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
5295   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
5296   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
5297   isfp->entry_count++;
5298   isfp->entry_seqno = isfp->entry_numseqs = 1;
5299   isfp->gcg_infoline = NULL;
5300 
5301   format = (isfp->format == FORMAT_GCG ? isfp->gcg_subformat : isfp->format);
5302   descr_line = descr_end = -1;
5303 
5304   if (format == FORMAT_NBRF || format == FORMAT_NBRFOLD)
5305     isfp->nbrf_header = NULL;
5306 
5307   /*
5308    * Scan to the first line.
5309    */
5310   switch (format) {
5311   case FORMAT_FASTA:
5312   case FORMAT_FASTAOLD:
5313   case FORMAT_NBRF:
5314   case FORMAT_NBRFOLD:
5315     keych = '>';
5316     break;
5317 
5318   case FORMAT_STANFORD:
5319   case FORMAT_STANFORDOLD:
5320     keych = ';';
5321     break;
5322 
5323   default:
5324     status_error(return STATUS_ERROR, "basic_read");
5325   }
5326 
5327   while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
5328     if (line[0] == keych)
5329       break;
5330 
5331   if (status == STATUS_EOF) {
5332     error_test(isfp->entry_count == 1, E_PARSEERROR, return STATUS_ERROR,
5333                print_error("%s:  File contains no %s entries.\n",
5334                            isfp->filename, file_table[format].ident));
5335     return STATUS_EOF;
5336   }
5337 
5338   /*
5339    * Read the rest of the header, if no error has occurred.
5340    */
5341   if (status == STATUS_OK) {
5342     isfp->fp_entrystart = line;
5343 
5344     if (format == FORMAT_FASTA) {
5345       descr_line = line - isfp->fp_entrystart;
5346       descr_end = end - isfp->fp_entrystart;
5347     }
5348 
5349     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
5350       if (line[0] != keych)
5351         break;
5352   }
5353 
5354   /*
5355    * Read any extra lines to the beginning of the sequence.
5356    */
5357   if (status == STATUS_OK) {
5358     switch (format) {
5359     case FORMAT_FASTA:
5360     case FORMAT_FASTAOLD:
5361       if (line[0] == ';') {
5362         while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
5363           if (line[0] != ';')
5364             break;
5365       }
5366       break;
5367 
5368     case FORMAT_NBRF:
5369     case FORMAT_NBRFOLD:
5370     case FORMAT_STANFORD:
5371     case FORMAT_STANFORDOLD:
5372       descr_line = line - isfp->fp_entrystart;
5373       descr_end = end - isfp->fp_entrystart;
5374       status = fp_get_line(isfp, &line, &end);
5375       break;
5376     }
5377   }
5378 
5379   /*
5380    * If the file ends before any sequence lines are given, return
5381    * with the entry consisting of just a header.
5382    */
5383   if (status == STATUS_EOF) {
5384     isfp->entry_seqno = isfp->entry_numseqs = 0;
5385     isfp->fp_entryend = isfp->fp_top;
5386     return STATUS_OK;
5387   }
5388 
5389   /*
5390    * Read the sequence lines up to the next line beginning with 'keych'.
5391    */
5392   if (status == STATUS_OK && line[0] != keych) {
5393     isfp->fp_seqstart = line;
5394 
5395     switch (format) {
5396     case FORMAT_FASTA:
5397     case FORMAT_FASTAOLD:
5398     case FORMAT_STANFORD:
5399     case FORMAT_STANFORDOLD:
5400       while (status == STATUS_OK && line[0] != keych) {
5401         if (end - line >= 2 && end[-1] == '.' && end[-2] == '.') {
5402           if (!isfp->gcg_infoline &&
5403               (isfp->format == FORMAT_GCG || isfp->autodetermined)) {
5404             error_test(isfp->entry_count != 1,
5405                        E_PARSEERROR, return STATUS_ERROR,
5406                        print_error("%s, entry %d:  GCG entry found, but not "
5407                                    "as only entry in file.\n", isfp->filename,
5408                                    isfp->entry_count));
5409 
5410             if (isfp->format != FORMAT_GCG) {
5411               isfp->gcg_subformat = isfp->format;
5412               isfp->format = FORMAT_GCG;
5413             }
5414             isfp->gcg_infoline = line;
5415             isfp->fp_seqstart = end + 1;
5416           }
5417         }
5418         status = fp_get_line(isfp, &line, &end);
5419       }
5420       break;
5421 
5422     case FORMAT_NBRF:
5423     case FORMAT_NBRFOLD:
5424       header_offset = 0;
5425       while (status == STATUS_OK && line[0] != keych) {
5426         if (!header_offset && line[1] == ';')
5427           header_offset = line - isfp->fp_entrystart;
5428         else if (end - line >= 2 && end[-1] == '.' && end[-2] == '.') {
5429           if (!isfp->gcg_infoline &&
5430               (isfp->format == FORMAT_GCG || isfp->autodetermined)) {
5431             error_test(isfp->entry_count != 1,
5432                        E_PARSEERROR, return STATUS_ERROR,
5433                        print_error("%s, entry %d:  GCG entry found, but not "
5434                                    "as only entry in file.\n", isfp->filename,
5435                                    isfp->entry_count));
5436 
5437             if (isfp->format != FORMAT_GCG) {
5438               isfp->gcg_subformat = isfp->format;
5439               isfp->format = FORMAT_GCG;
5440             }
5441             isfp->gcg_infoline = line;
5442             isfp->fp_seqstart = end + 1;
5443           }
5444         }
5445         status = fp_get_line(isfp, &line, &end);
5446       }
5447       if (header_offset)
5448         isfp->nbrf_header = isfp->fp_entrystart + header_offset;
5449       break;
5450     }
5451   }
5452 
5453   /*
5454    * Check for errors during the read.
5455    */
5456   switch (status) {
5457   case STATUS_OK:    isfp->fp_entryend = isfp->fp_current = line;
5458                      break;
5459   case STATUS_EOF:   isfp->fp_entryend = isfp->fp_current = isfp->fp_top;
5460                      break;
5461   case STATUS_ERROR: return STATUS_ERROR;
5462   case STATUS_FATAL: return STATUS_FATAL;
5463   default:           status_error(return STATUS_ERROR, "basic_read");
5464   }
5465 
5466   if (isfp->format == FORMAT_GCG) {
5467     error_test(status == STATUS_OK, E_PARSEERROR, return STATUS_ERROR,
5468                print_error("%s, entry %d:  Improperly formatted sequence "
5469                            "lines of GCG entry.\n", isfp->filename,
5470                            isfp->entry_count));
5471     error_test(isfp->gcg_infoline == NULL, E_PARSEERROR, return STATUS_ERROR,
5472                print_error("%s, entry %d:  No `..' dividing line in GCG "
5473                            "entry.\n", isfp->filename, isfp->entry_count));
5474   }
5475 
5476   /*
5477    * Find the sequence length.
5478    */
5479   if (isfp->gcg_infoline) {
5480     end = isfp->fp_entryend;
5481     for (s=isfp->gcg_infoline; s < end && *s != '\n'; s++) {
5482       if (mystreq(s, 'L', "LENGTH: ")) {
5483         isfp->entry_rawlen = myatoi(s + 8, 10, '0');
5484         isfp->iflag_rawlen = 1;
5485         break;
5486       }
5487     }
5488   }
5489   if (descr_line != -1) {
5490     line = isfp->fp_entrystart + descr_line;
5491     end = isfp->fp_entrystart + descr_end;
5492 
5493     flag = 0;
5494     for (s=end-1; s >= line && isspace(*s); s--) ;
5495     if (s >= line && *s == '.')
5496       for (s--; s >= line && isspace(*s); s--) ;
5497     if (s >= line && *s == ')') {
5498       for (s--; s >= line && *s != '('; s--) ;
5499       for (s--; s >= line && isspace(*s); s--) ;
5500     }
5501     if (s - line >= 3 && isspace(*(s-2)) &&
5502         ((toupper(*(s-1)) == 'B' && toupper(*s) == 'P') ||
5503          (toupper(*(s-1)) == 'A' && toupper(*s) == 'A') ||
5504          (toupper(*(s-1)) == 'C' && toupper(*s) == 'H'))) {
5505       flag++;
5506       for (s-=3; s >= line && isspace(*s); s--) ;
5507     }
5508     if (s >= line && isdigit(*s)) {
5509       flag++;
5510       while (s >= line && isdigit(*s)) s--;
5511       while (s >= line && isspace(*s)) s--;
5512     }
5513     if (s >= line && flag == 2 && *s == ',') {
5514       isfp->entry_truelen = myatoi(s + 1, 10, '0');
5515       isfp->iflag_truelen = 1;
5516     }
5517   }
5518 
5519   return STATUS_OK;
5520 }
5521 
5522 
5523 /*
5524  * basic_getseq
5525  *
5526  * The basic method for getting the sequence from an entry.  The pointer
5527  * "fp_seqstart" points to the first line of the sequence, and all of
5528  * the alphabetic characters from there to "fp_entryend" make up the
5529  * sequence.  The procedure copies those characters into the sequence
5530  * buffer.
5531  *
5532  * There are a couple minor variations included in the function.  For
5533  * GenBank, EMBL, PIR and Swissprot, the last line of the entry is ignored
5534  * (since it contains the "//" or "///" terminator).  For FASTA, the
5535  * function ignores any text after a semi-colon on a line.  For NBRF, the
5536  * function stops either at the first asterisk, the first "header" line or
5537  * the end of the entry.
5538  *
5539  * Parameters:  isfp        -  an INTSEQFILE structure that has read an
5540  *                             entry from a sequence file.
5541  *              rawseqflag  -  should the actual sequence or the raw
5542  *                             sequence text be extracted.
5543  *
5544  * Returns:  a STATUS value.
5545  */
static int basic_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int format, stop_at_star, skip_comments;
  unsigned char ch;
  char *s, *end, *seq, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  /*
   * An entry with no sequence section has zero lengths.  Asking for the
   * characters of such an entry is reported as an E_NOSEQ error.
   */
  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }

    isfp->seqlen = 0;
    isfp->seq[0] = '\0';
    set_error(E_NOSEQ);
    print_error("%s, entry %d:  Entry contains no sequence.\n",
                isfp->filename, isfp->entry_count);
    return STATUS_ERROR;
  }

  s = isfp->fp_seqstart;
  end = isfp->fp_entryend;
  format = isfp->format;

  /*
   * For GenBank, PIR, EMBL and Swissprot, move the end in front of the
   * "//" or "///" line.
   *
   * For NBRF, move the end to the first "header" line that appears after
   * the sequence.
   */
  if (format == FORMAT_GENBANK || format == FORMAT_PIR ||
      format == FORMAT_EMBL || format == FORMAT_SPROT) {
    if (end > s && *(end-1) == '\n') end--;
    while (end > s && *(end-1) != '\n') end--;
  }
  else if (format == FORMAT_NBRF) {
    if (isfp->nbrf_header != NULL)
      end = isfp->nbrf_header;
  }

  /*
   * Reallocate the sequence buffer, if necessary.  The span being scanned
   * is an upper bound on the number of characters stored.  The result of
   * realloc goes through a temporary so that the old buffer can be
   * released (instead of leaked) when the call fails.
   */
  if (rawseqflag != GETSEQ_LENGTHS && end - s + 1 >= isfp->seqsize) {
    isfp->seqsize += end - s + 1;
    newseq = (char *) realloc(isfp->seq, isfp->seqsize);
    if (newseq == NULL) {
      free(isfp->seq);
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
  }
  seq = isfp->seq;

  /*
   * The formats differ only in two scanning rules:
   *
   *   - NBRF always stops at the first asterisk.  FASTA stops at the
   *     first asterisk when extracting the raw sequence or the lengths,
   *     but not when extracting the true sequence.
   *     NOTE(review): that FASTA asymmetry is kept exactly as it was;
   *     confirm whether it is intended.
   *   - FASTA skips everything from a semi-colon to the end of the line.
   */
  stop_at_star = (format == FORMAT_NBRF ||
                  (format == FORMAT_FASTA && rawseqflag != GETSEQ_SEQUENCE));
  skip_comments = (format == FORMAT_FASTA);

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract (or count) the sequence characters.  Each character is
   * converted to unsigned char before it is given to the <ctype.h>
   * functions, whose behavior is undefined for negative arguments.
   */
  for ( ; s < end; s++) {
    ch = (unsigned char) *s;

    if (stop_at_star && ch == '*')
      break;

    if (skip_comments && ch == ';') {
      while (s < end && *s != '\n') s++;
      if (s >= end)
        break;
      continue;
    }

    switch (rawseqflag) {
    case GETSEQ_SEQUENCE:
      if (isalpha(ch))
        *seq++ = *s;
      break;

    case GETSEQ_RAWSEQ:
      if (!(isspace(ch) || isdigit(ch)))
        *seq++ = *s;
      break;

    case GETSEQ_LENGTHS:
      if (!(isspace(ch) || isdigit(ch))) {
        isfp->entry_rawlen++;
        if (isalpha(ch))
          isfp->entry_truelen++;
      }
      break;

    default:
      break;
    }
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform checks on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
    if (rawseqflag == GETSEQ_SEQUENCE &&
        isfp->entry_seqlen > 0 && isfp->entry_seqlen != isfp->seqlen) {
      set_error(E_DIFFLENGTH);
      print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
                    "but %d characters found.\n", isfp->filename,
                    isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
      return STATUS_WARNING;
    }

    /*
     * Remember the length that was just computed.
     */
    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
5736 
5737 
5738 /*
5739  * databank_fast_read
5740  *
5741  * Parameters:  isfp  -  an opened INTSEQFILE structure
5742  *
5743  * Return: a STATUS value
5744  */
databank_fast_read(INTSEQFILE * isfp,int flag)5745 static int databank_fast_read(INTSEQFILE *isfp, int flag)
5746 {
5747   static int jt_flag = 0;
5748   static int origin_jump_table[128], summary_jump_table[128];
5749   static int sequenc_jump_table[128];
5750   register char *s;
5751   int i, count, status, shift, format, width, thin, wide, num;
5752   char *t, *line, *end, *stemp, *top;
5753 
5754   if (!jt_flag) {
5755     for (i=0; i < 128; i++)
5756       origin_jump_table[i] = 7;
5757     origin_jump_table['\n'] = 6;
5758     origin_jump_table['O'] = 5;
5759     origin_jump_table['R'] = 4;
5760     origin_jump_table['G'] = 2;
5761     origin_jump_table['I'] = 1;
5762     origin_jump_table['N'] = 0;
5763 
5764     for (i=0; i < 128; i++)
5765       summary_jump_table[i] = 8;
5766     summary_jump_table['\n'] = 7;
5767     summary_jump_table['S'] = 6;
5768     summary_jump_table['U'] = 5;
5769     summary_jump_table['M'] = 3;
5770     summary_jump_table['A'] = 2;
5771     summary_jump_table['R'] = 1;
5772     summary_jump_table['Y'] = 0;
5773 
5774     for (i=0; i < 128; i++)
5775       sequenc_jump_table[i] = 13;
5776     sequenc_jump_table['\n'] = 12;
5777     sequenc_jump_table[' '] = 7;
5778     sequenc_jump_table['S'] = sequenc_jump_table['s'] = 6;
5779     sequenc_jump_table['Q'] = sequenc_jump_table['q'] = 4;
5780     sequenc_jump_table['U'] = sequenc_jump_table['u'] = 3;
5781     sequenc_jump_table['E'] = sequenc_jump_table['e'] = 2;
5782     sequenc_jump_table['N'] = sequenc_jump_table['n'] = 1;
5783     sequenc_jump_table['C'] = sequenc_jump_table['c'] = 0;
5784 
5785     jt_flag = 1;
5786   }
5787 
5788   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
5789   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
5790   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
5791   isfp->entry_count++;
5792   isfp->entry_seqno = isfp->entry_numseqs = 1;
5793 
5794   format = isfp->format;
5795 
5796   /*
5797    * Simple line-by-line scanning to look for the LOCUS line.
5798    */
5799   switch (format) {
5800   case FORMAT_GBFAST:
5801     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5802       if (line[0] == 'L' && line[1] == 'O' && line[2] == 'C' &&
5803           line[3] == 'U' && line[4] == 'S')
5804         break;
5805     }
5806     break;
5807 
5808   case FORMAT_PIRFAST:
5809     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5810       if (line[0] == 'E' && line[1] == 'N' && line[2] == 'T' &&
5811           line[3] == 'R' && line[4] == 'Y')
5812         break;
5813     }
5814     break;
5815 
5816   case FORMAT_EMBLFAST:
5817   case FORMAT_SPFAST:
5818     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5819       if (line[0] == 'I' && line[1] == 'D' && line[2] == ' ' &&
5820           line[3] == ' ' && line[4] == ' ')
5821         break;
5822     }
5823     break;
5824 
5825   default:
5826     status_error(return STATUS_ERROR, "databank_fast_read");
5827   }
5828 
5829   if (status == STATUS_EOF) {
5830     error_test(isfp->entry_count == 1, E_PARSEERROR, return STATUS_ERROR,
5831                print_error("%s:  File contains no %s entries.\n",
5832                            isfp->filename, file_table[format].ident));
5833     return STATUS_EOF;
5834   }
5835 
5836   if (status == STATUS_OK) {
5837     isfp->fp_entrystart = line;
5838 
5839     if (format == FORMAT_GBFAST) {
5840       isfp->entry_seqlen = myatoi(line + 22, 10, '0');
5841       isfp->entry_rawlen = isfp->entry_truelen = isfp->entry_seqlen;
5842       isfp->iflag_truelen = isfp->iflag_rawlen = 1;
5843     }
5844 
5845     /*
5846      * Perform a simplified Boyer-Moore search for the "ORIGIN", "SUMMARY"
5847      * or "SQ   Sequenc" line.
5848      *
5849      * During the Boyer-Moore search, "s" and "top" will be local copies of
5850      * the fp_current and fp_top values (normally hidden inside fp_get_line).
5851      */
5852     line = s = isfp->fp_current;
5853     top = isfp->fp_top;
5854 
5855     width = 0;
5856     switch (format) {
5857     case FORMAT_GBFAST:
5858       s += 5;
5859       while (status == STATUS_OK) {
5860         while (s < top && (shift = origin_jump_table[(int) *s]))
5861           s += shift;
5862 
5863         if (s < top) {
5864           if (s[-6] == '\n' && s[-5] == 'O' && s[-4] == 'R' &&
5865               s[-3] == 'I' && s[-2] == 'G' && s[-1] == 'I')
5866             break;
5867           else
5868             s += 7;
5869         }
5870         else {
5871           stemp = s;
5872           status = fp_read_more(isfp, &line, &stemp, &top);
5873           s = stemp;
5874         }
5875       }
5876       width = 5;
5877       break;
5878 
5879     case FORMAT_PIRFAST:
5880       s += 6;
5881       while (status == STATUS_OK) {
5882         while (s < top && (shift = summary_jump_table[(int) *s]))
5883           s += shift;
5884 
5885         if (s < top) {
5886           if (s[-7] == '\n' && s[-6] == 'S' && s[-5] == 'U' &&
5887               s[-4] == 'M' && s[-3] == 'M' && s[-2] == 'A' && s[-1] == 'R')
5888             break;
5889           else
5890             s += 8;
5891         }
5892         else {
5893           stemp = s;
5894           status = fp_read_more(isfp, &line, &stemp, &top);
5895           s = stemp;
5896         }
5897       }
5898       width = 6;
5899       break;
5900 
5901     case FORMAT_EMBLFAST:
5902     case FORMAT_SPFAST:
5903       s += 11;
5904       while (status == STATUS_OK) {
5905         while (s < top && (shift = sequenc_jump_table[(int) *s]))
5906           s += shift;
5907 
5908         if (s < top) {
5909           if (s[-12] == '\n' && s[-11] == 'S' && s[-10] == 'Q' &&
5910               s[-9] == ' ' && s[-8] == ' ' && s[-7] == ' ' && s[-6] == 'S' &&
5911               toupper(s[-5]) == 'E' && toupper(s[-4]) == 'Q' &&
5912               toupper(s[-3]) == 'U' && toupper(s[-2]) == 'E' &&
5913               toupper(s[-1]) == 'N')
5914             break;
5915           else
5916             s += 13;
5917         }
5918         else {
5919           stemp = s;
5920           status = fp_read_more(isfp, &line, &stemp, &top);
5921           s = stemp;
5922         }
5923       }
5924       width = 11;
5925       break;
5926     }
5927 
5928     if (status == STATUS_OK) {
5929       isfp->fp_current = s - width;
5930       isfp->fp_top = top;
5931     }
5932   }
5933 
5934   if (status == STATUS_OK &&
5935       (status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
5936     if (format == FORMAT_EMBLFAST || format == FORMAT_SPFAST) {
5937       isfp->entry_seqlen = myatoi(line + 14, 10, '0');
5938       isfp->entry_truelen = isfp->entry_rawlen = isfp->entry_seqlen;
5939       isfp->iflag_truelen = isfp->iflag_rawlen = 1;
5940     }
5941     else if (format == FORMAT_PIRFAST) {
5942       if (mystreq(line + 17, '#', "#LENGTH")) {
5943         isfp->entry_truelen = isfp->entry_seqlen = myatoi(line + 25, 10, '0');
5944         isfp->iflag_truelen = 1;
5945       }
5946       else {
5947         for (t=line + 7; t < end; t++) {
5948           if (*t == '#' && mystreq(t+1, 'L', "LENGTH")) {
5949             isfp->entry_truelen = isfp->entry_seqlen = myatoi(t + 8, 10, '0');
5950             isfp->iflag_truelen = 1;
5951             break;
5952           }
5953         }
5954       }
5955 
5956       if ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
5957         status = fp_get_line(isfp, &line, &end);
5958     }
5959   }
5960 
5961   if (status == STATUS_OK) {
5962     /*
5963      * The entry length must have been in the header for this code to work.
5964      */
5965     error_test(isfp->entry_seqlen == 0, E_PARSEERROR, return STATUS_ERROR,
5966                print_error("%s, entry %d:  Sequence length is missing from ",
5967                            "entry.\n", isfp->filename, isfp->entry_count));
5968     /*
5969      * Skip past the complete sequence lines (i.e., the lines containing
5970      * a full 60 characters of the sequence, and then restore the values
5971      * of fp_current and fp_top.
5972      */
5973     line = s = isfp->fp_current;
5974     top = isfp->fp_top;
5975 
5976     isfp->fp_seqstart = line;
5977 
5978     switch (format) {
5979     case FORMAT_GBFAST:    thin = 76;  wide = 76;  num = 60;  break;
5980     case FORMAT_PIRFAST:   thin = 68;  wide = 69;  num = 30;  break;
5981     case FORMAT_EMBLFAST:  thin = 71;  wide = 81;  num = 60;  break;
5982     case FORMAT_SPFAST:    thin = 71;  wide = 81;  num = 60;  break;
5983     default:
5984       status_error(return STATUS_ERROR, "databank_fast_read");
5985     }
5986 
5987     count = num;
5988     while (count < isfp->entry_seqlen) {
5989       if (s + wide < top || (s + thin < top && s[thin-1] == '\n')) {
5990         count += num;
5991         s += (s[thin-1] == '\n' ? thin : wide);
5992       }
5993       else {
5994         stemp = s;
5995         status = fp_read_more(isfp, &line, &stemp, &top);
5996         s = stemp;
5997         if (status != STATUS_OK)
5998           break;
5999       }
6000     }
6001 
6002     if (status == STATUS_OK) {
6003       isfp->fp_current = s;
6004       isfp->fp_top = top;
6005 
6006       error_test(*(s-1) != '\n', E_PARSEERROR, return STATUS_ERROR,
6007                  print_error("%s, entry %d:  Improper format of entry's "
6008                              "sequence lines.\n", isfp->filename,
6009                              isfp->entry_count));
6010     }
6011   }
6012 
6013   /*
6014    * Finally, look for the "//" line to end the sequence.  It must be
6015    * either on the current line or the next line (all previous lines were
6016    * skipped above).
6017    */
6018   if (status == STATUS_OK) {
6019     status = fp_get_line(isfp, &line, &end);
6020     if (status == STATUS_OK && (line[0] != '/' || line[1] != '/'))
6021       status = fp_get_line(isfp, &line, &end);
6022 
6023     error_test(status == STATUS_OK && (line[0] != '/' || line[1] != '/'),
6024                E_PARSEERROR, return STATUS_ERROR,
6025                print_error("%s, entry %d:  Improper format of entry's sequence"
6026                            " lines.\n", isfp->filename, isfp->entry_count));
6027   }
6028 
6029   switch (status) {
6030   case STATUS_OK:    break;
6031   case STATUS_EOF:   raise_error(E_PARSEERROR, return STATUS_ERROR,
6032                        print_error("%s, entry %d:  Premature EOF reached.\n",
6033                                    isfp->filename, isfp->entry_count));
6034   case STATUS_ERROR: return STATUS_ERROR;
6035   case STATUS_FATAL: return STATUS_FATAL;
6036   default:           status_error(return STATUS_ERROR, "databank_fast_read");
6037   }
6038 
6039   isfp->fp_entryend = end + 1;
6040 
6041   return STATUS_OK;
6042 }
6043 
6044 
6045 /*
6046  * databank_fast_getseq
6047  *
6048  * Parameters:  isfp  -  an opened INTSEQFILE structure
6049  *
6050  * Return: a STATUS value
6051  */
databank_fast_getseq(INTSEQFILE * isfp,int rawseqflag)6052 static int databank_fast_getseq(INTSEQFILE *isfp, int rawseqflag)
6053 {
6054   int count, format;
6055   register char *s, *seq;
6056 
6057   program_error(isfp->fp_seqstart == NULL, return STATUS_ERROR,
6058                 print_error("  fp_seqstart not set by %s's read function\n",
6059                             file_table[isfp->format].ident));
6060   program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
6061                 print_error("   fp_entryend not set by %s's read function\n",
6062                             file_table[isfp->format].ident));
6063   program_error(isfp->entry_seqlen <= 0, return STATUS_ERROR,
6064                 print_error("   Entry %d's sequence length not set by %s's "
6065                             "read function\n", isfp->entry_count,
6066                             file_table[isfp->format].ident));
6067 
6068   if (rawseqflag == GETSEQ_LENGTHS)
6069     return STATUS_OK;
6070 
6071   /*
6072    * Reallocate the sequence buffer, if necessary.
6073    */
6074   if (isfp->entry_seqlen >= isfp->seqsize) {
6075     isfp->seqsize += isfp->entry_seqlen;
6076     isfp->seq = (char *) realloc(isfp->seq, isfp->seqsize);
6077     if (isfp->seq == NULL) {
6078       isfp->seqsize = 0;
6079       memory_error(1, return STATUS_FATAL);
6080     }
6081   }
6082 
6083   /*
6084    * Scan the "full" sequence lines, i.e., lines containing 60 characters
6085    * of the sequence.  Such lines contain 76 characters overall (including
6086    * the newline).
6087    */
6088   s = isfp->fp_seqstart;
6089   seq = isfp->seq;
6090   format = isfp->format;
6091 
6092   switch (format) {
6093   case FORMAT_GBFAST:
6094     for (count=60; count <= isfp->entry_seqlen; count+=60) {
6095       seq[0] = s[10];  seq[1] = s[11];  seq[2] = s[12];  seq[3] = s[13];
6096       seq[4] = s[14];  seq[5] = s[15];  seq[6] = s[16];  seq[7] = s[17];
6097       seq[8] = s[18];  seq[9] = s[19];  seq[10] = s[21]; seq[11] = s[22];
6098       seq[12] = s[23]; seq[13] = s[24]; seq[14] = s[25]; seq[15] = s[26];
6099       seq[16] = s[27]; seq[17] = s[28]; seq[18] = s[29]; seq[19] = s[30];
6100 
6101       seq[20] = s[32]; seq[21] = s[33]; seq[22] = s[34]; seq[23] = s[35];
6102       seq[24] = s[36]; seq[25] = s[37]; seq[26] = s[38]; seq[27] = s[39];
6103       seq[28] = s[40]; seq[29] = s[41]; seq[30] = s[43]; seq[31] = s[44];
6104       seq[32] = s[45]; seq[33] = s[46]; seq[34] = s[47]; seq[35] = s[48];
6105       seq[36] = s[49]; seq[37] = s[50]; seq[38] = s[51]; seq[39] = s[52];
6106 
6107       seq[40] = s[54]; seq[41] = s[55]; seq[42] = s[56]; seq[43] = s[57];
6108       seq[44] = s[58]; seq[45] = s[59]; seq[46] = s[60]; seq[47] = s[61];
6109       seq[48] = s[62]; seq[49] = s[63]; seq[50] = s[65]; seq[51] = s[66];
6110       seq[52] = s[67]; seq[53] = s[68]; seq[54] = s[69]; seq[55] = s[70];
6111       seq[56] = s[71]; seq[57] = s[72]; seq[58] = s[73]; seq[59] = s[74];
6112 
6113       seq += 60;
6114       s += 76;
6115     }
6116     break;
6117 
6118   case FORMAT_PIRFAST:
6119     for (count=30; count <= isfp->entry_seqlen; count+=30) {
6120       seq[0] = s[8];   seq[1] = s[10];  seq[2] = s[12];  seq[3] = s[14];
6121       seq[4] = s[16];  seq[5] = s[18];  seq[6] = s[20];  seq[7] = s[22];
6122       seq[8] = s[24];  seq[9] = s[26];  seq[10] = s[28]; seq[11] = s[30];
6123       seq[12] = s[32]; seq[13] = s[34]; seq[14] = s[36]; seq[15] = s[38];
6124       seq[16] = s[40]; seq[17] = s[42]; seq[18] = s[44]; seq[19] = s[46];
6125 
6126       seq[20] = s[48]; seq[21] = s[50]; seq[22] = s[52]; seq[23] = s[54];
6127       seq[24] = s[56]; seq[25] = s[58]; seq[26] = s[60]; seq[27] = s[62];
6128       seq[28] = s[64]; seq[29] = s[66];
6129 
6130       seq += 30;
6131       s += (s[67] == '\n' ? 68 : 69);
6132     }
6133     break;
6134 
6135   case FORMAT_EMBLFAST:
6136   case FORMAT_SPFAST:
6137     for (count=60; count <= isfp->entry_seqlen; count+=60) {
6138       seq[0] = s[5];   seq[1] = s[6];   seq[2] = s[7];   seq[3] = s[8];
6139       seq[4] = s[9];   seq[5] = s[10];  seq[6] = s[11];  seq[7] = s[12];
6140       seq[8] = s[13];  seq[9] = s[14];  seq[10] = s[16]; seq[11] = s[17];
6141       seq[12] = s[18]; seq[13] = s[19]; seq[14] = s[20]; seq[15] = s[21];
6142       seq[16] = s[22]; seq[17] = s[23]; seq[18] = s[24]; seq[19] = s[25];
6143 
6144       seq[20] = s[27]; seq[21] = s[28]; seq[22] = s[29]; seq[23] = s[30];
6145       seq[24] = s[31]; seq[25] = s[32]; seq[26] = s[33]; seq[27] = s[34];
6146       seq[28] = s[35]; seq[29] = s[36]; seq[30] = s[38]; seq[31] = s[39];
6147       seq[32] = s[40]; seq[33] = s[41]; seq[34] = s[42]; seq[35] = s[43];
6148       seq[36] = s[44]; seq[37] = s[45]; seq[38] = s[46]; seq[39] = s[47];
6149 
6150       seq[40] = s[49]; seq[41] = s[50]; seq[42] = s[51]; seq[43] = s[52];
6151       seq[44] = s[53]; seq[45] = s[54]; seq[46] = s[55]; seq[47] = s[56];
6152       seq[48] = s[57]; seq[49] = s[58]; seq[50] = s[60]; seq[51] = s[61];
6153       seq[52] = s[62]; seq[53] = s[63]; seq[54] = s[64]; seq[55] = s[65];
6154       seq[56] = s[66]; seq[57] = s[67]; seq[58] = s[68]; seq[59] = s[69];
6155 
6156       seq += 60;
6157       s += (s[70] == '\n' ? 71 : 81);
6158     }
6159     break;
6160   }
6161 
6162 
6163   /*
6164    * Scan the last line of the sequence, if it exists.
6165    */
6166   if (*s != '/') {
6167     while (*s != '\n') {
6168       if (isalpha(*s))
6169         *seq++ = *s;
6170       s++;
6171     }
6172   }
6173 
6174   *seq = '\0';
6175   isfp->seqlen = seq - isfp->seq;
6176 
6177   /*
6178    * Check the length of the sequence.
6179    */
6180   if (isfp->seqlen == 0) {
6181     set_error(E_NOSEQ);
6182     print_error("%s, entry %d:  Entry contains no sequence.\n",
6183                 isfp->filename, isfp->entry_count);
6184     return STATUS_ERROR;
6185   }
6186   if (isfp->entry_seqlen > 0 && isfp->entry_seqlen != isfp->seqlen) {
6187     set_error(E_DIFFLENGTH);
6188     print_warning("Warning: %s, entry %d:  Entry gives seq. length of %d, "
6189                   "but %d characters found.\n", isfp->filename,
6190                   isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
6191     return STATUS_WARNING;
6192   }
6193 
6194   return STATUS_OK;
6195 }
6196 
6197 
6198 /*
6199  * gcg_getseq
6200  *
6201  * Get the sequence for a GCG entry.  This is very similar to the
6202  * basic_getseq function, except that all periods read from the
6203  * sequence are automatically replaced with dashes.
6204  *
6205  * Parameters:  isfp        -  an INTSEQFILE structure that has read an
6206  *                             entry from a sequence file.
6207  *              rawseqflag  -  should the actual sequence or the raw
6208  *                             sequence text be extracted.
6209  *
6210  * Returns:  a STATUS value.
6211  */
gcg_getseq(INTSEQFILE * isfp,int rawseqflag)6212 static int gcg_getseq(INTSEQFILE *isfp, int rawseqflag)
6213 {
6214   int format;
6215   char ch, *s, *end, *seq;
6216 
6217   program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
6218                 print_error("   fp_entryend not set by %s's read function\n",
6219                             file_table[isfp->format].ident));
6220 
6221   if (isfp->fp_seqstart == NULL) {
6222     if (rawseqflag == GETSEQ_LENGTHS) {
6223       isfp->entry_rawlen = isfp->entry_truelen = 0;
6224       isfp->iflag_rawlen = isfp->iflag_truelen = 1;
6225       return STATUS_OK;
6226     }
6227     else {
6228       isfp->seqlen = 0;
6229       isfp->seq[0] = '\0';
6230       set_error(E_NOSEQ);
6231       print_error("%s, entry %d:  Entry contains no sequence.\n",
6232                   isfp->filename, isfp->entry_count);
6233       return STATUS_ERROR;
6234     }
6235   }
6236 
6237   s = isfp->fp_seqstart;
6238   end = isfp->fp_entryend;
6239   format = isfp->format;
6240 
6241   /*
6242    * Reallocate the sequence buffer, if necessary.
6243    */
6244   if (rawseqflag != GETSEQ_LENGTHS && end - s + 1 >= isfp->seqsize) {
6245     isfp->seqsize += end - s + 1;
6246     isfp->seq = (char *) realloc(isfp->seq, isfp->seqsize);
6247     if (isfp->seq == NULL) {
6248       isfp->seqsize = 0;
6249       memory_error(1, return STATUS_FATAL);
6250     }
6251   }
6252   seq = isfp->seq;
6253 
6254   /*
6255    * Extract the sequence characters.
6256    */
6257   switch (rawseqflag) {
6258   case GETSEQ_SEQUENCE:
6259     for ( ; s < end; s++) {
6260       if (*s == '>' || *s == '<' || *s == '$') {
6261         for (ch=*s++; s < end && *s != ch; s++) ;
6262       }
6263       else if (isalpha(*s))
6264         *seq++ = *s;
6265     }
6266     break;
6267 
6268   case GETSEQ_RAWSEQ:
6269     for ( ; s < end; s++) {
6270       if (*s == '>' || *s == '<' || *s == '$') {
6271         for (ch=*s++; s < end && *s != ch; s++) ;
6272       }
6273       else if (!isspace(*s) && !isdigit(*s))
6274         *seq++ = (*s == '.' ? '-' : *s);
6275     }
6276     break;
6277 
6278   case GETSEQ_LENGTHS:
6279     isfp->entry_truelen = isfp->entry_rawlen = 0;
6280     for ( ; s < end; s++) {
6281       if (*s == '>' || *seq == '<' || *s == '$') {
6282         for (ch=*s++; s < end && *s != ch; s++) ;
6283       }
6284       else if (!isspace(*s) && !isdigit(*s)) {
6285         isfp->entry_rawlen++;
6286         if (isalpha(*s))
6287           isfp->entry_truelen++;
6288       }
6289     }
6290     break;
6291   }
6292 
6293   if (rawseqflag == GETSEQ_LENGTHS)
6294     isfp->iflag_truelen = isfp->iflag_rawlen = 1;
6295   else {
6296     *seq = '\0';
6297     isfp->seqlen = seq - isfp->seq;
6298 
6299     /*
6300      * Perform checks on the sequence length.
6301      */
6302     if (isfp->seqlen == 0) {
6303       set_error(E_NOSEQ);
6304       print_error("%s, entry %d:  Entry contains no sequence.\n",
6305                   isfp->filename, isfp->entry_count);
6306       return STATUS_ERROR;
6307     }
6308     if (rawseqflag == GETSEQ_RAWSEQ &&
6309         isfp->entry_seqlen > 0 && isfp->entry_seqlen != isfp->seqlen) {
6310       set_error(E_DIFFLENGTH);
6311       print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
6312                     "but %d characters found.\n", isfp->filename,
6313                     isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
6314       return STATUS_WARNING;
6315     }
6316 
6317     if (rawseqflag == GETSEQ_SEQUENCE) {
6318       isfp->entry_truelen = isfp->seqlen;
6319       isfp->iflag_truelen = 1;
6320     }
6321     else if (rawseqflag == GETSEQ_RAWSEQ) {
6322       isfp->entry_rawlen = isfp->seqlen;
6323       isfp->iflag_rawlen = 1;
6324     }
6325   }
6326 
6327   return STATUS_OK;
6328 }
6329 
6330 
6331 /*
6332  * msf_read
6333  *
6334  * Reads GCG's MSF format.
6335  *
6336  * Parameters:  isfp  -  an opened INTSEQFILE structure
6337  *
6338  * Return: a STATUS value
6339  */
msf_read(INTSEQFILE * isfp,int flag)6340 static int msf_read(INTSEQFILE *isfp, int flag)
6341 {
6342   char *s, *t;
6343 
6344   if (isfp->entry_count == 0)
6345     return raw_read(isfp, flag);
6346 
6347   if (!flag && isfp->entry_seqno < isfp->entry_numseqs) {
6348     isfp->entry_seqno++;
6349 
6350     /*
6351      * Advance the `fp_seqstart' pointer so that it's pointing to the
6352      * header line for the now current sequence, and then get the sequence
6353      * length.
6354      */
6355     while (isfp->malign_seqno < isfp->entry_seqno) {
6356       for (s=isfp->fp_seqstart; *s != '\n'; s++) ;
6357       isfp->fp_seqstart = s + 1;
6358       for (t=s+1; *t != '\n' && isspace(*t); t++) ;
6359       if (*t != '\n')
6360         isfp->malign_seqno++;
6361     }
6362 
6363     isfp->entry_seqlen = 0;
6364     for (s=isfp->fp_seqstart; *s != '\n'; s++) {
6365       if (mystreq(s, 'L', "LEN: ")) {
6366         isfp->entry_rawlen = isfp->entry_seqlen = myatoi(s + 4, 10, '0');
6367         isfp->iflag_rawlen = 1;
6368         break;
6369       }
6370     }
6371     error_test(isfp->entry_seqlen == 0, E_PARSEERROR, return STATUS_ERROR,
6372                print_error("%s, entry 1:  Invalid format of MSF sequence "
6373                            "header lines.\n", isfp->filename));
6374 
6375     return STATUS_OK;
6376   }
6377 
6378   return STATUS_EOF;
6379 }
6380 
6381 
/*
 * msf_getseq
 *
 * Get the current sequence of an MSF entry.  The name of the sequence is
 * taken from its header line (the text following the line's first five
 * non-blank characters).  Every line after the "//" line that begins with
 * that name then contributes its characters to the sequence.  Periods are
 * replaced with dashes when the raw sequence is extracted.
 *
 * The sequence buffer is made large enough for all of the entry's text
 * from the header line on, so a file holding more characters than its
 * header states cannot overrun the buffer.
 *
 * Parameters:  isfp        -  an INTSEQFILE structure that has read an
 *                             entry from a sequence file.
 *              rawseqflag  -  should the actual sequence or the raw
 *                             sequence text be extracted (or just the
 *                             lengths computed).
 *
 * Returns:  a STATUS value.
 */
static int msf_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int i, namelen;
  long need;
  char *s, *t, *end, *seq, *name, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  /*
   * An entry with no sequence section has zero lengths.  Asking for the
   * characters of such an entry is reported as an E_NOSEQ error.
   */
  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  /*
   * Advance `fp_seqstart' to the header line of the current sequence,
   * stepping over any blank lines between the header lines.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    for (s=isfp->fp_seqstart; *s != '\n'; s++) ;
    for (t=s; *t == '\n'; )
      for (s=t++; *t != '\n' && isspace((unsigned char) *t); t++) ;
    isfp->fp_seqstart = s + 1;
    isfp->malign_seqno++;
  }

  end = isfp->fp_entryend;

  /*
   * Reallocate the sequence buffer, if necessary.  Every character
   * stored below comes from a distinct position between the header line
   * and the end of the entry, so the size of that span (plus the
   * terminating NUL) bounds what is written.  The stated length is used
   * when it is larger.  The result of realloc goes through a temporary so
   * that the old buffer can be released (instead of leaked) when the call
   * fails.
   */
  need = (long) (end - isfp->fp_seqstart) + 1;
  if (need <= isfp->entry_seqlen)
    need = (long) isfp->entry_seqlen + 1;

  if (rawseqflag != GETSEQ_LENGTHS && need >= isfp->seqsize) {
    isfp->seqsize += need;
    newseq = (char *) realloc(isfp->seq, isfp->seqsize);
    if (newseq == NULL) {
      free(isfp->seq);
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.  Every line that begins with
   * the name of the sequence contains sequence characters.
   *
   * `namelen' counts the whitespace character that ends the name, so a
   * line only matches when the name on it is also followed by that
   * character.
   */
  for (s=isfp->fp_seqstart; isspace((unsigned char) *s); s++) ;
  for (s+=5; isspace((unsigned char) *s); s++) ;
  for (name=s; !isspace((unsigned char) *s); s++) ;
  namelen = s - name + 1;

  /*
   * Skip to the beginning of the sequence lines (the line after the "//"
   * line).  The two characters after a newline are only examined when
   * they are inside the entry.
   */
  while (s < end &&
         !(end - s >= 3 && s[0] == '\n' && s[1] == '/' && s[2] == '/'))
    s++;
  for (s++; s < end && *s != '\n'; s++) ;
  s++;

  /*
   * Extract the characters.  Characters are converted to unsigned char
   * before they are given to the <ctype.h> functions, whose behavior is
   * undefined for negative arguments.
   */
  while (s < end) {
    while (s < end && *s != '\n' && isspace((unsigned char) *s)) s++;
    if (s >= end || *s == '\n') {
      s++;
      continue;
    }

    for (i=0; s < end && i < namelen && name[i] == *s; i++,s++) ;
    if (i < namelen)
      while (s < end && *s != '\n') s++;
    else {
      switch (rawseqflag) {
      case GETSEQ_SEQUENCE:
        for ( ; s < end && *s != '\n'; s++)
          if (isalpha((unsigned char) *s))
            *seq++ = *s;
        break;

      case GETSEQ_RAWSEQ:
        for ( ; s < end && *s != '\n'; s++)
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s)))
            *seq++ = (*s == '.' ? '-' : *s);
        break;

      case GETSEQ_LENGTHS:
        for ( ; s < end && *s != '\n'; s++) {
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
            isfp->entry_rawlen++;
            if (isalpha((unsigned char) *s))
              isfp->entry_truelen++;
          }
        }
        break;
      }
    }
    s++;
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform a check on the sequence length.  The length stated in the
     * header line is compared against the raw sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
    if (rawseqflag == GETSEQ_RAWSEQ &&
        isfp->entry_seqlen > 0 && isfp->entry_seqlen != isfp->seqlen) {
      set_error(E_DIFFLENGTH);
      print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
                    "but %d characters found.\n", isfp->filename,
                    isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
      return STATUS_WARNING;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
6527 
6528 
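/*
 * fastaout_read     (FASTA output formats)
 *
 *
 * Parameters:  isfp  -  an opened INTSEQFILE structure
 *              flag  -  if zero and the current entry still has an unread
 *                       sequence, just advance to that sequence; otherwise
 *                       read the next entry from the file
 *
 * Return: a STATUS value
 */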
/* Values stored in isfp->fout_mode: which program family wrote the text. */
#define FASTA_MODE    (1)    /* FASTA, FASTX, TFASTA or SSEARCH banner */
#define LFASTA_MODE   (2)    /* LFASTA or LALIGN banner                */
#define ALIGN_MODE    (3)    /* ALIGN banner                           */

/* Values stored in isfp->fout_markx: layout used for the alignments. */
#define MARKX0        (0 + 1)
#define MARKX2        (2)
#define MARKX3        (3)
#define MARKX10       (10)
#define NONPARSABLE   (-1)   /* layout not (yet) recognized */
6546 
/*
 * fastaout_read
 *
 * Advances the file to the next sequence found in the output of one of
 * the FASTA-family programs.  Every entry is treated as holding two
 * sequences (entry_numseqs is set to 2).
 *
 * On the first call (entry_count == 0) the text before the first entry is
 * scanned to determine which program wrote the file (fout_mode and
 * fout_progname) and, when present, the identifier, description, length
 * and alphabet of the query sequence(s).
 *
 * After that, the function either steps to the second sequence of the
 * current entry (flag == 0 and a sequence remains) or scans forward over
 * the next entry, recording fp_entrystart, fp_seqstart and fp_entryend.
 * The alignment layout (fout_markx) is deduced from the text of the first
 * entry and remembered for later calls.
 *
 * NOTE(review): error_test() and raise_error() are defined elsewhere; from
 * the way they are used here, error_test() appears to report the error and
 * execute its "return" argument when its first argument is true -- confirm
 * against the macro definitions.
 *
 * Parameters:  isfp  -  an opened INTSEQFILE structure
 *              flag  -  if zero, the next sequence of the current entry is
 *                       selected when one remains; otherwise the next
 *                       entry is always read
 *
 * Return: a STATUS value
 */
static int fastaout_read(INTSEQFILE *isfp, int flag)
{
  int state, runflag, markx, mode, count, status, fasta_out_bug_flag;
  char *s, *t, *s2, *line, *end;

  /*
   * First call:  scan the text preceding the first entry.  The value of
   * `state' selects what the next line is expected to be:
   *
   *    -1       skipping lines up to a blank line (after a "From " line)
   *     0       looking at a possible program banner line
   *    11       no banner recognized, look for format-specific lines
   *    1,2,3    FASTA_MODE:  query line, list of scores, stop
   *    4,5,6,7  LFASTA_MODE:  " Comparison of:" and the two sequence lines
   *    8,9,10   ALIGN_MODE:  the two '>' sequence lines, stop
   */
  if (isfp->entry_count == 0) {
    state = 0;
    runflag = 1;
    while (runflag && (status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
      switch (state) {
      case -1:
        /* A blank line ends the block being skipped. */
        if (line == end)
          state = 0;
        break;

      case 0:
        /*
         * NOTE(review): the three loops below copy the program name into
         * isfp->fout_progname without checking the size of that buffer,
         * and the isspace() calls are given plain char values.
         */
        if (strncmp(line, "From ", 5) == 0)
          state = -1;
        else if (mystreq(line, ' ', " FASTA") ||
                 mystreq(line, ' ', " FASTX") ||
                 mystreq(line, ' ', " TFASTA") ||
                 mystreq(line, ' ', " SSEARCH")) {
          isfp->fout_mode = FASTA_MODE;
          for (s=isfp->fout_progname,t=line+1; !isspace(*t); s++,t++)
            *s = *t;
          *s = '\0';
          state = 1;
        }
        else if (mystreq(line, ' ', " LFASTA") ||
                 mystreq(line, ' ', " LALIGN")) {
          isfp->fout_mode = LFASTA_MODE;
          for (s=isfp->fout_progname,t=line+1; !isspace(*t); s++,t++)
            *s = *t;
          *s = '\0';
          state = 4;
        }
        else if (mystreq(line, 'A', "ALIGN")) {
          isfp->fout_mode = ALIGN_MODE;
          for (s=isfp->fout_progname,t=line; !isspace(*t); s++,t++)
            *s = *t;
          *s = '\0';
          state = 8;
        }
        else
          state = 11;
        break;

      case 11:
        /*
         * The banner was not recognized, so the program family is guessed
         * from lines that only some of the programs produce.
         */
        if (strncmp(line, "The best scores are:", 20) == 0) {
          isfp->fout_mode = FASTA_MODE;
          strcpy(isfp->fout_progname, "FASTA/FASTX/TFASTA/SSEARCH");
          state = 2;
        }
        else if (strncmp(line, " Comparison of:", 15) == 0) {
          isfp->fout_mode = LFASTA_MODE;
          strcpy(isfp->fout_progname, "LFASTA/LALIGN");
          state = 5;
        }
        else if (line[0] == '>' && line[1] == '>' && line[2] == '>') {
          isfp->fout_mode = LFASTA_MODE;
          strcpy(isfp->fout_progname, "LFASTA/LALIGN");
          runflag = 0;
        }
        break;

      case 1:
        /*
         * FASTA_MODE:  a line beginning with " >" gives the first
         * sequence.  The identifier is the first word, the description
         * runs up to the last ':' on the line, and the length (followed
         * by "aa" or "nt") starts two characters after that ':'.
         */
        if (line[0] == ' ' && line[1] == '>') {
          error_test(isfp->fout_id1 != NULL,
                     E_PARSEERROR, return STATUS_ERROR,
                     print_error("%s, entry %d:  Text differs from %s "
                                 " output format.\n", isfp->filename,
                                 isfp->entry_count, isfp->fout_progname));

          for (t=s=line+2; s < end && !isspace(*s); s++) ;
          if (s < end) {
            isfp->fout_id1 = mystrdup2(t, s);
            for (s++; s < end && isspace(*s); s++);
          }

          if (s < end) {
            for (t=s,s=end-1; s >= t && *s != ':'; s--) ;
            if (t < s) {
              for (s2=s; s2 > t && isspace(*(s2-1)); s2--) ;
              isfp->fout_descr1 = mystrdup2(t, s2);

              s += 2;
              if (s < end && isdigit(*s)) {
                isfp->fout_len1 = *s - '0';
                for (s++; s < end && isdigit(*s); s++) {
                  isfp->fout_len1 *= 10;
                  isfp->fout_len1 += *s - '0';
                }

                /*
                 * NOTE(review): s is not compared with end before *s and
                 * s[1] are read here (the same pattern recurs in cases 5,
                 * 6, 8 and 9).
                 */
                s++;
                if (*s == 'a' && s[1] == 'a')
                  isfp->fout_alpha1 = PROTEIN;
                else if (*s == 'n' && s[1] == 't')
                  isfp->fout_alpha1 = DNA;
              }
            }
          }
        }
        else if (strncmp(line, "The best scores are:", 20) == 0)
          state = 2;
        break;

      case 2:
        /* Skip lines until one is blank or begins with whitespace. */
        if (line == end || isspace(line[0]))
          state = 3;
        break;

      case 3:
        /* The line just read is where the first entry is looked for. */
        runflag = 0;
        break;

      case 4:
        if (strncmp(line, " Comparison of:", 15) == 0)
          state = 5;
        else if (line[0] == '>' && line[1] == '>' && line[2] == '>')
          runflag = 0;
        break;

      case 5:
        /*
         * LFASTA_MODE, first sequence line:  the identifier follows the
         * first '>' found after column 2, the description runs up to the
         * last '-' on the line and the length starts two characters after
         * that '-'.
         */
        error_test(isfp->fout_id1 != NULL, E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Text differs from %s "
                               " output format.\n", isfp->filename,
                               isfp->entry_count, isfp->fout_progname));
        for (s=line+2; s < end && *s != '>'; s++) ;
        for (t=++s; s < end && !isspace(*s); s++) ;
        if (s < end) {
          isfp->fout_id1 = mystrdup2(t, s);
          for (s++; s < end && isspace(*s); s++);
        }
        if (s < end) {
          for (t=s,s=end-1; s >= t && *s != '-'; s--) ;
          if (t < s) {
            for (s2=s; s2 > t && isspace(*(s2-1)); s2--) ;
            isfp->fout_descr1 = mystrdup2(t, s2);
            s += 2;
            if (s < end && isdigit(*s)) {
              isfp->fout_len1 = *s - '0';
              for (s++; s < end && isdigit(*s); s++) {
                isfp->fout_len1 *= 10;
                isfp->fout_len1 += *s - '0';
              }
              s++;
              if (*s == 'a' && s[1] == 'a')
                isfp->fout_alpha1 = PROTEIN;
              else if (*s == 'n' && s[1] == 't')
                isfp->fout_alpha1 = DNA;
            }
          }
        }
        state = 6;
        break;

      case 6:
        /* LFASTA_MODE, second sequence line:  parsed like the first. */
        error_test(isfp->fout_id2 != NULL, E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Text differs from %s "
                               " output format.\n", isfp->filename,
                               isfp->entry_count, isfp->fout_progname));
        for (s=line+2; s < end && *s != '>'; s++) ;
        for (t=++s; s < end && !isspace(*s); s++) ;
        if (s < end) {
          isfp->fout_id2 = mystrdup2(t, s);
          for (s++; s < end && isspace(*s); s++);
        }
        if (s < end) {
          for (t=s,s=end-1; s >= t && *s != '-'; s--) ;
          if (t < s) {
            for (s2=s; s2 > t && isspace(*(s2-1)); s2--) ;
            isfp->fout_descr2 = mystrdup2(t, s2);
            s += 2;
            if (s < end && isdigit(*s)) {
              isfp->fout_len2 = *s - '0';
              for (s++; s < end && isdigit(*s); s++) {
                isfp->fout_len2 *= 10;
                isfp->fout_len2 += *s - '0';
              }
              s++;
              if (*s == 'a' && s[1] == 'a')
                isfp->fout_alpha2 = PROTEIN;
              else if (*s == 'n' && s[1] == 't')
                isfp->fout_alpha2 = DNA;
            }
          }
        }
        state = 7;
        break;

      case 7:
        /* Ignore one line, then skip ahead as in FASTA_MODE. */
        state = 2;
        break;

      case 8:
        /*
         * ALIGN_MODE, first sequence line:  begins with '>' and must be
         * longer than 54 characters.  The identifier is the first word,
         * the description ends at column 52 and the length starts at
         * column 52.
         *
         * NOTE(review): s2 is only bounded below by t (the start of the
         * identifier), so it can end up before s when the line has no
         * description text; mystrdup2(s, s2) is then called with
         * s2 < s.  Confirm how mystrdup2 treats that.
         */
        if (line[0] == '>') {
          error_test(isfp->fout_id1 != NULL || end - line <= 54,
                     E_PARSEERROR, return STATUS_ERROR,
                     print_error("%s, entry %d:  Text differs from %s "
                                 " output format.\n", isfp->filename,
                                 isfp->entry_count, isfp->fout_progname));
          for (t=s=line+1; s < end && !isspace(*s); s++) ;
          if (s < end) {
            isfp->fout_id1 = mystrdup2(t, s);
            for (s++; s < end && isspace(*s); s++);
          }
          if (s < end) {
            for (s2=line+52; s2 > t && isspace(*(s2-1)); s2--) ;
            isfp->fout_descr1 = mystrdup2(s, s2);
            s = line + 52;
            if (s < end && isdigit(*s)) {
              isfp->fout_len1 = *s - '0';
              for (s++; s < end && isdigit(*s); s++) {
                isfp->fout_len1 *= 10;
                isfp->fout_len1 += *s - '0';
              }
              s++;
              if (*s == 'a' && s[1] == 'a')
                isfp->fout_alpha1 = PROTEIN;
              else if (*s == 'n' && s[1] == 't')
                isfp->fout_alpha1 = DNA;
            }
          }
          state = 9;
        }
        break;

      case 9:
        /*
         * ALIGN_MODE, second sequence line:  must directly follow the
         * first one and is parsed the same way.
         */
        error_test(line[0] != '>' || isfp->fout_id2 != NULL || end-line <= 54,
                   E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Text differs from %s "
                               " output format.\n", isfp->filename,
                               isfp->entry_count, isfp->fout_progname));
        for (t=s=line+1; s < end && !isspace(*s); s++) ;
        if (s < end) {
          isfp->fout_id2 = mystrdup2(t, s);
          for (s++; s < end && isspace(*s); s++);
        }
        if (s < end) {
          for (s2=line+52; s2 > t && isspace(*(s2-1)); s2--) ;
          isfp->fout_descr2 = mystrdup2(s, s2);
          s = line + 52;
          if (s < end && isdigit(*s)) {
            isfp->fout_len2 = *s - '0';
            for (s++; s < end && isdigit(*s); s++) {
              isfp->fout_len2 *= 10;
              isfp->fout_len2 += *s - '0';
            }
            s++;
            if (*s == 'a' && s[1] == 'a')
              isfp->fout_alpha2 = PROTEIN;
            else if (*s == 'n' && s[1] == 't')
              isfp->fout_alpha2 = DNA;
          }
        }
        state = 10;
        break;

      case 10:
        runflag = 0;
        break;

      }
    }
    /*
     * Reaching the end of the file during the scan is an error:  either
     * the program could not be identified (state 11) or no entry was
     * found.
     */
    switch (status) {
    case STATUS_OK:    break;
    case STATUS_EOF:   error_test(state == 11,
                                  E_PARSEERROR, return STATUS_ERROR,
                                  print_error("%s:  Cannot determine which "
                                              "FASTA program generated the "
                                              "text.\n", isfp->filename));
                       raise_error(E_PARSEERROR, return STATUS_ERROR,
                                   print_error("%s:  File contains no "
                                               "entries.\n", isfp->filename));
    case STATUS_ERROR: return STATUS_ERROR;
    case STATUS_FATAL: return STATUS_FATAL;
    default:           status_error(return STATUS_ERROR, "fastaout_read");
    }

    /* The line on which the scan stopped must not be empty. */
    error_test(line == end, E_PARSEERROR, return STATUS_ERROR,
               print_error("%s, entry %d:  Text differs from %s "
                           " output format.\n", isfp->filename,
                           isfp->entry_count, isfp->fout_progname));

    /*
     * A '>' at the start of that line selects MARKX10; otherwise the
     * layout is decided while the first entry is scanned below.  Setting
     * fp_current back to the line makes it the next line to be read.
     */
    isfp->fout_markx = (line[0] == '>' ? MARKX10 : NONPARSABLE);
    isfp->fp_current = line;
    isfp->entry_seqno = isfp->entry_numseqs = 0;
  }

  /*
   * If only the next sequence is wanted and the current entry has one
   * left, select it without reading anything.  The length flags are set
   * again only for lengths that are already known (greater than zero).
   */
  if (!flag && isfp->entry_seqno < isfp->entry_numseqs) {
    isfp->entry_seqno++;
    if (isfp->entry_truelen > 0)
      isfp->iflag_truelen = 1;
    if (isfp->entry_rawlen > 0)
      isfp->iflag_rawlen = 1;
    return STATUS_OK;
  }

  /* In ALIGN_MODE, nothing is read after the first entry. */
  if (isfp->fout_mode == ALIGN_MODE && isfp->entry_count > 0)
    return STATUS_EOF;

  /* Reset the per-entry information before reading the next entry. */
  isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
  isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
  isfp->iflag_truelen = isfp->iflag_rawlen = 0;
  isfp->entry_count++;
  isfp->entry_seqno = isfp->entry_numseqs = 0;

  markx = isfp->fout_markx;
  mode = isfp->fout_mode;

  if (markx == NONPARSABLE || markx == MARKX0 ||
      markx == MARKX2 || markx == MARKX3) {
    /*
     * In LFASTA_MODE, every entry after the first is preceded by one
     * line that must be empty or begin with whitespace.
     */
    if (mode == LFASTA_MODE && isfp->entry_count > 1) {
      status = fp_get_line(isfp, &line, &end);
      switch (status) {
      case STATUS_OK:    break;
      case STATUS_EOF:   return STATUS_EOF;
      case STATUS_ERROR: return STATUS_ERROR;
      case STATUS_FATAL: return STATUS_FATAL;
      default:           status_error(return STATUS_ERROR, "fastaout_read");
      }

      error_test(line != end && !isspace(*line),
                 E_PARSEERROR, return STATUS_ERROR,
                 print_error("%s, entry %d:  Text differs from %s "
                             " output format.\n", isfp->filename,
                             isfp->entry_count, isfp->fout_progname));
    }

    /*
     * Scan the lines of the entry.
     *
     *   state 0     the lines before the alignment (the entry's header)
     *   state 1     MARKX0/MARKX2 alignment blocks; `count' is the number
     *               of lines read since the blank line that precedes the
     *               current block
     *   state 2-4   MARKX3:  after the first '>' line, after the second
     *               '>' line, and after the blank line ending the entry
     *
     * The loop is left by `break' on the first line that is not part of
     * the entry.
     */
    state = 0;
    count = 0;
    fasta_out_bug_flag = 0;
    while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
      if (state == 0 && count == 0) {
        /* "Library scan:" in place of an entry ends the list of entries. */
        if (strncmp(line, "Library scan:", 13) == 0) {
          error_test(isfp->entry_count == 1, E_PARSEERROR, return STATUS_ERROR,
                     print_error("%s:  File contains no entries.\n",
                                 isfp->filename));
          return STATUS_EOF;
        }
        else
          isfp->fp_entrystart = line;
      }
      count++;

      /*
       * The header ends at a blank line or at a line beginning with a
       * single '>'.  A '>' line seen while the layout is still unknown
       * selects MARKX3.  The sequence text starts at this line.
       */
      if (state == 0 && (line == end || (line[0] == '>' && line[1] != '>'))) {
        if (markx == NONPARSABLE && line[0] == '>')
          markx = isfp->fout_markx = MARKX3;

        error_test((markx != MARKX3 && line != end) ||
                   (markx == MARKX3 && line[0] != '>'),
                   E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Text differs from %s "
                               " output format.\n", isfp->filename,
                               isfp->entry_count, isfp->fout_progname));
        state = (markx == MARKX3 ? 2 : 1);
        count = 0;
        if (isfp->fp_seqstart == NULL)
          isfp->fp_seqstart = line;
      }
      else if (state == 1) {
        if (count == 1) {
          /*
           * First line after a blank line.  A second blank line is only
           * accepted for MARKX2 (see the bug described below).  A line
           * not beginning with whitespace ends the entry if it starts
           * with "----" or "Library ", or if its last non-blank
           * characters are "aa)" or "nt)"; any other such line is taken
           * as a line of the block and count jumps to 4.
           */
          if (line == end) {
            if (markx == MARKX2)
              fasta_out_bug_flag = 1;
            else {
              raise_error(E_PARSEERROR, return STATUS_ERROR,
                          print_error("%s, entry %d:  Text differs from %s "
                                      " output format.\n", isfp->filename,
                                      isfp->entry_count, isfp->fout_progname));
            }
          }
          else if (!isspace(*line)) {
            if ((line[0] == '-' && strncmp(line, "----", 4) == 0) ||
                (line[0] == 'L' && strncmp(line, "Library ", 8) == 0))
              break;
            for (s=end-1; s > line && isspace(*s); s--) ;
            if (s - 3 > line && *s == ')' &&
                ((*(s-2) == 'a' && *(s-1) == 'a') ||
                 (*(s-2) == 'n' && *(s-1) == 't')))
              break;
            count = 4;
          }
        }
        else if (count == 2 && fasta_out_bug_flag) {
          /*
           * In this FASTA bug, an extra blank line is output at the
           * end of the entry (appearing where the first line of the
           * next entry should be).  So, we need to delay the text
           * for the next entry beginning to the next line.
           */
          if (!isspace(*line)) {
            if ((line[0] == '-' && strncmp(line, "----", 4) == 0) ||
                (line[0] == 'L' && strncmp(line, "Library ", 8) == 0))
              break;
            for (s=end-1; s > line && isspace(*s); s--) ;
            if (s - 3 > line && *s == ')' &&
                ((*(s-2) == 'a' && *(s-1) == 'a') ||
                 (*(s-2) == 'n' && *(s-1) == 't')))
              break;
          }
          raise_error(E_PARSEERROR, return STATUS_ERROR,
                      print_error("%s, entry %d:  Text differs from %s "
                                  " output format.\n", isfp->filename,
                                  isfp->entry_count, isfp->fout_progname));
        }
        else if (count == 3) {
          /*
           * Third line of a block.  A blank line ends the block.
           * Otherwise, if the layout is still unknown, it is MARKX2 when
           * the line begins with a non-space character and MARKX0 when
           * it does not.  For MARKX2 the count skips ahead to 5.
           */
          if (line == end)
            count = 0;
          else {
            if (markx == NONPARSABLE)
              markx = isfp->fout_markx = (!isspace(*line) ? MARKX2 : MARKX0);

            if (markx == MARKX2)
              count+=2;
          }
        }
        else if (count == 6)
          count = 0;
      }
      else if (state == 2 && line[0] == '>')
        state = 3;
      else if (state == 3 && line == end)
        state = 4;
      else if (state == 4 && line != end)
        break;
    }
    /*
     * The end of the file is a valid end of entry once the second
     * sequence of a MARKX3 or ALIGN_MODE entry has been reached, or when
     * an ALIGN_MODE entry stops with count equal to 0 or 5.
     */
    if (status == STATUS_EOF &&
        ((state >= 3 && (markx == MARKX3 || mode == ALIGN_MODE)) ||
         (mode == ALIGN_MODE && state == 1 && (count == 0 || count == 5)))) {
      isfp->entry_seqno = 1;
      isfp->entry_numseqs = 2;
      isfp->fp_entryend = isfp->fp_top;
      return STATUS_OK;
    }
    switch (status) {
    case STATUS_OK:    break;
    case STATUS_EOF:   raise_error(E_PARSEERROR, return STATUS_ERROR,
                         print_error("%s, entry %d:  Premature EOF reached.\n",
                                     isfp->filename, isfp->entry_count));
    case STATUS_ERROR: return STATUS_ERROR;
    case STATUS_FATAL: return STATUS_FATAL;
    default:           status_error(return STATUS_ERROR, "fastaout_read");
    }

    /* The layout must have been determined by the end of the entry. */
    error_test(markx == NONPARSABLE, E_PARSEERROR, return STATUS_ERROR,
               print_error("%s, entry %d:  Text differs from %s "
                           " output format.\n", isfp->filename,
                           isfp->entry_count, isfp->fout_progname));

    /*
     * In LFASTA_MODE the terminating line is consumed; in the other
     * modes it is left to be read again as the start of the next entry.
     */
    if (mode == LFASTA_MODE)
      isfp->fp_entryend = isfp->fp_current = end + 1;
    else
      isfp->fp_entryend = isfp->fp_current = line;
  }
  else if (markx == MARKX10) {
    /*
     * MARKX10:  every entry starts on a line beginning with ">>", and
     * the line read for the first entry must begin with ">>>".
     */
    status = fp_get_line(isfp, &line, &end);
    switch (status) {
    case STATUS_OK:    break;
    case STATUS_EOF:   return STATUS_EOF;
    case STATUS_ERROR: return STATUS_ERROR;
    case STATUS_FATAL: return STATUS_FATAL;
    default:           status_error(return STATUS_ERROR, "fastaout_read");
    }

    if (strncmp(line, "Library scan:", 13) == 0)
      return STATUS_EOF;

    error_test(line[0] != '>' || line[1] != '>',
               E_PARSEERROR, return STATUS_ERROR,
               print_error("%s, entry %d:  Text differs from %s "
                           " output format.\n", isfp->filename,
                           isfp->entry_count, isfp->fout_progname));
    error_test(isfp->entry_count == 1 && line[2] != '>',
               E_PARSEERROR, return STATUS_ERROR,
               print_error("%s, entry %d:  Text differs from %s "
                           " output format.\n", isfp->filename,
                           isfp->entry_count, isfp->fout_progname));

    /*
     * For the first entry, skip the lines following the ">>>" line up to
     * the next line beginning with ">>", where the entry itself starts.
     */
    if (isfp->entry_count == 1) {
      while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
        if (line[0] == '>' && line[1] == '>')
          break;
      switch (status) {
      case STATUS_OK:    break;
      case STATUS_EOF:   raise_error(E_PARSEERROR, return STATUS_ERROR,
                         print_error("%s, entry %d:  Premature EOF reached.\n",
                                     isfp->filename, isfp->entry_count));
      case STATUS_ERROR: return STATUS_ERROR;
      case STATUS_FATAL: return STATUS_FATAL;
      default:           status_error(return STATUS_ERROR, "fastaout_read");
      }
    }

    /*
     * The entry runs up to the next ">>" line or the "Library scan:"
     * line.  The first line inside it that begins with a single '>' is
     * where the sequence text starts.
     */
    isfp->fp_entrystart = line;
    while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
      if ((line[0] == '>' && line[1] == '>') ||
          strncmp(line, "Library scan:", 13) == 0)
        break;
      else if (isfp->fp_seqstart == NULL && line[0] == '>')
        isfp->fp_seqstart = line;
    }
    /* Here the end of the file simply ends the last entry. */
    switch (status) {
    case STATUS_OK:    break;
    case STATUS_EOF:   isfp->fp_entryend = isfp->fp_top;
                       isfp->entry_seqno = 1;
                       isfp->entry_numseqs = 2;
                       return STATUS_OK;
    case STATUS_ERROR: return STATUS_ERROR;
    case STATUS_FATAL: return STATUS_FATAL;
    default:           status_error(return STATUS_ERROR, "fastaout_read");
    }
    isfp->fp_entryend = isfp->fp_current = line;
  }

  /* Each entry holds two sequences; the first one is now current. */
  isfp->entry_seqno = 1;
  isfp->entry_numseqs = 2;
  return STATUS_OK;
}
7066 
7067 
7068 
/*
 * fastaout_getseq
 *
 * Extracts the current sequence (isfp->entry_seqno, 1 or 2) of the
 * current entry from the text between isfp->fp_seqstart and
 * isfp->fp_entryend, using the alignment layout in isfp->fout_markx.
 *
 * Parameters:  isfp        -  an opened INTSEQFILE structure whose read
 *                             function has set fp_entryend
 *              rawseqflag  -  GETSEQ_SEQUENCE:  store the alphabetic
 *                               characters in isfp->seq
 *                             GETSEQ_RAWSEQ:  store the alignment text in
 *                               isfp->seq, with blanks turned into '-'
 *                             GETSEQ_LENGTHS:  only count, filling in
 *                               entry_truelen and entry_rawlen;
 *                               isfp->seq is not touched
 *
 * Where the entry holds no text for the requested sequence in some part
 * of the alignment, '-' characters are stored in its place.
 *
 * Return: a STATUS value (STATUS_ERROR if no sequence is found or the
 *         entry is malformed, STATUS_FATAL if memory runs out)
 */
static int fastaout_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int markx, initflag, seq1only, seq2only, templen;
  char *s, *end, *seq, *last, *seqline1, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  /*
   * The read function found no sequence text in this entry.
   */
  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  s = isfp->fp_seqstart;
  end = isfp->fp_entryend;

  /*
   * Reallocate the sequence buffer, if necessary.
   *
   * Every character stored below corresponds to a distinct character of
   * the entry text, so at most end - s characters plus the terminating
   * NUL are written.  A single sequence can take up almost all of that
   * text (the other one may be very short), so the buffer is sized for
   * the whole text rather than for half of it.
   *
   * The result of realloc is kept in a temporary so that the old buffer
   * can be released, rather than leaked, when the call fails.
   */
  if (rawseqflag != GETSEQ_LENGTHS && end - s + 1 >= isfp->seqsize) {
    newseq = (char *) realloc(isfp->seq, isfp->seqsize + (end - s + 1));
    if (newseq == NULL) {
      free(isfp->seq);
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
    isfp->seqsize += end - s + 1;
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.
   */
  last = NULL;
  markx = isfp->fout_markx;
  if (markx == MARKX0 || markx == MARKX2) {
    /*
     * The alignment is a series of blocks, each preceded by a blank
     * line.  The sequence characters start at column 7 of a sequence
     * line.  Each pass of the loop handles one block.
     */
    initflag = 1;
    while (1) {
      while (s < end && *s != '\n') s++;    /* Skip blank line */
      s++;

      error_test(s >= end && initflag, E_PARSEERROR, return STATUS_ERROR,
                 print_error("%s, entry %d:  Entry contains no sequences.\n",
                             isfp->filename, isfp->entry_count));
      initflag = 0;
      if (s >= end || (*s == '-' && s[1] == '-' && s[2] == '-'))
        break;

      if (*s == '\n' && s+1 == end)  /* This handles the FASTA output bug */
        break;                       /* described in fastaout_read        */

      /*
       * A block may hold only one of the two sequences.  If its first
       * line does not begin with whitespace, there is no line of
       * sequence 1 positions and the block is taken to hold only
       * sequence 2.
       */
      seq1only = seq2only = 0;

      if (s < end && isspace(*s)) {
        while (s < end && *s != '\n') s++;    /* Skip seq 1 positions line */
        s++;
      }
      else
        seq2only = 1;

      seqline1 = s;

      if (isfp->entry_seqno == 2) {
        if (!seq2only) {
          while (s < end && *s != '\n') s++;  /* Skip sequence 1 line */
          s++;

          /*
           * A blank line right after the sequence 1 line means that the
           * block holds only sequence 1.  Go back to that line, whose
           * width determines how many '-' are produced for sequence 2.
           */
          if (s < end && *s == '\n') {
            seq1only = 1;
            s = seqline1;
          }
          else if (markx == MARKX0) {
            while (s < end && *s != '\n') s++;  /* Skip line of matches */
            s++;
          }
        }
      }

      error_test(s >= end, E_PARSEERROR, return STATUS_ERROR,
                 print_error("%s, entry %d:  Premature end of entry.\n",
                             isfp->filename, isfp->entry_count));

      if (!seq1only && !seq2only &&
          markx == MARKX2 && isfp->entry_seqno == 2) {
        /*
         * MARKX2, sequence 2:  a '.' stands for the character in the
         * same column of the sequence 1 line, which `last' walks along
         * in step with s.
         */
        switch (rawseqflag) {
        case GETSEQ_SEQUENCE:
          for (s+=7,last=seqline1+7; s < end && *s != '\n'; s++,last++) {
            if (*s == '.' && isalpha(*last))
              *seq++ = *last;
            else if (isalpha(*s))
              *seq++ = *s;
          }
          break;

        case GETSEQ_RAWSEQ:
          for (s+=7,last=seqline1+7; s < end && *s != '\n'; s++,last++) {
            if (*s == '.' && !(isspace(*last) || isdigit(*last)))
              *seq++ = *last;
            else if (*s != ' ')
              *seq++ = *s;
            else
              *seq++ = '-';
          }
          break;

        case GETSEQ_LENGTHS:
          for (s+=7,last=seqline1+7; s < end && *s != '\n'; s++,last++) {
            isfp->entry_rawlen++;
            if (isalpha(*s) || (*s == '.' && isalpha(*last)))
              isfp->entry_truelen++;
          }
          break;
        }
      }
      else if ((!seq1only && !seq2only) ||
               (seq1only && isfp->entry_seqno == 1) ||
               (seq2only && isfp->entry_seqno == 2)) {
        /*
         * The block holds the requested sequence and s is on its line.
         */
        switch (rawseqflag) {
        case GETSEQ_SEQUENCE:
          for (s+=7; s < end && *s != '\n'; s++)
            if (isalpha(*s))
              *seq++ = *s;
          break;

        case GETSEQ_RAWSEQ:
          for (s+=7; s < end && *s != '\n'; s++)
            *seq++ = (*s == ' ' ? '-' : *s);
          break;

        case GETSEQ_LENGTHS:
          for (s+=7; s < end && *s != '\n'; s++) {
            isfp->entry_rawlen++;
            if (isalpha(*s))
              isfp->entry_truelen++;
          }
          break;
        }
      }
      else {
        /*
         * The block holds only the other sequence:  one '-' is produced
         * for each column of that sequence's line.
         */
        switch (rawseqflag) {
        case GETSEQ_SEQUENCE:
        case GETSEQ_RAWSEQ:
          for (s+=7; s < end && *s != '\n'; s++)
            *seq++ = '-';
          break;

        case GETSEQ_LENGTHS:
          for (s+=7; s < end && *s != '\n'; s++) {
            isfp->entry_rawlen++;
            if (isalpha(*s))
              isfp->entry_truelen++;
          }
          break;
        }
      }
      s++;

      error_test(s >= end && initflag, E_PARSEERROR, return STATUS_ERROR,
                 print_error("%s, entry %d:  Premature end of entry.\n",
                             isfp->filename, isfp->entry_count));

      if (*s == '\n' && !seq1only && isfp->entry_seqno == 1)
        seq1only = 1;

      /*
       * Skip the rest of the block, leaving s on the line just before
       * the blank line that precedes the next block.
       */
      if (seq1only)
        ;                          /* We're at the blank line, so do nothing */
      else if (seq2only) {
        while (s < end && *s != '\n') s++;      /* Skip seq 2 positions line */
        s++;
      }
      else if (isfp->entry_seqno == 1) {
        if (markx == MARKX0) {
          while (s < end && *s != '\n') s++;    /* Skip line of matches */
          s++;
        }
        while (s < end && *s != '\n') s++;      /* Skip sequence 2 line */
        s++;
        if (markx == MARKX0) {
          while (s < end && *s != '\n') s++;    /* Skip seq 2 positions line */
          s++;
        }
      }
      else {   /* isfp->entry_seqno == 2 */
        if (markx == MARKX0) {
          while (s < end && *s != '\n') s++;    /* Skip seq 2 positions line */
          s++;
        }
      }
    }
  }
  else if (markx == MARKX3 || markx == MARKX10) {
    /*
     * Each sequence is a header line beginning with '>' followed by its
     * sequence lines.  In MARKX10, lines beginning with ';' are not
     * sequence lines and are skipped.  templen (MARKX10 only) is the
     * number of sequence-line characters of the other sequence.
     */
    while (s < end && *s != '\n') s++;

    templen = 0;
    if (isfp->entry_seqno == 2) {
      /*
       * Skip sequence 1 (counting its characters for MARKX10) and then
       * the header line of sequence 2.
       */
      if (markx == MARKX3)
        for (s++; s < end && (*s != '\n' || s[1] != '>'); s++) ;
      else {
        while (s < end && (*s != '\n' || s[1] != '>')) {
          if (s[1] == ';')
            for (s++; s < end && *s != '\n'; s++) ;
          else
            for (s++; s < end && *s != '\n'; s++)
              templen++;
        }
      }

      for (s++; s < end && *s != '\n'; s++) ;
    }

    /*
     * Take the lines of the requested sequence.  The inner loops stop on
     * the newline, which the s++ of the outer loop then steps over.  The
     * sequence ends at a line beginning with '>' or at a blank line.
     */
    for (s++; s < end && *s != '>' && *s != '\n'; s++) {
      if (*s == ';')
        while (s < end && *s != '\n') s++;
      else {
        switch (rawseqflag) {
        case GETSEQ_SEQUENCE:
          for ( ; s < end && *s != '\n'; s++)
            if (isalpha(*s))
              *seq++ = *s;
          break;

        case GETSEQ_RAWSEQ:
          for ( ; s < end && *s != '\n'; s++)
            *seq++ = (*s == ' ' ? '-' : *s);
          break;

        case GETSEQ_LENGTHS:
          for ( ; s < end && *s != '\n'; s++) {
            isfp->entry_rawlen++;
            if (isalpha(*s))
              isfp->entry_truelen++;
          }
          break;
        }
      }
    }

    if (markx == MARKX10) {
      if (isfp->entry_seqno == 1) {
        error_test(s >= end, E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Premature end of entry.\n",
                               isfp->filename, isfp->entry_count));

        /*
         * Find sequence 2, skip its header line and count the
         * characters of its sequence lines.
         */
        if (*s != '>') {
          while (s < end && (*s != '\n' || s[1] != '>')) s++;
          s++;
        }
        while (s < end && *s != '\n') s++;
        while (s < end && (*s != '\n' || s[1] != '>')) {
          if (s[1] == ';')
            for (s++; s < end && *s != '\n'; s++) ;
          else
            for (s++; s < end && *s != '\n'; s++)
              templen++;
        }
      }

      /*
       * Pad the extracted text with '-' up to the length of the other
       * sequence.  Nothing is stored when only the lengths are being
       * computed:  the buffer was not sized for this call (it may even be
       * NULL) and the reported lengths do not depend on the padding.
       */
      if (rawseqflag != GETSEQ_LENGTHS) {
        while (seq - isfp->seq < templen)
          *seq++ = '-';
      }
    }
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform checks on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
7373 
7374 
7375 
7376 /*
7377  * blastout_read     (BLAST output formats)
7378  *
7379  *
7380  * Parameters:  isfp  -  an opened INTSEQFILE structure
7381  *
7382  * Return: a STATUS value
7383  */
blastout_read(INTSEQFILE * isfp,int flag)7384 static int blastout_read(INTSEQFILE *isfp, int flag)
7385 {
7386   int status, count;
7387   char ch, *s, *t, *line, *end;
7388 
7389   if (isfp->entry_count == 0) {
7390     isfp->fout_alpha1 = UNKNOWN;
7391     isfp->fout_progname[0] = '\0';
7392     isfp->fout_descr1 = isfp->fout_descr2 = NULL;
7393 
7394     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
7395       if (mystreq(line, 'Q', "QUERY=") ||
7396           (mystreq(line, 'B', "BLAST") &&
7397            ((ch = toupper(line[6])) == 'N' || ch == 'P' || ch == 'X')))
7398         break;
7399     }
7400 
7401     if (status == STATUS_OK && toupper(line[0]) == 'B') {
7402       memcpy(isfp->fout_progname, line, 6);
7403       isfp->fout_progname[6] = '\0';
7404       ch = toupper(line[5]);
7405       if (ch == 'N')
7406         isfp->fout_alpha1 = DNA;
7407       else if (ch == 'P' || ch == 'X')
7408         isfp->fout_alpha1 = PROTEIN;
7409 
7410       while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
7411         if (mystreq(line, 'Q', "QUERY="))
7412           break;
7413     }
7414     else if (status == STATUS_OK)
7415       strcpy(isfp->fout_progname, "BLASTN/BLASTP/BLASTX");
7416 
7417     if (status == STATUS_OK) {
7418       for (s=end-1; s >= line && isspace(*s); s--) ;
7419       if (s - 4 >= line && strncmp(s-4, "bases", 5) == 0) {
7420         for (s-=5; s >= line && isspace(*s); s--) ;
7421         for ( ; s >= line && isdigit(*s); s--) ;
7422         for ( ; s >= line && isspace(*s); s--) ;
7423         if (s >= line && *s == ',')
7424           end = s;
7425       }
7426 
7427       for (t=line+6; t < end && isspace(*t); t++) ;
7428       if (t < s)
7429         isfp->fout_descr1 = mystrdup2(t, s);
7430 
7431       status = fp_get_line(isfp, &line, &end);
7432     }
7433 
7434     if (status == STATUS_OK) {
7435       for (s=line; s < end && *s != '('; s++) ;
7436       if (s < end)
7437         isfp->fout_len1 = myatoi(s+1, 10, '0');
7438 
7439       while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
7440         if (line[0] == '>')
7441           break;
7442     }
7443 
7444     isfp->entry_seqno = isfp->entry_numseqs = 0;
7445 
7446     switch (status) {
7447     case STATUS_OK:     break;
7448     case STATUS_EOF:    raise_error(E_PARSEERROR, return STATUS_ERROR,
7449                                     print_error("%s, entry %d:  Premature EOF "
7450                                                 "reached.\n", isfp->filename,
7451                                                 isfp->entry_count));
7452     case STATUS_ERROR:  return STATUS_ERROR;
7453     case STATUS_FATAL:  return STATUS_FATAL;
7454     default:
7455       status_error(return STATUS_ERROR, "blastout_read");
7456     }
7457 
7458     isfp->fp_current = line;
7459   }
7460 
7461   if (!flag && isfp->entry_seqno < isfp->entry_numseqs) {
7462     isfp->entry_seqno++;
7463     if (isfp->entry_truelen > 0)
7464       isfp->iflag_truelen = 1;
7465     if (isfp->entry_rawlen > 0)
7466       isfp->iflag_rawlen = 1;
7467 
7468     return STATUS_OK;
7469   }
7470 
7471   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
7472   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
7473   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
7474   isfp->entry_count++;
7475   isfp->entry_seqno = isfp->entry_numseqs = 0;
7476 
7477   status = fp_get_line(isfp, &line, &end);
7478   while (status == STATUS_OK && line == end)
7479     status = fp_get_line(isfp, &line, &end);
7480 
7481   /*
7482    * Check for the end of the input.
7483    *
7484    * The line here must either be the beginning of a sequence's matches
7485    * (and so start with '>') or it must be in the middle of a sequence's
7486    * matches (which is signalled by the strings "  Plus Strand HSPs:",
7487    * "  Minus Strand HSPs:" and " Score =".
7488    */
7489   if (status == STATUS_OK) {
7490     if (line[0] == '>')
7491       ;
7492     else {
7493       for (s=line; s < end && isspace(*s); s++) ;
7494       if (s == end || (!mystreq(s, 'P', "PLUS") && !mystreq(s, 'M', "MINUS") &&
7495                        !mystreq(s, 'S', "SCORE")))
7496         return STATUS_EOF;
7497     }
7498   }
7499 
7500   /*
7501    * If at the first of a sequence's matches, read the header to get
7502    * the sequence's oneline description and length.
7503    */
7504   if (status == STATUS_OK && line[0] == '>') {
7505     isfp->fp_entrystart = line;
7506     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
7507       for (s=line; s < end && isspace(*s); s++) ;
7508       error_test(s == end, E_PARSEERROR, return STATUS_ERROR,
7509                  print_error("%s, entry %d:  Invalid format of BLAST-output "
7510                              "entry.\n", isfp->filename, isfp->entry_count));
7511       if (mystreq(s, 'L', "LENGTH =")) {
7512         isfp->fout_len2 = myatoi(s+8, 10, '0');
7513         break;
7514       }
7515     }
7516 
7517     if (status == STATUS_OK) {
7518       isfp->fout_descr2 = mystrdup2(isfp->fp_entrystart, line);
7519       for (s=t=isfp->fout_descr2; *s; ) {
7520         while (*s && *s != '\n') {
7521           if (s != t)
7522             *t++ = *s;
7523           s++;
7524         }
7525 
7526         if (*s == '\n') {
7527           while (*s && isspace(*s)) s++;
7528           if (*s)
7529             *t++ = ' ';
7530         }
7531       }
7532       if (s != t)
7533         *t = '\0';
7534 
7535       if ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
7536         status = fp_get_line(isfp, &line, &end);
7537     }
7538   }
7539 
7540   /*
7541    * Advance to the beginning of the alignment lines.
7542    */
7543   if (status == STATUS_OK) {
7544     isfp->fp_entrystart = line;
7545 
7546     while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK)
7547       if (mystreq(line, 'Q', "QUERY:"))
7548         break;
7549   }
7550 
7551   /*
7552    * Scan to the end of the alignment lines.
7553    */
7554   if (status == STATUS_OK) {
7555     isfp->fp_seqstart = line;
7556 
7557     count = 0;
7558     while (status == STATUS_OK) {
7559       if (count == 4) {
7560         if (!mystreq(line, 'Q', "QUERY:"))
7561           break;
7562         count = 0;
7563       }
7564 
7565       count++;
7566       status = fp_get_line(isfp, &line, &end);
7567     }
7568   }
7569 
7570   switch (status) {
7571   case STATUS_OK:     break;
7572   case STATUS_EOF:    raise_error(E_PARSEERROR, return STATUS_ERROR,
7573                                   print_error("%s, entry %d:  Premature EOF "
7574                                               "reached.\n", isfp->filename,
7575                                               isfp->entry_count));
7576   case STATUS_ERROR:  return STATUS_ERROR;
7577   case STATUS_FATAL:  return STATUS_FATAL;
7578   default:
7579     status_error(return STATUS_ERROR, "blastout_read");
7580   }
7581 
7582   isfp->fp_entryend = isfp->fp_current = line;
7583   isfp->entry_seqno = 1;
7584   isfp->entry_numseqs = 2;
7585 
7586   return STATUS_OK;
7587 }
7588 
7589 
7590 
/*
 * blastout_getseq     (BLAST output formats)
 *
 * Extracts one of the current entry's two sequences from the alignment
 * lines lying between fp_seqstart and fp_entryend.  Those lines are
 * processed in groups of four:  sequence 1 is taken from the first line
 * of each group and sequence 2 (entry_seqno == 2) from the third.  The
 * first six characters of each sequence line are skipped.
 *
 * Parameters:  isfp        -  an INTSEQFILE structure whose current entry
 *                             was set up by the read function
 *              rawseqflag  -  GETSEQ_SEQUENCE to store only the alphabetic
 *                             characters, GETSEQ_RAWSEQ to store every
 *                             character that is not whitespace or a digit,
 *                             GETSEQ_LENGTHS to only count both kinds
 *
 * Return: a STATUS value
 */
static int blastout_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  char *s, *end, *seq, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  s = isfp->fp_seqstart;
  end = isfp->fp_entryend;

  /*
   * Reallocate the sequence buffer, if necessary.
   *
   * The buffer is sized from the whole span of alignment text, since
   * nothing guarantees that the sequence lines make up only a fraction
   * of that text (the other lines of a group may be empty).
   */
  if (rawseqflag != GETSEQ_LENGTHS && end - s + 1 >= isfp->seqsize) {
    isfp->seqsize += end - s + 1;
    newseq = (char *) realloc(isfp->seq, isfp->seqsize);
    if (newseq == NULL) {
      free(isfp->seq);       /* realloc left the old buffer allocated */
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.
   *
   * For the second sequence, first skip the two lines preceding its
   * line in the first group.
   */
  if (isfp->entry_seqno == 2) {
    while (s < end && *s != '\n') s++;
    for (s++; s < end && *s != '\n'; s++) ;
    s++;
  }

  while (s < end) {
    switch (rawseqflag) {
    case GETSEQ_SEQUENCE:
      for (s+=6; s < end && *s != '\n'; s++) {
        if (isalpha((unsigned char) *s))
          *seq++ = *s;
      }
      break;

    case GETSEQ_RAWSEQ:
      for (s+=6; s < end && *s != '\n'; s++) {
        if (!isspace((unsigned char) *s) && !isdigit((unsigned char) *s))
          *seq++ = *s;
      }
      break;

    case GETSEQ_LENGTHS:
      for (s+=6; s < end && *s != '\n'; s++) {
        if (!isspace((unsigned char) *s) && !isdigit((unsigned char) *s)) {
          isfp->entry_rawlen++;
          if (isalpha((unsigned char) *s))
            isfp->entry_truelen++;
        }
      }
      break;
    }

    /*
     * Skip the other three lines, to reach this sequence's line in the
     * next group.
     */
    for (s++; s < end && *s != '\n'; s++) ;
    for (s++; s < end && *s != '\n'; s++) ;
    for (s++; s < end && *s != '\n'; s++) ;
    s++;
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform checks on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
7704 
7705 
7706 
/*
 * phylip_option_line
 *
 * Tests whether a line continues with the remainder of an option-line
 * keyword.  Starting at the line's second character, each character must
 * either be a blank or match (ignoring case) the corresponding character
 * of `rest'.
 *
 * Parameters:  line, end  -  the beginning and end of the line
 *              rest       -  the keyword without its first letter, in
 *                            upper case
 *
 * Return: non-zero if all of `rest' was matched, zero otherwise
 */
static int phylip_option_line(char *line, char *end, const char *rest)
{
  char *s;

  for (s=line+1; s < end && *rest; s++,rest++)
    if (*s != ' ' && toupper((unsigned char) *s) != *rest)
      break;

  return (*rest == '\0');
}


/*
 * phylip_read     (PHYLIP Interleaved and Sequential formats)
 *
 * Reads the next PHYLIP entry.  The first non-blank line of an entry
 * must give the number of sequences and the sequence length, optionally
 * followed by option letters (A, C, F, M, U and W are looked for).
 *
 * The lines after it are checked against both layouts at once:  the
 * "seq_" variables track a sequential reading of the lines and the
 * "int_" variables an interleaved reading.  A reading is abandoned as
 * soon as a line does not fit it, and it succeeds when `numseqs'
 * sequences of `seqlen' characters each have been seen.  isfp->format
 * is then set to the layout that was found, with a warning if it is not
 * the layout the file was opened as.
 *
 * An entry holds several sequences, so when `flag' is zero and sequences
 * of the current entry remain, the call only advances entry_seqno.
 *
 * Parameters:  isfp  -  an opened INTSEQFILE structure
 *              flag  -  if non-zero, always read a new entry, even when
 *                       sequences of the current entry are still unread
 *
 * Return: a STATUS value
 */
static int phylip_read(INTSEQFILE *isfp, int flag)
{
  int count, status, result, numseqs, seqlen, int_seqpos, int_seqnum;
  int int_flag, int_runflag, int_lcount, seq_seqpos, seq_seqnum;
  int seq_flag, seq_runflag, init_flag, len, num_trees, multi_flag;
  int state, flagcount, aflag, cflag, fflag, mflag, uflag, wflag;
  char ch, *s, *line, *end, *int_entryend, *seq_entryend, *seqstart;
  const char *rest;

  if (isfp->entry_count == 0) {
    isfp->entry_seqno = isfp->entry_numseqs = 0;
    isfp->phylip_origfmt = isfp->format;
  }

  if (!flag && isfp->entry_seqno < isfp->entry_numseqs) {
    isfp->entry_seqno++;
    if (isfp->entry_rawlen > 0)
      isfp->iflag_rawlen = 1;
    return STATUS_OK;
  }

  isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
  isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
  isfp->iflag_truelen = isfp->iflag_rawlen = 0;
  isfp->entry_count++;
  isfp->entry_seqno = isfp->entry_numseqs = 0;

  /*
   * Find the first non-blank line, which must begin with a number.
   */
  s = NULL;
  while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
    for (s=line; s < end && isspace((unsigned char) *s); s++) ;
    if (s == end)
      continue;
    else if (isdigit((unsigned char) *s))
      break;
    else
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid first line of PHYLIP "
                              "entry.\n", isfp->filename, isfp->entry_count));
  }
  switch (status) {
  case STATUS_OK:     break;
  case STATUS_EOF:    error_test(isfp->entry_count == 1,
                                 E_PARSEERROR, return STATUS_ERROR,
                                 print_error("%s, entry %d:  File contains no "
                                             "entries.\n", isfp->filename,
                                             isfp->entry_count));
                      return STATUS_EOF;
  case STATUS_ERROR:  return STATUS_ERROR;
  case STATUS_FATAL:  return STATUS_FATAL;
  default:            status_error(return STATUS_ERROR, "phylip_read");
  }

  /*
   * Parse the number of sequences and the sequence length.
   */
  isfp->fp_entrystart = line;
  numseqs = *s - '0';
  for (s++; s < end && isdigit((unsigned char) *s); s++) {
    numseqs *= 10;
    numseqs += *s - '0';
  }
  while (s < end && isspace((unsigned char) *s)) s++;
  error_test(s == end || !isdigit((unsigned char) *s),
             E_PARSEERROR, return STATUS_ERROR,
             print_error("%s, entry %d:  Invalid first line of PHYLIP "
                         "entry.\n", isfp->filename, isfp->entry_count));
  seqlen = *s - '0';
  for (s++; s < end && isdigit((unsigned char) *s); s++) {
    seqlen *= 10;
    seqlen += *s - '0';
  }

  /*
   * Note the option letters on the rest of the line.  `flagcount' is
   * non-zero if any option other than U was given; those are the
   * options whose lines are looked for below.
   */
  flagcount = aflag = cflag = fflag = mflag = uflag = wflag = 0;
  for ( ; s < end; s++) {
    switch (*s) {
    case 'a':  case 'A':  aflag = 1; flagcount++; break;
    case 'c':  case 'C':  cflag = 1; flagcount++; break;
    case 'f':  case 'F':  fflag = 1; flagcount++; break;
    case 'm':  case 'M':  mflag = 1; flagcount++; break;
    case 'u':  case 'U':  uflag = 1; break;
    case 'w':  case 'W':  wflag = 1; flagcount++; break;
    }
  }

  int_seqnum = int_seqpos = int_lcount = 0;
  int_runflag = int_flag = 1;
  int_entryend = NULL;

  seq_seqnum = seq_seqpos = 0;
  seq_runflag = seq_flag = 1;
  seq_entryend = NULL;

  init_flag = 1;
  multi_flag = 0;
  seqstart = NULL;
  while ((int_runflag || seq_runflag) &&
         (status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
    for (s=line; s < end && isspace((unsigned char) *s); s++) ;
    if (s == end)
      continue;

    /*
     * Skip the option lines that may precede the sequences.  The first
     * line that is not an option line ends the search for them.
     */
    if (flagcount) {
      ch = toupper((unsigned char) line[0]);
      if (aflag && ch == 'A')
        rest = "NCESTOR  ";
      else if (cflag && ch == 'C')
        rest = "ATEGORIES";
      else if (fflag && ch == 'F')
        rest = "ACTORS   ";
      else if (mflag && ch == 'M')
        rest = "IXTURE   ";
      else if (wflag && ch == 'W')
        rest = "EIGHTS   ";
      else
        rest = NULL;

      if (rest != NULL && phylip_option_line(line, end, rest))
        continue;
      flagcount = 0;
    }

    len = s - line;

    if (init_flag) {
      seqstart = line;
      init_flag = 0;
    }

    /*
     * The sequential reading:  each sequence begins on a line whose
     * first 10 characters are skipped, and continues on the following
     * lines until `seqlen' characters have been counted.
     */
    if (seq_runflag) {
      if (seq_flag) {
        if (end - line < 10 || len >= 10)
          seq_runflag = 0;
        else {
          s = line + 10;
          seq_flag = 0;
        }
      }
      else {
        s = line;
        multi_flag = 1;     /* a sequence continues past its first line */
      }

      if (seq_runflag) {
        for ( ; s < end; s++)
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s)))
            seq_seqpos++;

        if (seq_seqpos > seqlen)
          seq_runflag = 0;
        else if (seq_seqpos == seqlen) {
          seq_seqnum++;
          if (seq_seqnum == numseqs) {
            seq_entryend = end + 1;
            seq_runflag = 0;
          }
          else {
            seq_seqpos = 0;
            seq_flag = 1;
          }
        }
      }
    }

    /*
     * The interleaved reading:  the lines come in blocks of `numseqs'
     * lines, every line of a block must hold the same number of
     * characters, and only the lines of the first block have their
     * first 10 characters skipped.
     */
    if (int_runflag) {
      if (int_flag) {
        if (end - line < 10 || len >= 10)
          int_runflag = 0;
        else
          s = line + 10;
      }
      else {
        s = line;
        multi_flag = 1;     /* the sequences continue past the first block */
      }

      if (int_runflag) {
        for (count=0; s < end; s++)
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s)))
            count++;

        if (int_seqnum == 0) {
          int_lcount = count;
          int_seqpos += count;
          if (int_seqpos > seqlen)
            int_runflag = 0;
          int_seqnum++;
        }
        else {
          if (count != int_lcount)
            int_runflag = 0;
          else if (++int_seqnum == numseqs) {
            if (int_seqpos == seqlen) {
              int_entryend = end + 1;
              int_runflag = 0;
            }
            else {
              int_seqnum = 0;
              int_lcount = 0;
              int_flag = 0;
            }
          }
        }
      }
    }
  }
  switch (status) {
  case STATUS_OK:     break;
  case STATUS_EOF:    raise_error(E_PARSEERROR, return STATUS_ERROR,
                        print_error("%s, entry %d:  Premature EOF reached.\n",
                                    isfp->filename, isfp->entry_count));
  case STATUS_ERROR:  return STATUS_ERROR;
  case STATUS_FATAL:  return STATUS_FATAL;
  default:            status_error(return STATUS_ERROR, "phylip_read");
  }

  error_test(!seq_entryend && !int_entryend, E_PARSEERROR, return STATUS_ERROR,
             print_error("%s, entry %d:  Entry text does not match PHYLIP "
                         "format.\n", isfp->filename, isfp->entry_count));
  error_test(multi_flag && seq_entryend && int_entryend,
             E_PARSEERROR, return STATUS_ERROR,
             print_error("%s, entry %d:  Ambiguous entry, may be interleaved "
                         "or sequential.\n", isfp->filename,
                         isfp->entry_count));

  /*
   * Select the layout.  The value to return is kept in `result', apart
   * from `status', so that a warning raised here is not lost when the
   * user trees are read below.
   */
  result = STATUS_OK;
  if (seq_entryend && (multi_flag || isfp->phylip_origfmt != FORMAT_PHYINT)) {
    if (multi_flag && isfp->phylip_origfmt == FORMAT_PHYINT) {
      set_error(E_INVFORMAT);
      print_warning("Warning:  %s, entry %d:  PHYLIP Interleaved format "
                    "specified, but Sequential format found.\n",
                    isfp->filename, isfp->entry_count);
      result = STATUS_WARNING;
    }
    isfp->format = FORMAT_PHYSEQ;
    isfp->fp_entryend = seq_entryend;
  }
  else {
    if (isfp->phylip_origfmt == FORMAT_PHYSEQ) {
      set_error(E_INVFORMAT);
      print_warning("Warning:  %s, entry %d:  PHYLIP Sequential format "
                    "specified, but Interleaved format found.\n",
                    isfp->filename, isfp->entry_count);
      result = STATUS_WARNING;
    }
    isfp->format = FORMAT_PHYINT;
    isfp->fp_entryend = int_entryend;
  }

  /*
   * With the U option, the entry continues with a line giving the
   * number of user trees and then that many trees.  A tree is taken to
   * end on each line containing a ';'.
   */
  if (uflag) {
    isfp->fp_entryend = NULL;
    state = num_trees = 0;
    while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
      if (!state) {
        for (s=line; s < end && isspace((unsigned char) *s); s++) ;
        if (s == end)
          continue;
        error_test(!isdigit((unsigned char) *s),
                   E_PARSEERROR, return STATUS_ERROR,
                   print_error("%s, entry %d:  Invalid user tree section of "
                               "PHYLIP entry.\n", isfp->filename,
                               isfp->entry_count));
        num_trees = *s - '0';
        for (s++; s < end && isdigit((unsigned char) *s); s++) {
          num_trees *= 10;
          num_trees += *s - '0';
        }
        state = 1;
      }
      else {
        for (s=line; s < end && *s != ';'; s++) ;
        if (s == end)
          continue;
        if (--num_trees == 0) {
          isfp->fp_entryend = end + 1;
          break;
        }
      }
    }
    switch (status) {
    case STATUS_OK:     break;
    case STATUS_EOF:    raise_error(E_PARSEERROR, return STATUS_ERROR,
                         print_error("%s, entry %d:  Premature EOF reached.\n",
                                     isfp->filename, isfp->entry_count));
    case STATUS_ERROR:  return STATUS_ERROR;
    case STATUS_FATAL:  return STATUS_FATAL;
    default:            status_error(return STATUS_ERROR, "phylip_read");
    }
  }

  isfp->fp_seqstart = seqstart;
  isfp->malign_seqno = 1;

  isfp->entry_seqno = 1;
  isfp->entry_numseqs = numseqs;
  isfp->entry_rawlen = isfp->entry_seqlen = seqlen;
  isfp->iflag_rawlen = 1;
  return result;
}
8030 
8031 
/*
 * phyint_getseq     (PHYLIP Interleaved format)
 *
 * Extracts sequence number entry_seqno of the current entry.  The
 * entry's non-blank lines are taken in blocks of entry_numseqs lines,
 * and the sequence is gathered from its line in each block.  The first
 * 10 characters of the lines of the first block are skipped.
 *
 * fp_seqstart and malign_seqno are advanced, one line at a time, until
 * they refer to the requested sequence's line in the first block.
 *
 * Parameters:  isfp        -  an INTSEQFILE structure whose current entry
 *                             was set up by the read function
 *              rawseqflag  -  GETSEQ_SEQUENCE to store only the alphabetic
 *                             characters, GETSEQ_RAWSEQ to store every
 *                             character that is not whitespace or a digit,
 *                             GETSEQ_LENGTHS to only count both kinds
 *
 * Return: a STATUS value
 */
static int phyint_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int seqpos, seqnum, flag;
  char *s, *end, *seq, *line, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  end = isfp->fp_entryend;

  /*
   * Move past one non-blank line per iteration, never scanning beyond
   * the end of the entry.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    s = isfp->fp_seqstart;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && *s != '\n') s++;
    isfp->fp_seqstart = (s < end ? s + 1 : end);
    isfp->malign_seqno++;
  }

  /*
   * Reallocate the sequence buffer, if necessary.
   */
  if (rawseqflag != GETSEQ_LENGTHS && isfp->entry_seqlen >= isfp->seqsize) {
    isfp->seqsize += isfp->entry_seqlen;
    newseq = (char *) realloc(isfp->seq, isfp->seqsize);
    if (newseq == NULL) {
      free(isfp->seq);       /* realloc left the old buffer allocated */
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.
   *
   * `seqnum' is the number of the sequence that the current line
   * belongs to, and `flag' is non-zero while in the first block.
   */
  s = isfp->fp_seqstart;

  seqpos = 0;
  seqnum = isfp->malign_seqno;
  flag = 1;

  while (s < end) {
    for (line=s; s < end && isspace((unsigned char) *s) && *s != '\n'; s++) ;
    if (s == end)
      break;
    else if (*s == '\n') {
      s++;
      continue;
    }

    if (flag)
      s = line + 10;

    if (seqnum == isfp->malign_seqno) {
      switch (rawseqflag) {
      case GETSEQ_SEQUENCE:
        for ( ; s < end && *s != '\n'; s++) {
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
            seqpos++;
            if (isalpha((unsigned char) *s))
              *seq++ = *s;
          }
        }
        break;

      case GETSEQ_RAWSEQ:
        for ( ; s < end && *s != '\n'; s++) {
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
            seqpos++;
            *seq++ = *s;
          }
        }
        break;

      case GETSEQ_LENGTHS:
        for ( ; s < end && *s != '\n'; s++) {
          if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
            seqpos++;
            isfp->entry_rawlen++;
            if (isalpha((unsigned char) *s))
              isfp->entry_truelen++;
          }
        }
        break;
      }
      if (seqpos == isfp->entry_seqlen)
        break;
    }
    else
      for ( ; s < end && *s != '\n'; s++) ;

    /*
     * Go on to the next line, wrapping around to sequence 1 after the
     * last line of a block.
     */
    s++;
    if (seqnum++ == isfp->entry_numseqs) {
      seqnum = 1;
      flag = 0;
    }
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform checks on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
    if (rawseqflag && isfp->entry_seqlen > 0 &&
        isfp->entry_seqlen != isfp->seqlen) {
      set_error(E_DIFFLENGTH);
      print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
                    "but %d characters found.\n", isfp->filename,
                    isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
      return STATUS_WARNING;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
8183 
/*
 * physeq_getseq     (PHYLIP Sequential format)
 *
 * Extracts sequence number entry_seqno of the current entry.  Each
 * sequence starts on a line whose first 10 characters are skipped, and
 * runs on until entry_seqlen characters that are neither whitespace nor
 * digits have been seen.
 *
 * fp_seqstart and malign_seqno are advanced, one sequence at a time,
 * until they refer to the requested sequence.
 *
 * Parameters:  isfp        -  an INTSEQFILE structure whose current entry
 *                             was set up by the read function
 *              rawseqflag  -  GETSEQ_SEQUENCE to store only the alphabetic
 *                             characters, GETSEQ_RAWSEQ to store every
 *                             character that is not whitespace or a digit,
 *                             GETSEQ_LENGTHS to only count both kinds
 *
 * Return: a STATUS value
 */
static int physeq_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int seqlen, seqpos;
  char *s, *end, *seq, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  end = isfp->fp_entryend;
  seqlen = isfp->entry_seqlen;

  /*
   * Move past one whole sequence per iteration, then past the rest of
   * its last line, never scanning beyond the end of the entry.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    s = isfp->fp_seqstart+10;
    for (seqpos=0; s < end && seqpos < seqlen; s++)
      if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s)))
        seqpos++;
    for ( ; s < end && *s != '\n'; s++) ;
    isfp->fp_seqstart = (s < end ? s + 1 : end);
    isfp->malign_seqno++;
  }

  /*
   * Reallocate the sequence buffer, if necessary.
   */
  if (rawseqflag != GETSEQ_LENGTHS && isfp->entry_seqlen >= isfp->seqsize) {
    isfp->seqsize += isfp->entry_seqlen;
    newseq = (char *) realloc(isfp->seq, isfp->seqsize);
    if (newseq == NULL) {
      free(isfp->seq);       /* realloc left the old buffer allocated */
      isfp->seq = NULL;
      isfp->seqsize = 0;
      memory_error(1, return STATUS_FATAL);
    }
    isfp->seq = newseq;
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.
   */
  s = isfp->fp_seqstart + 10;

  switch (rawseqflag) {
  case GETSEQ_SEQUENCE:
    for (seqpos=0; s < end && seqpos < seqlen; s++) {
      if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
        seqpos++;
        if (isalpha((unsigned char) *s))
          *seq++ = *s;
      }
    }
    break;

  case GETSEQ_RAWSEQ:
    for (seqpos=0; s < end && seqpos < seqlen; s++) {
      if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
        seqpos++;
        *seq++ = *s;
      }
    }
    break;

  case GETSEQ_LENGTHS:
    for (seqpos=0; s < end && seqpos < seqlen; s++) {
      if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s))) {
        seqpos++;
        isfp->entry_rawlen++;
        if (isalpha((unsigned char) *s))
          isfp->entry_truelen++;
      }
    }
    break;
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform checks on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
    if (rawseqflag && isfp->entry_seqlen > 0 &&
        isfp->entry_seqlen != isfp->seqlen) {
      set_error(E_DIFFLENGTH);
      print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
                    "but %d characters found.\n", isfp->filename,
                    isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
      return STATUS_WARNING;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
8312 
8313 
8314 
/*
 * clustal_read   (CLUSTALW file format)
 *
 * On the first call (entry_count is 0), the whole file is read as one
 * entry, the first sequence line is located and the number of lines in
 * the first alignment block is taken as the number of sequences.  On
 * every later call, the reader only steps to the next sequence of that
 * same entry.
 *
 * Parameters:  isfp  -  an opened INTSEQFILE structure
 *              flag  -  if non-zero, calls after the first one do not
 *                       step to the next sequence; they return
 *                       STATUS_EOF
 *
 * Return: a STATUS value
 */
static int clustal_read(INTSEQFILE *isfp, int flag)
{
  int status, nseqs;
  char *p, *limit;

  /*
   * Calls after the first one:  the entry has already been loaded, so
   * either move on to its next sequence or report that nothing is left.
   */
  if (isfp->entry_count != 0) {
    if (flag || isfp->entry_seqno >= isfp->entry_numseqs)
      return STATUS_EOF;

    isfp->entry_seqno++;
    if (isfp->entry_rawlen > 0)
      isfp->iflag_rawlen = 1;
    return STATUS_OK;
  }

  /*
   * First call:  reset the per-entry bookkeeping.
   */
  isfp->entry_count++;
  isfp->entry_rawlen = 0;
  isfp->entry_truelen = 0;
  isfp->entry_seqlen = 0;
  isfp->iflag_rawlen = 0;
  isfp->iflag_truelen = 0;
  isfp->entry_numseqs = 0;
  isfp->entry_seqno = 0;

  isfp->fp_entrystart = isfp->fp_current;
  isfp->fp_entryend = NULL;
  isfp->fp_seqstart = NULL;

  /*
   * The file should contain only a single entry, so just read the
   * whole file.
   */
  status = fp_read_all(isfp);
  if (status == STATUS_EOF) {
    raise_error(E_PARSEERROR, return STATUS_ERROR,
                print_error("%s:  Empty file.\n", isfp->filename));
    return STATUS_ERROR;
  }
  if (status == STATUS_ERROR)
    return STATUS_ERROR;
  if (status == STATUS_FATAL)
    return STATUS_FATAL;
  if (status != STATUS_OK) {
    status_error(return STATUS_ERROR, "clustal_read");
  }

  isfp->fp_entryend = isfp->fp_top;

  /*
   * Skip the first line, then move to the first newline that is
   * followed by a non-space character.  The first sequence line begins
   * right after that newline.
   */
  limit = isfp->fp_entryend;
  p = isfp->fp_entrystart;
  while (p < limit && *p != '\n')
    p++;
  while (p < limit && !(*p == '\n' && !isspace(p[1])))
    p++;
  isfp->fp_seqstart = p + 1;

  /*
   * Find out how many sequences occur in the file:  count the newlines
   * seen up to (but not including) the first newline that is followed
   * by a space character.
   */
  nseqs = 0;
  while (p < limit && !(*p == '\n' && isspace(p[1]))) {
    if (*p == '\n')
      nseqs++;
    p++;
  }

  error_test(p == limit || nseqs == 0, E_PARSEERROR, return STATUS_ERROR,
             print_error("%s, entry %d:  Invalid format of CLUSTALW "
                         "entry.\n", isfp->filename, isfp->entry_count));

  isfp->entry_seqno = 1;
  isfp->malign_seqno = 1;
  isfp->entry_numseqs = nseqs;
  return STATUS_OK;
}
8374 
8375 
/*
 * clustal_getseq   (CLUSTALW file format)
 *
 * Collects the pieces of sequence number isfp->entry_seqno from every
 * alignment block of the entry found by clustal_read.  Each sequence
 * line is assumed to start with a 15 column name field; everything
 * after that field, up to the end of the line, is sequence text.
 *
 * If isfp->malign_seqno is behind isfp->entry_seqno, fp_seqstart and
 * malign_seqno are first advanced, one line per sequence, to the line
 * of the requested sequence in the first block.
 *
 * The sequence buffer is first sized from the original estimate
 * (entry size divided by the number of sequences plus one) and is then
 * grown, line by line, whenever a line could not fit.  The estimate
 * alone is too small for files with short or missing conservation
 * lines, so the writes must never rely on it.
 *
 * Parameters:  isfp        -  an opened INTSEQFILE structure
 *              rawseqflag  -  GETSEQ_SEQUENCE to store only alphabetic
 *                             characters, GETSEQ_RAWSEQ to store every
 *                             character that is not a space or digit,
 *                             GETSEQ_LENGTHS to only compute
 *                             entry_truelen and entry_rawlen
 *
 * Return: a STATUS value
 */
static int clustal_getseq(INTSEQFILE *isfp, int rawseqflag)
{
  int i, count, seqsize, used, needed;
  char *s, *end, *eol, *seq, *newseq;

  program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
                print_error("   fp_entryend not set by %s's read function\n",
                            file_table[isfp->format].ident));

  if (isfp->fp_seqstart == NULL) {
    if (rawseqflag == GETSEQ_LENGTHS) {
      isfp->entry_rawlen = isfp->entry_truelen = 0;
      isfp->iflag_rawlen = isfp->iflag_truelen = 1;
      return STATUS_OK;
    }
    else {
      isfp->seqlen = 0;
      isfp->seq[0] = '\0';
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }
  }

  s = isfp->fp_seqstart;
  end = isfp->fp_entryend;

  /*
   * Move down to the line of the requested sequence.  Never step
   * past the end of the entry.
   */
  if (isfp->malign_seqno < isfp->entry_seqno) {
    while (isfp->malign_seqno < isfp->entry_seqno) {
      for ( ; s < end && *s != '\n'; s++) ;
      if (s < end)
        s++;
      isfp->malign_seqno++;
    }
    isfp->fp_seqstart = s;
  }

  /*
   * Reallocate the sequence buffer, if necessary.  The result of
   * realloc goes into a temporary so that the old buffer can be
   * released (instead of leaked) when the call fails.
   */
  if (rawseqflag != GETSEQ_LENGTHS) {
    seqsize = (isfp->fp_entryend - isfp->fp_entrystart) /
              (isfp->entry_numseqs + 1);
    if (seqsize >= isfp->seqsize) {
      newseq = (char *) realloc(isfp->seq, isfp->seqsize + seqsize + 1);
      if (newseq == NULL) {
        free(isfp->seq);
        isfp->seq = NULL;
        isfp->seqsize = 0;
        memory_error(1, return STATUS_FATAL);
      }
      else {
        isfp->seq = newseq;
        isfp->seqsize += seqsize + 1;
      }
    }
  }
  seq = isfp->seq;

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->entry_truelen = isfp->entry_rawlen = 0;

  /*
   * Extract the sequence characters.
   */
  while (s < end) {
    /*
     * We're at the beginning of a line of the sequence.  Skip the
     * 15 column name field (stopping early on a short line so that
     * the scan cannot run into the next line), and find the end of
     * the line.
     */
    for (i=0; i < 15 && s < end && *s != '\n'; i++)
      s++;
    for (eol=s; eol < end && *eol != '\n'; eol++) ;

    /*
     * Every stored character comes from one byte of the line, so
     * room for the rest of the line plus the terminator is always
     * enough.
     */
    if (rawseqflag != GETSEQ_LENGTHS) {
      used = seq - isfp->seq;
      needed = used + (eol - s) + 1;
      if (needed > isfp->seqsize) {
        newseq = (char *) realloc(isfp->seq, isfp->seqsize + needed);
        if (newseq == NULL) {
          free(isfp->seq);
          isfp->seq = NULL;
          isfp->seqsize = 0;
          memory_error(1, return STATUS_FATAL);
        }
        else {
          isfp->seq = newseq;
          isfp->seqsize += needed;
          seq = newseq + used;
        }
      }
    }

    switch (rawseqflag) {
    case GETSEQ_SEQUENCE:
      for ( ; s < eol; s++)
        if (isalpha(*s))
          *seq++ = *s;
      break;

    case GETSEQ_RAWSEQ:
      for ( ; s < eol; s++)
        if (!(isspace(*s) || isdigit(*s)))
          *seq++ = *s;
      break;

    case GETSEQ_LENGTHS:
      for ( ; s < eol; s++) {
        if (!(isspace(*s) || isdigit(*s))) {
          isfp->entry_rawlen++;
          if (isalpha(*s))
            isfp->entry_truelen++;
        }
      }
      break;
    }

    /*
     * Skip to the end of the block, the beginning of the next block,
     * and then count lines down to the next piece of the sequence.
     */
    if (s >= end)
      break;
    for (s++; s < end && (*s != '\n' || !isspace(s[1])); s++) ;
    if (s >= end)
      break;
    for (s++; s < end && (*s != '\n' || isspace(s[1])); s++) ;
    for (count=0; s < end && (*s != '\n' || !isspace(s[1])); s++)
      if (*s == '\n' && ++count == isfp->malign_seqno)
        break;

    /*
     * The scan above stops on the newline in front of the wanted
     * line.  Step over it, so that the name field is measured from
     * the beginning of the line in every block, not only the first.
     */
    if (s < end)
      s++;
  }

  if (rawseqflag == GETSEQ_LENGTHS)
    isfp->iflag_truelen = isfp->iflag_rawlen = 1;
  else {
    *seq = '\0';
    isfp->seqlen = seq - isfp->seq;

    /*
     * Perform a check on the sequence length.
     */
    if (isfp->seqlen == 0) {
      set_error(E_NOSEQ);
      print_error("%s, entry %d:  Entry contains no sequence.\n",
                  isfp->filename, isfp->entry_count);
      return STATUS_ERROR;
    }

    if (rawseqflag == GETSEQ_SEQUENCE) {
      isfp->entry_truelen = isfp->seqlen;
      isfp->iflag_truelen = 1;
    }
    else if (rawseqflag == GETSEQ_RAWSEQ) {
      isfp->entry_rawlen = isfp->seqlen;
      isfp->iflag_rawlen = 1;
    }
  }

  return STATUS_OK;
}
8501 
8502 
8503 /*
8504  * asn_read   (ASN.1 file format)
8505  *
8506  * This parser (which, remember, only extracts the entry)
8507  * assumes the following:
8508  *    1) The file text consists of a hierarchy of records, where a record
8509  *       consists of a text string identifier and then a pair of matching
8510  *       braces bounding the contents of the record.  Consecutive records
8511  *       on the same level of the hierarchy are separated by commas.
8512  *    2) The file can be completely free form, with the identifiers, braces
8513  *       and commas occurring anywhere.  In addition, single-quoted and
8514  *       double-quoted strings can occur anywhere in the record, and
8515  *       braces and commas in those strings are ignored (as far as the
8516  *       hierarchy of records is concerned).
8517  *    3) The entries in the file are the "seq" records.  The beginning
8518  *       of the record is found by first looking for the string "seq {"
8519  *       (where "seq" is the complete identifier) and then searching
8520  *       for the matching close brace.
8521  *    4) The "seq" records can occur anywhere in the hierarchy, except that
8522  *       they cannot be nested, one within another.
8523  *
8524  * Parameters:  isfp  -  an opened INTSEQFILE structure
8525  *
8526  * Return: a STATUS value
8527  */
8528 #define LOOKFOR_BIOSET 0
8529 #define LOOKFOR_SEQSET 1
8530 #define LOOKFOR_SEQ 2
8531 #define INENTRY 3
8532 
asn_read(INTSEQFILE * isfp,int flag)8533 static int asn_read(INTSEQFILE *isfp, int flag)
8534 {
8535   int level, seq_level, status, count, state, oldpe;
8536   char ch, *line, *end, *s, *t, *top, *lenstr, *lenend, *seqstr;
8537 
8538   isfp->fp_entrystart = isfp->fp_seqstart = isfp->fp_entryend = NULL;
8539   isfp->entry_seqlen = isfp->entry_truelen = isfp->entry_rawlen = 0;
8540   isfp->iflag_truelen = isfp->iflag_rawlen = 0;
8541   isfp->entry_count++;
8542   isfp->entry_seqno = isfp->entry_numseqs = 0;
8543 
8544   /*
8545    * Use local copies of fp_current and fp_top throughout this procedure.
8546    */
8547   s = isfp->fp_current;
8548   top = isfp->fp_top;
8549 
8550   /*
8551    * Get the complete sequence entry.
8552    */
8553   state = (isfp->entry_count == 1 ? LOOKFOR_BIOSET : LOOKFOR_SEQ);
8554   seq_level = (isfp->entry_count == 1 ? 2 : 0);
8555   line = s;
8556   level = 0;
8557   while ((status = fp_get_line(isfp, &line, &end)) == STATUS_OK) {
8558     s = line;
8559     while (1) {
8560       while (s < end && isspace(*s)) s++;
8561       if (s == end)
8562         break;
8563 
8564       switch (*s) {
8565       case '"':
8566       case '\'':
8567         ch = *s++;
8568         while (1) {
8569           while (s < end && *s != ch && *(s-1) != '\\') s++;
8570           if (s < end)
8571             break;
8572 
8573           status = fp_get_line(isfp, &line, &end);
8574           switch (status) {
8575           case STATUS_OK:     break;
8576           case STATUS_EOF:    raise_error(E_PARSEERROR, return STATUS_ERROR,
8577                                 print_error("%s, entry %d:  Premature EOF "
8578                                             "reached.\n", isfp->filename,
8579                                             isfp->entry_count));
8580           case STATUS_ERROR:  return STATUS_ERROR;
8581           case STATUS_FATAL:  return STATUS_FATAL;
8582           default:            status_error(return STATUS_ERROR, "asn_read");
8583           }
8584           s = line;
8585         }
8586         s++;
8587         break;
8588 
8589       case '{':
8590         level++;
8591         if (state == LOOKFOR_SEQ && level - 1 == seq_level &&
8592             mystreq(s-4, 'S', "SEQ {")) {
8593           /*
8594            * Set the beginning of the entry mark to either the beginning
8595            * of the line (if "seq {" is the first thing on the line) or
8596            * right at the "seq {" string.
8597            */
8598           for (t=s-5; t > line && (*t == ' ' || *t == '\t'); t--) ;
8599           if (t == line)
8600             isfp->fp_entrystart = t;
8601           else if (*t == '\n')
8602             isfp->fp_entrystart = t + 1;
8603           else
8604             isfp->fp_entrystart = s - 4;
8605 
8606           state = INENTRY;
8607         }
8608         s++;
8609         break;
8610 
8611       case '}':
8612         if (state == INENTRY && level - 1 == seq_level)
8613           goto ASN_LOOP_END;
8614 
8615         level--;
8616         s++;
8617         break;
8618 
8619       default:
8620         if (isalpha(*s)) {
8621           if (state == LOOKFOR_BIOSET && level == 0 &&
8622               mystreq(s, 'B', "BIOSEQ-SET "))
8623             state = LOOKFOR_SEQSET;
8624           else if (state == LOOKFOR_SEQSET && level == 1 &&
8625                    mystreq(s, 'S', "SEQ-SET "))
8626             state = LOOKFOR_SEQ;
8627         }
8628         while (!isspace(*s)) s++;
8629       }
8630     }
8631   }
8632 ASN_LOOP_END:
8633   switch (status) {
8634   case STATUS_OK:     break;
8635   case STATUS_EOF:    if (state != INENTRY) {
8636                         error_test(isfp->entry_count == 1,
8637                                    E_PARSEERROR, return STATUS_ERROR,
8638                                    print_error("%s:  File contains no "
8639                                                "entries.\n", isfp->filename));
8640                         return STATUS_EOF;
8641                       }
8642                       else {
8643                         raise_error(E_PARSEERROR, return STATUS_ERROR,
8644                           print_error("%s, entry %d:  Premature EOF "
8645                                       "reached.\n", isfp->filename,
8646                                       isfp->entry_count));
8647                       }
8648   case STATUS_ERROR:  return STATUS_ERROR;
8649   case STATUS_FATAL:  return STATUS_FATAL;
8650   default:            status_error(return STATUS_ERROR, "asn_read");
8651   }
8652 
8653   isfp->fp_entryend = s + 1;
8654 
8655   /*
8656    * Get the sequence length, if it is there.
8657    */
8658   lenstr = seqstr = NULL;
8659   oldpe = pe_flag;
8660   pe_flag = PE_NONE;
8661   count = asn_parse(isfp->fp_entrystart, isfp->fp_entryend,
8662                     "seq.inst.length", &lenstr, &lenend,
8663                     "seq.inst.seq-data", &seqstr, NULL,
8664                     NULL);
8665   pe_flag = oldpe;
8666   error_test(count == -1, E_PARSEERROR, return STATUS_ERROR,
8667              print_error("%s, entry %d:  Invalid format of ASN.1 entry.\n",
8668                          isfp->filename, isfp->entry_count));
8669 
8670   if (count > 0) {
8671     if (lenstr != NULL) {
8672       isfp->entry_seqlen = myatoi(lenstr + 6, 10, '0');
8673       isfp->entry_rawlen = isfp->entry_truelen = isfp->entry_seqlen;
8674       isfp->iflag_rawlen = isfp->iflag_truelen = 1;
8675 
8676       error_test(isfp->entry_seqlen == 0, E_PARSEERROR, return STATUS_ERROR,
8677                  print_error("%s, entry %d:  Cannot parse `seq.inst.length' ",
8678                              "sub-record.\n", isfp->filename,
8679                              isfp->entry_count));
8680     }
8681 
8682     if (seqstr != NULL) {
8683       isfp->fp_seqstart = seqstr;
8684       isfp->entry_seqno = isfp->entry_numseqs = 1;
8685     }
8686   }
8687 
8688   return STATUS_OK;
8689 }
8690 
8691 
8692 /*
8693  * asn_getseq   (ASN.1 file format)
8694  *
8695  * This parser (which extracts the sequence and its length)
8696  * assumes the following:
8697  *    1) The file text consists of a hierarchy of records, where a record
8698  *       consists of a text string identifier and then a pair of matching
8699  *       braces bounding the contents of the record (except for simple
8700  *       records which contain only a string or a number).  Consecutive
8701  *       records on the same level of the hierarchy are separated by commas.
8702  *    2) The file can be completely free form, with the identifiers, braces
8703  *       and commas occurring anywhere.  In addition, single-quoted and
8704  *       double-quoted strings can occur anywhere in the record, and
8705  *       braces and commas in those strings are ignored (as far as the
8706  *       hierarchy of records is concerned.
8707  *    3) Inside the entry, the sequence and its length are given in the
8708  *       seq.inst.seq-data and seq.inst.length (where "a.b.c" describes
8709  *       a portion of the record hierarchy).  The length record consists
8710  *       of a single number, and the seq-data record consists of a
8711  *       string naming the format and then a double-quoted string giving
8712  *       the sequence characters (formatted appropriately).
8713  *    4) For the sequence to be retrieved, the format of the seq-data
8714  *       record must be one of the following:  iupacna, iupacaa, ncbi2na.
8715  *
8716  * Parameters:  isfp        -  an opened INTSEQFILE structure
8717  *              rawseqflag  -  should the actual sequence or the raw
8718  *                             sequence text be extracted.
8719  *
8720  * Return: a STATUS value
8721  */
8722 #define IUPAC -1
8723 #define DNA2 -2
8724 #define DNA4 -3
8725 
asn_getseq(INTSEQFILE * isfp,int rawseqflag)8726 static int asn_getseq(INTSEQFILE *isfp, int rawseqflag)
8727 {
8728   int size, count, state, oldpe;
8729   char ch, *t, *seq, *seqstr, *seqend, buf[8];
8730 
8731   program_error(isfp->fp_entryend == NULL, return STATUS_ERROR,
8732                 print_error("   fp_entryend not set by %s's read function\n",
8733                             file_table[isfp->format].ident));
8734 
8735   if (isfp->fp_seqstart == NULL) {
8736     if (rawseqflag == GETSEQ_LENGTHS) {
8737       isfp->entry_rawlen = isfp->entry_truelen = 0;
8738       isfp->iflag_rawlen = isfp->iflag_truelen = 1;
8739       return STATUS_OK;
8740     }
8741     else {
8742       isfp->seqlen = 0;
8743       isfp->seq[0] = '\0';
8744       set_error(E_NOSEQ);
8745       print_error("%s, entry %d:  Entry contains no sequence.\n",
8746                   isfp->filename, isfp->entry_count);
8747       return STATUS_ERROR;
8748     }
8749   }
8750 
8751   seqstr = NULL;
8752   oldpe = pe_flag;
8753   pe_flag = PE_NONE;
8754   count = asn_parse(isfp->fp_seqstart, isfp->fp_entryend,
8755                     "seq-data", &seqstr, &seqend,
8756                     NULL);
8757   pe_flag = oldpe;
8758   error_test(count == -1, E_PARSEERROR, return STATUS_ERROR,
8759              print_error("%s, entry %d:  Invalid format of ASN.1 entry.\n",
8760                          isfp->filename, isfp->entry_count));
8761 
8762   if (seqstr == NULL) {
8763     if (rawseqflag == GETSEQ_LENGTHS) {
8764       isfp->entry_rawlen = isfp->entry_truelen = 0;
8765       isfp->iflag_rawlen = isfp->iflag_truelen = 1;
8766       return STATUS_OK;
8767     }
8768     else {
8769       isfp->seqlen = 0;
8770       isfp->seq[0] = '\0';
8771       set_error(E_NOSEQ);
8772       print_error("%s, entry %d:  Entry contains no sequence.\n",
8773                   isfp->filename, isfp->entry_count);
8774       return STATUS_ERROR;
8775     }
8776   }
8777 
8778   /*
8779    * Determine the format that the sequence is encoded in.
8780    */
8781   for (t=seqstr+8; t < seqend && isspace(*t); t++) ;
8782   error_test(t+8 >= seqend, E_PARSEERROR, return STATUS_ERROR,
8783              print_error("%s, entry %d:  Premature end of `seq.inst.seq-data' "
8784                          "sub-record.\n", isfp->filename, isfp->entry_count));
8785 
8786   if (mystreq(t, 'I', "IUPACNA") || mystreq(t, 'I', "IUPACAA"))
8787     state = IUPAC;
8788   else if (mystreq(t, 'N', "NCBI2NA"))
8789     state = DNA2;
8790   else if (mystreq(t, 'N', "NCBI4NA"))
8791     state = DNA4;
8792   else {
8793     memcpy(buf, t, 7);
8794     buf[7] = '\0';
8795     raise_error(E_PARSEERROR, return STATUS_ERROR,
8796                 print_error("%s, entry %d:  Sequence encoding (%7s) is "
8797                             "not supported.\n", isfp->filename,
8798                             isfp->entry_count, buf));
8799   }
8800 
8801   /*
8802    * Reallocate the sequence buffer, if necesssary.
8803    */
8804   if (isfp->entry_seqlen > 0)
8805     size = isfp->entry_seqlen;
8806   else {
8807     size = seqend - seqstr;
8808     if (state == DNA2)
8809       size *= 2;
8810   }
8811   if (rawseqflag != GETSEQ_LENGTHS && size + 1 >= isfp->seqsize) {
8812     isfp->seqsize += size + 1;
8813     isfp->seq = (char *) realloc(isfp->seq, isfp->seqsize);
8814     if (isfp->seq == NULL) {
8815       isfp->seqsize = 0;
8816       memory_error(1, return STATUS_FATAL);
8817     }
8818   }
8819 
8820   /*
8821    * Scan through the characters of the sequence, storing them
8822    * into the sequence buffer.
8823    */
8824   for (t+=8; t < seqend && isspace(*t); t++) ;
8825   error_test(t + 7 >= seqend, E_PARSEERROR, return STATUS_ERROR,
8826              print_error("%s, entry %d:  Premature end of `seq.inst.seq-data' "
8827                          "sub-record.\n", isfp->filename, isfp->entry_count));
8828   error_test(*t != '"' && *t != '\'', E_PARSEERROR, return STATUS_ERROR,
8829              print_error("%s, entry %d:  Invalid format of `inst.seq-data' "
8830                          "sub-record.\n", isfp->filename, isfp->entry_count));
8831 
8832   seq = isfp->seq;
8833   for (ch=*t++; t < seqend && *t != ch; t++) {
8834     if (isspace(*t))
8835       continue;
8836 
8837     if (rawseqflag == GETSEQ_LENGTHS) {
8838       isfp->entry_truelen += (state == DNA2 ? 2 : 1);
8839       isfp->entry_rawlen += (state == DNA2 ? 2 : 1);
8840     }
8841     else {
8842       if (state == IUPAC)
8843         *seq++ = *t;
8844       else if (state == DNA2) {
8845         switch (*t) {
8846         case '0':  *seq++ = 'A'; *seq++ = 'A';  break;
8847         case '1':  *seq++ = 'A'; *seq++ = 'C';  break;
8848         case '2':  *seq++ = 'A'; *seq++ = 'G';  break;
8849         case '3':  *seq++ = 'A'; *seq++ = 'T';  break;
8850         case '4':  *seq++ = 'C'; *seq++ = 'A';  break;
8851         case '5':  *seq++ = 'C'; *seq++ = 'C';  break;
8852         case '6':  *seq++ = 'C'; *seq++ = 'G';  break;
8853         case '7':  *seq++ = 'C'; *seq++ = 'T';  break;
8854         case '8':  *seq++ = 'G'; *seq++ = 'A';  break;
8855         case '9':  *seq++ = 'G'; *seq++ = 'C';  break;
8856         case 'A':  *seq++ = 'G'; *seq++ = 'G';  break;
8857         case 'B':  *seq++ = 'G'; *seq++ = 'T';  break;
8858         case 'C':  *seq++ = 'T'; *seq++ = 'A';  break;
8859         case 'D':  *seq++ = 'T'; *seq++ = 'C';  break;
8860         case 'E':  *seq++ = 'T'; *seq++ = 'G';  break;
8861         case 'F':  *seq++ = 'T'; *seq++ = 'T';  break;
8862         default:
8863           raise_error(E_PARSEERROR, return STATUS_ERROR,
8864                       print_error("%s, entry %d:  Invalid character `%c' in "
8865                                   "sequence's ncbi2na encoding.\n",
8866                                   isfp->filename, isfp->entry_count, *t));
8867         }
8868       }
8869       else if (state == DNA4) {
8870         switch (*t) {
8871         case '1':  *seq++ = 'A';  break;
8872         case '2':  *seq++ = 'C';  break;
8873         case '3':  *seq++ = 'M';  break;
8874         case '4':  *seq++ = 'G';  break;
8875         case '5':  *seq++ = 'R';  break;
8876         case '6':  *seq++ = 'S';  break;
8877         case '7':  *seq++ = 'V';  break;
8878         case '8':  *seq++ = 'T';  break;
8879         case '9':  *seq++ = 'W';  break;
8880         case 'A':  case 'a':  *seq++ = 'Y';  break;
8881         case 'B':  case 'b':  *seq++ = 'H';  break;
8882         case 'C':  case 'c':  *seq++ = 'K';  break;
8883         case 'D':  case 'd':  *seq++ = 'D';  break;
8884         case 'E':  case 'e':  *seq++ = 'B';  break;
8885         case 'F':  case 'f':  *seq++ = 'N';  break;
8886         default:
8887           raise_error(E_PARSEERROR, return STATUS_ERROR,
8888                       print_error("%s, entry %d:  Invalid character `%c' in "
8889                                   "sequence's ncbi4na encoding.\n",
8890                                   isfp->filename, isfp->entry_count, *t));
8891         }
8892       }
8893       else {
8894         program_error(1, return STATUS_ERROR,
8895                       print_error("   Illegal state value %d in asn_getseq.\n",
8896                                   state));
8897       }
8898     }
8899   }
8900   error_test(t == seqend, E_PARSEERROR, return STATUS_ERROR,
8901              print_error("%s, entry %d:  Premature end of "
8902                          "`seq.inst.seq-data' sub-record.\n", isfp->filename,
8903                          isfp->entry_count));
8904 
8905   if (rawseqflag == GETSEQ_LENGTHS) {
8906     if (isfp->entry_seqlen > 0 && state == DNA2 &&
8907         (isfp->entry_seqlen == isfp->entry_truelen - 1)) {
8908       isfp->entry_truelen--;
8909       isfp->entry_rawlen--;
8910     }
8911     isfp->iflag_truelen = isfp->iflag_rawlen = 1;
8912   }
8913   else {
8914     isfp->seqlen = seq - isfp->seq;
8915     isfp->seq[isfp->seqlen] = '\0';
8916 
8917     /*
8918      * Check if a sequence was found and the length of the sequence.
8919      */
8920     if (isfp->seqlen == 0) {
8921       set_error(E_NOSEQ);
8922       print_error("%s, entry %d:  Entry contains no sequence.\n",
8923                   isfp->filename, isfp->entry_count);
8924       return STATUS_ERROR;
8925     }
8926 
8927     if (isfp->entry_seqlen == 0 ||
8928         (state == DNA2 && isfp->entry_seqlen == isfp->seqlen - 1))
8929       isfp->entry_seqlen = isfp->seqlen;
8930     else if (isfp->entry_seqlen != isfp->seqlen) {
8931       set_error(E_DIFFLENGTH);
8932       print_warning("Warning:  %s, entry %d:  Entry gives seq. length of %d, "
8933                     "but %d characters found.\n", isfp->filename,
8934                     isfp->entry_count, isfp->entry_seqlen, isfp->seqlen);
8935       return STATUS_WARNING;
8936     }
8937 
8938     isfp->entry_truelen = isfp->entry_rawlen = isfp->seqlen;
8939     isfp->iflag_truelen = isfp->iflag_rawlen = 1;
8940   }
8941 
8942   return STATUS_OK;
8943 }
8944 
8945 
8946 
/*
 * One element of the stack kept by asn_parse while it walks the text.
 * An element is either an open brace (braceflag set) or an identifier
 * (name and len give its text, which is not NUL-terminated).  The
 * matchflag field is not looked at by testmatch.
 */
typedef struct {
  int braceflag, matchflag, len;
  char *name;
} APSTACK;

/*
 * testmatch
 *
 * Tests whether the pattern `s' (identifiers and '{' characters
 * separated by periods, such as "seq.inst.length") describes exactly
 * the first `size' elements of the stack.
 *
 *    - An identifier element must equal the next component of the
 *      pattern, ignoring case.  Any '{' components in front of that
 *      component are skipped.
 *    - A brace element that is the first element, or that follows
 *      another brace element, must match a '{' in the pattern.  A brace
 *      element that follows an identifier uses up none of the pattern.
 *
 * Parameters:  s      -  the NUL-terminated pattern
 *              stack  -  the stack elements, from the bottom up
 *              size   -  the number of elements to test
 *
 * Return: 1 if the whole pattern matches the stack, 0 otherwise
 */
static int testmatch(char *s, APSTACK *stack, int size)
{
  int idx, k;
  char *namech;
  APSTACK *elem;

  for (idx=0; idx < size; idx++) {
    elem = &stack[idx];

    if (!elem->braceflag) {
      /*
       * Step over any brace components in front of the identifier.
       * Each brace must be followed by a period or end the pattern.
       */
      while (*s == '{') {
        s++;
        if (*s == '.')
          s++;
        else if (*s)
          return 0;
      }

      /*
       * Compare the identifier with the pattern component.
       */
      namech = elem->name;
      k = 0;
      while (k < elem->len && *s && *s != '.') {
        if (toupper(*s) != toupper(*namech))
          return 0;
        k++;
        s++;
        namech++;
      }

      /*
       * Both the identifier and the component must be used up.
       */
      if (k < elem->len)
        return 0;
      if (*s && *s != '.')
        return 0;
    }
    else if (idx == 0 || stack[idx-1].braceflag) {
      if (*s != '{')
        return 0;
      s++;
    }

    if (*s == '.')
      s++;
  }

  return (*s == '\0');
}
8992 
asn_parse(char * begin,char * end,...)8993 int asn_parse(char *begin, char *end, ...)
8994 {
8995   static struct apstruct {
8996     int matched;
8997     char *name, **bptr, **eptr;
8998   } list[32];
8999   APSTACK stack[128];
9000   int i, num, pos, count, numlist;
9001   char qch, *s, *t, *name;
9002   va_list ap;
9003 
9004   if (!ctype_initflag)
9005     init_ctype();
9006 
9007   reset_errors();
9008   param_error(begin == NULL, return -1, "asn_parse", "arg 1 is NULL");
9009   param_error(end == NULL, return -1, "asn_parse", "arg 2 is NULL");
9010 
9011   va_start(ap, end);
9012   numlist = 0;
9013   while (numlist < 32 && (name = va_arg(ap, char *)) != NULL) {
9014     list[numlist].name = name;
9015     list[numlist].bptr = va_arg(ap, char **);
9016     list[numlist].eptr = va_arg(ap, char **);
9017     list[numlist].matched = 0;
9018     numlist++;
9019   }
9020 
9021   s = begin;
9022   while (s < end && isspace(*s)) s++;
9023   if (s == end)
9024     return 0;
9025 
9026   if (*s == '{')
9027     s++;
9028 
9029   pos = count = 0;
9030   while (count < numlist) {
9031     /*
9032      * Skip whitespace
9033      */
9034     while (s < end && isspace(*s)) s++;
9035     if (s == end)
9036       break;
9037 
9038     /*
9039      * Handle the next token of the string.
9040      */
9041     switch (*s) {
9042     case '{':
9043       stack[pos].name = "";
9044       stack[pos].len = 0;
9045       stack[pos].braceflag = 1;
9046       stack[pos].matchflag = -1;
9047       pos++;
9048 
9049       if (pos == 1 || stack[pos-2].braceflag) {
9050         for (i=0; i < numlist; i++) {
9051           if (!list[i].matched && testmatch(list[i].name, stack, pos)) {
9052             stack[pos-1].matchflag = i;
9053             list[i].matched = 1;
9054             if (list[i].bptr != NULL)
9055               *list[i].bptr = s;
9056             break;
9057           }
9058         }
9059       }
9060 
9061       s++;
9062       break;
9063 
9064     case ':':
9065       error_test(s[1] != ':' || s[2] != '=', E_PARSEERROR, return -1,
9066                  print_error("Error in asn_parse:  Cannot parse given "
9067                              "ASN.1 text.\n"));
9068       s += 3;
9069       break;
9070 
9071     case '}':
9072       while (pos > 0 && !stack[pos-1].braceflag) {
9073         if (stack[pos-1].matchflag != -1) {
9074           num = stack[pos-1].matchflag;
9075           if (list[num].eptr != NULL)
9076             *list[num].eptr = s;
9077           count++;
9078         }
9079         pos--;
9080       }
9081       if (pos == 0)
9082         goto AP_LOOP_END;
9083 
9084       pos--;
9085       s++;
9086       break;
9087 
9088     case ',':
9089       while (pos > 0 && !stack[pos-1].braceflag) {
9090         if (stack[pos-1].matchflag != -1) {
9091           num = stack[pos-1].matchflag;
9092           if (list[num].eptr != NULL)
9093             *list[num].eptr = s;
9094           count++;
9095         }
9096         pos--;
9097       }
9098       s++;
9099       break;
9100 
9101     case '\'':
9102     case '"':
9103       qch = *s++;
9104       while (s < end && (*s != qch || *(s-1) == '\\')) s++;
9105       error_test(s == end, E_PARSEERROR, return -1,
9106                  print_error("Error in asn_parse:  Reached end of text while "
9107                              "inside quoted string.\n"));
9108       s++;
9109       break;
9110 
9111     default:
9112       for (t=s; s < end && (isalnum(*s) || *s == '-' || *s == '_'); s++) ;
9113       error_test(t == s, E_PARSEERROR, return -1,
9114                  print_error("Error in asn_parse:  Cannot parse given "
9115                              "ASN.1 text.\n"));
9116       error_test(s == end && pos > 0, E_PARSEERROR, return -1,
9117                  print_error("Error in asn_parse:  Given ASN.1 text contains "
9118                              "an incomplete record.\n"));
9119 
9120       stack[pos].name = t;
9121       stack[pos].len = s - t;
9122       stack[pos].braceflag = 0;
9123       stack[pos].matchflag = -1;
9124       pos++;
9125 
9126       for (i=0; i < numlist; i++) {
9127         if (!list[i].matched && testmatch(list[i].name, stack, pos)) {
9128           stack[pos-1].matchflag = i;
9129           list[i].matched = 1;
9130           if (list[i].bptr != NULL)
9131             *list[i].bptr = t;
9132           break;
9133         }
9134       }
9135     }
9136   }
9137 AP_LOOP_END:
9138   while (count < numlist && pos > 0 && !stack[pos-1].braceflag) {
9139     if (stack[pos-1].matchflag != -1) {
9140       num = stack[pos-1].matchflag;
9141       if (list[num].eptr != NULL)
9142         *list[num].eptr = s;
9143       count++;
9144     }
9145     pos--;
9146   }
9147   error_test(count < numlist && pos > 0, E_PARSEERROR, return -1,
9148              print_error("Error in asn_parse:  Given ASN.1 text contains an "
9149                          "incomplete record.\n"));
9150 
9151   return count;
9152 }
9153 
9154 
9155 
9156 
9157 /*
9158  *
 * Internal procedures containing some of the grunge details
 * of the file reading.
9161  *
9162  *    fp_get_line, fp_read_more
9163  *
9164  *
9165  *
9166  */
9167 
9168 /*
9169  * fp_get_line
9170  *
9171  * This function reads the next line of the file (if it exists) and
9172  * sets the "line_out" and "end_out" values to point to the beginning
9173  * and end of that next line.  It also increments the fp_current and
9174  * (possibly) the fp_top fields of the INTSEQFILE structure past this
9175  * line.
9176  *
9177  * This function will also skip the beginning of line pointer past any
9178  * formfeed characters (since at least one database has the habit of
9179  * including formfeeds in its sequence files).
9180  *
9181  * Parameters:  isfp  -  an opened INTSEQFILE structure
9182  *              line_out  -  the location where the put the pointer to the
9183  *                           beginning of the line
9184  *              end_out   -  the location where the put the pointer to the
9185  *                           end of the line (i.e. the newline character)
9186  *
9187  * Return: a STATUS value
9188  */
fp_get_line(INTSEQFILE * isfp,char ** line_out,char ** end_out)9189 static int fp_get_line(INTSEQFILE *isfp, char **line_out, char **end_out)
9190 {
9191   register char *s, *top;
9192   int status;
9193   char *stemp, *line;
9194 
9195   line = s = isfp->fp_current;
9196   top = isfp->fp_top;
9197   while (1) {
9198     /*
9199      * Look for the next newline.
9200      */
9201     if (isfp->isendtagged)
9202       while (*s != '\n') s++;
9203     else {
9204       while (s + 10 <= top && *s != '\n' && *++s != '\n' && *++s != '\n' &&
9205              *++s != '\n' && *++s != '\n' && *++s != '\n' && *++s != '\n' &&
9206              *++s != '\n' && *++s != '\n' && *++s != '\n')
9207         s++;
9208       while (s < top && *s != '\n') s++;
9209     }
9210 
9211     /*
9212      * If the newline is an actual character in the file (and not
9213      * just the terminator signalling the end of the characters
9214      * read so far), check for formfeed characters at the "beginning"
9215      * of the line, and set the pointers to the line.
9216      *
9217      * Otherwise, read more of the file and go back through the loop.
9218      */
9219     if (s < top) {
9220       while (*line == '\f') line++;
9221 
9222       *line_out = line;
9223       *end_out = s;
9224       isfp->fp_current = s+1;
9225 
9226       return STATUS_OK;
9227     }
9228     else {
9229       stemp = s;
9230       status = fp_read_more(isfp, &line, &stemp, &isfp->fp_top);
9231       s = stemp;
9232       top = isfp->fp_top;
9233       switch (status) {
9234       case STATUS_OK:
9235         continue;
9236 
9237       case STATUS_EOF:
9238         while (line < s && *line == '\f') line++;
9239 
9240         /*
9241          * If the line is empty, return EOF.  Else, return the line.
9242          */
9243         if (s == line)
9244           return STATUS_EOF;
9245 
9246         *line_out = line;
9247         *end_out = s;
9248         isfp->fp_current = s;
9249 
9250         return STATUS_OK;
9251 
9252       case STATUS_ERROR:
9253       case STATUS_FATAL:
9254         isfp->fp_current = s;
9255         return status;
9256 
9257       default:
9258         status_error(return STATUS_ERROR, "fp_get_line");
9259       }
9260     }
9261   }
9262 }
9263 
9264 
9265 /*
9266  * fp_read_more
9267  *
9268  * This procedure reads in another block of the file (if it exists).
9269  * It performs one of three actions based on the situation.  First, if
9270  * there is sufficient room at the end of the "fp_buffer", then the
9271  * new block is read there.  Second, if the beginning of the current
9272  * entry (the text of previous entries are forgotten) occurs above
9273  * the halfway point of the buffer, then the current entry's text is
9274  * shifted to the beginning of the buffer and the new block is read
9275  * into the end of the buffer.  Third, if the current entry's block
9276  * is over half the size of the buffer, the buffer is dynamically
9277  * reallocated at double its size and then the new block is read into
9278  * the new region.
9279  *
9280  * The last three parameters are pointers into the buffer, where the
9281  * first two can point anywhere in the current entry and the third must
9282  * point to the top of the text in the buffer (i.e., it must either be
9283  * the "fp_top" field or a local copy of that field).  Usually, the first
9284  * two pointers are the beginning of the current line and the current point
9285  * in the text scan (hence the names "line" and "s"), but this is not
9286  * required.
9287  *
9288  * The reason those pointers must be passed in is that the text may
9289  * be shifted in the buffer or the buffer itself might be reallocated.
9290  * In that event, those three pointers and the INTSEQFILE fields
9291  * "fp_entrystart", "fp_seqstart" and "fp_entryend" are reset appropriately.
9292  * NOTE:  fp_current and fp_top are NOT reset, so they must either be
9293  *        passed in as the parameters, or updated from the variables
9294  *        passed in as the parameters.
9295  *
9296  * Parameters:  isfp  -  an opened INTSEQFILE structure
9297  *              line_out  -  the location of a pointer into the "fp_buffer"
9298  *              s_out     -  the location of a pointer into the "fp_buffer"
9299  *              top_out   -  the location of a pointer which points to
9300  *                           the top of the valid text in "fp_buffer".
9301  *
9302  * Return: a STATUS value
9303  */
fp_read_more(INTSEQFILE * isfp,char ** line_out,char ** s_out,char ** top_out)9304 static int fp_read_more(INTSEQFILE *isfp, char **line_out, char **s_out,
9305                         char **top_out)
9306 {
9307   int size, shift;
9308   char *top, *bottom, *buffer_top, *oldbuffer;
9309 
9310   /*
9311    * Functions like seqfgcgify, seqfungcgify and others must use the
9312    * read operations to parse the entry they're given.  They do that
9313    * by setting up a dummy INTSEQFILE structure with the entry as the
9314    * file buffer.  They also set the filename field to the empty string
9315    * to signal that nothing more should be read when the read operations
9316    * reach the end of the entry string (since no file or pipe is open).
9317    * In the normal reading process, filename is always an non-empty string.
9318    *
9319    * Thus, this test to return an EOF signal when the filename is an
9320    * empty string.
9321    */
9322   if (isfp->filename[0] == '\0')
9323     return STATUS_EOF;
9324 
9325   /*
9326    * Set local copies of the bottom and top of the section of the file
9327    * which must be kept in memory (such as the piece of the current
9328    * entry read in so far), along with the top of the buffer.
9329    */
9330   top = *top_out;
9331   bottom = (isfp->fp_entrystart != NULL ? isfp->fp_entrystart : *line_out);
9332   buffer_top = isfp->fp_buffer + isfp->fp_bufsize;
9333 
9334 
9335   /*
9336    * If the file is mapped, then redo the map for the next chunk of
9337    * the file.  If the new mapping fails, then try to read it
9338    * normally.
9339    */
9340 #ifdef ISMAPABLE
9341   if (isfp->ismapped) {
9342     int newmapsize, newfilepos, offset, status;
9343     caddr_t addr;
9344 
9345     if (isfp->filepos + isfp->mapsize == isfp->filesize)
9346       return STATUS_EOF;
9347 
9348     if (top - bottom >= isfp->mapsize / 2) {
9349       newmapsize = isfp->mapsize * 2;
9350       newfilepos = isfp->filepos;
9351       if (newfilepos + newmapsize > isfp->filesize)
9352         newmapsize = isfp->filesize - newfilepos;
9353 
9354       offset = bottom - isfp->fp_buffer;
9355     }
9356     else {
9357       newfilepos = isfp->filepos + isfp->mapsize;
9358       newfilepos -= top - bottom;
9359       offset = newfilepos % MYPAGESIZE;
9360       newfilepos -= offset;
9361 
9362       newmapsize = isfp->filesize - newfilepos;
9363       if (newmapsize > MAXMAPSIZE)
9364         newmapsize = MAXMAPSIZE;
9365     }
9366 
9367     munmap(isfp->fp_buffer, isfp->mapsize);
9368     addr = mmap(0, newmapsize, PROT_READ, MAP_SHARED, isfp->input_fd,
9369                 newfilepos);
9370 
9371     /*
9372      * If the new mapping works, recalculate all of the offsets, and
9373      * store the new map's information in "isfp" and return.
9374      *
9375      * If the new mapping fails, then turn "ismapped" off, malloc an
9376      * array for fp_buffer, set the file pointer to the bottommost byte
9377      * and read the next section.
9378      */
9379     if (addr != (caddr_t) -1) {
9380       *line_out = addr + offset + (*line_out - bottom);
9381       *s_out = addr + offset + (*s_out - bottom);
9382 
9383       if (isfp->fp_entrystart)
9384         isfp->fp_entrystart = addr + offset + (isfp->fp_entrystart - bottom);
9385       if (isfp->fp_seqstart)
9386         isfp->fp_seqstart = addr + offset + (isfp->fp_seqstart - bottom);
9387       if (isfp->fp_entryend)
9388         isfp->fp_entryend = addr + offset + (isfp->fp_seqstart - bottom);
9389 
9390       isfp->fp_buffer = addr;
9391       isfp->fp_bufsize = newmapsize;
9392       *top_out = isfp->fp_buffer + newmapsize;
9393       isfp->filepos = newfilepos;
9394       isfp->fp_bytepos = newfilepos;
9395 
9396       return STATUS_OK;
9397     }
9398     else {
9399       isfp->ismapped = 0;
9400 
9401       isfp->fp_bufsize = top - bottom + INIT_BUFSIZE;
9402       isfp->fp_buffer = (char *) malloc(isfp->fp_bufsize);
9403       if (isfp->fp_buffer == NULL) {
9404         isfp->fp_bufsize = 0;
9405         memory_error(1, return STATUS_FATAL);
9406       }
9407 
9408       status = seek_raw_file(isfp->input_fd, newfilepos + offset);
9409       error_test(status != STATUS_OK, E_READFAILED, return STATUS_ERROR,
9410                  print_error("%s:  %s\n", isfp->filename,
9411                              sys_errlist[errno]));
9412 
9413       size = read_raw_file(isfp->input_fd, isfp->fp_buffer,
9414                            isfp->fp_bufsize - 1);
9415       error_test(size <= top - bottom, E_READFAILED, return STATUS_ERROR,
9416                  print_error("%s:  %s\n", isfp->filename,
9417                              sys_errlist[errno]));
9418 
9419       isfp->fp_bytepos = newfilepos + offset;
9420 
9421       *line_out = isfp->fp_buffer + (*line_out - bottom);
9422       *s_out = isfp->fp_buffer + (*s_out - bottom);
9423 
9424       if (isfp->fp_entrystart)
9425         isfp->fp_entrystart = isfp->fp_buffer + (isfp->fp_entrystart - bottom);
9426       if (isfp->fp_seqstart)
9427         isfp->fp_seqstart = isfp->fp_buffer + (isfp->fp_seqstart - bottom);
9428       if (isfp->fp_entryend)
9429         isfp->fp_entryend = isfp->fp_buffer + (isfp->fp_seqstart - bottom);
9430 
9431       *top_out = isfp->fp_buffer + size;
9432       **top_out = '\n';
9433       isfp->isendtagged = 1;
9434 
9435       return STATUS_OK;
9436     }
9437   }
9438 #endif
9439 
9440 
9441   /*
9442    * If there is enough room left in the buffer, skip to the reading
9443    * part.
9444    */
9445   if (buffer_top - top > CONCAT_READ_POINT)
9446     ;
9447 
9448   /*
9449    * If the bottom is over the halfway point, then shift the current
9450    * contents without reallocating.
9451    */
9452   else if (bottom - isfp->fp_buffer > isfp->fp_bufsize / 2) {
9453     memcpy(isfp->fp_buffer, bottom, top - bottom);
9454 
9455     shift = bottom - isfp->fp_buffer;
9456     isfp->fp_bytepos += shift;
9457     top -= shift;
9458     *s_out -= shift;
9459     *line_out -= shift;
9460 
9461     if (isfp->fp_entrystart)
9462       isfp->fp_entrystart -= shift;
9463     if (isfp->fp_seqstart)
9464       isfp->fp_seqstart -= shift;
9465     if (isfp->fp_entryend)
9466       isfp->fp_entryend -= shift;
9467   }
9468 
9469   /*
9470    * Otherwise, double the size of the buffer.
9471    */
9472   else {
9473     oldbuffer = isfp->fp_buffer;
9474 
9475     isfp->fp_bufsize += isfp->fp_bufsize;
9476     isfp->fp_buffer = (char *) realloc(isfp->fp_buffer, isfp->fp_bufsize);
9477     if (isfp->fp_buffer == NULL) {
9478       isfp->fp_bufsize = 0;
9479       memory_error(1, return STATUS_FATAL);
9480     }
9481     buffer_top = isfp->fp_buffer + isfp->fp_bufsize;
9482 
9483     top = &isfp->fp_buffer[top - oldbuffer];
9484     *s_out = &isfp->fp_buffer[*s_out - oldbuffer];
9485     *line_out = &isfp->fp_buffer[*line_out - oldbuffer];
9486 
9487     if (isfp->fp_entrystart)
9488       isfp->fp_entrystart = &isfp->fp_buffer[isfp->fp_entrystart - oldbuffer];
9489     if (isfp->fp_seqstart)
9490       isfp->fp_seqstart = &isfp->fp_buffer[isfp->fp_seqstart - oldbuffer];
9491     if (isfp->fp_entryend)
9492       isfp->fp_entryend = &isfp->fp_buffer[isfp->fp_seqstart - oldbuffer];
9493   }
9494 
9495   /*
9496    * Read in the next chunk of the file.
9497    */
9498   size = read_raw_file(isfp->input_fd, top, buffer_top - top - 1);
9499   if (size > 0) {
9500     *top_out = top + size;
9501     **top_out = '\n';
9502     isfp->isendtagged = 1;
9503 
9504     return STATUS_OK;
9505   }
9506   else if (size == 0) {
9507     *top_out = top;
9508     **top_out = '\n';
9509     isfp->isendtagged = 1;
9510 
9511     return STATUS_EOF;
9512   }
9513   else {
9514     raise_error(E_READFAILED, return STATUS_ERROR,
9515                 print_error("%s:  %s\n", isfp->filename, sys_errlist[errno]));
9516   }
9517 }
9518 
9519 
9520 /*
9521  * fp_read_all
9522  *
9523  * Read the complete file into memory, using repeated calls to fp_read_more.
9524  *
9525  * Parameters:  isfp  -  an opened INTSEQFILE structure
9526  *
9527  * Return: a STATUS value
9528  */
fp_read_all(INTSEQFILE * isfp)9529 static int fp_read_all(INTSEQFILE *isfp)
9530 {
9531   int status;
9532   char *line, *s, *top;
9533 
9534   line = isfp->fp_current;
9535   top = isfp->fp_top;
9536   status = STATUS_OK;
9537   while (status == STATUS_OK) {
9538     s = top;
9539     status = fp_read_more(isfp, &line, &s, &top);
9540     switch (status) {
9541     case STATUS_OK:     break;
9542     case STATUS_EOF:    break;
9543     case STATUS_ERROR:  return STATUS_ERROR;
9544     case STATUS_FATAL:  return STATUS_FATAL;
9545     default:            status_error(return STATUS_ERROR, "fp_read_all");
9546     }
9547   }
9548   isfp->fp_current = line;
9549   isfp->fp_top = top;
9550   return (isfp->fp_current < isfp->fp_top ? STATUS_OK : STATUS_EOF);
9551 }
9552 
9553 
9554 
9555 
9556 
9557 /*
9558  *
9559  *
9560  * Get Information Section
9561  *
9562  *
9563  *
9564  */
9565 
/*
 * Working state used while the fields of a SEQINFO structure are being
 * filled in.  start_info loads it from an INTSEQFILE and finish_info
 * stores it back.
 */
typedef struct {
  /* the block being built: the SEQINFO structure with strings after it */
  SEQINFO *info;
  /*
   * count - number of bytes of the block in use (new strings are placed
   *         at offset "count")
   * size  - number of bytes allocated for the block
   * error - set to 1 when the block could not be enlarged; the add/set
   *         functions do nothing once it is set
   */
  int count, size, error;
} INFO;
9570 
9571 static int is_ncbiprefix(char *);
9572 static char *parse_ncbi_idlist(INFO *, char *, char *);
9573 
9574 
/*
 * start_info
 *
 * Load the working INFO record from the information block held by
 * "isfp" and clear whatever is about to be refilled.
 *
 * For SEQINFO_ALL and SEQINFO_ALLINFO, the whole SEQINFO structure is
 * zeroed and the used-byte count restarts just past the structure.
 * For any other flag, the existing byte count is kept and only the one
 * field named by the flag is reset.
 *
 * Parameters:  info  -  the INFO record to initialize
 *              isfp  -  an opened INTSEQFILE structure
 *              flag  -  one of the SEQINFO_* request values
 *
 * Return: nothing
 */
static void start_info(INFO *info, INTSEQFILE *isfp, int flag)
{
  SEQINFO *si;
  int want_all;

  want_all = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  info->info = isfp->info;
  info->size = isfp->infobufsize;
  if (want_all)
    info->count = sizeof(SEQINFO);
  else
    info->count = isfp->infosize;
  info->error = 0;

  si = info->info;
  if (want_all) {
    memset(si, 0, sizeof(SEQINFO));
    return;
  }

  /*
   * A single field was requested:  reset just that field.
   */
  if (flag == SEQINFO_DATE)
    si->date = NULL;
  if (flag == SEQINFO_IDLIST)
    si->idlist = NULL;
  if (flag == SEQINFO_DESCRIPTION)
    si->description = NULL;
  if (flag == SEQINFO_COMMENT)
    si->comment = NULL;
  if (flag == SEQINFO_ORGANISM)
    si->organism = NULL;
  if (flag == SEQINFO_FRAGMENT)
    si->isfragment = 0;
  if (flag == SEQINFO_CIRCULAR)
    si->iscircular = 0;
  if (flag == SEQINFO_ALPHABET)
    si->alphabet = UNKNOWN;
  if (flag == SEQINFO_STARTPOS)
    si->fragstart = UNKNOWN;
}
9597 
/*
 * finish_info
 *
 * Store the working INFO record back into "isfp".  If no error was
 * recorded, the block pointer, used size and allocated size are copied
 * back.  If an error was recorded, all trace of the information is
 * dropped from "isfp":  the pointer, both sizes, the status and every
 * per-field flag are cleared.
 *
 * Parameters:  info  -  the INFO record filled in since start_info
 *              isfp  -  an opened INTSEQFILE structure
 *
 * Return: nothing
 */
static void finish_info(INFO *info, INTSEQFILE *isfp)
{
  if (info->error == 0) {
    isfp->info = info->info;
    isfp->infosize = info->count;
    isfp->infobufsize = info->size;
    return;
  }

  isfp->info = NULL;
  isfp->infobufsize = 0;
  isfp->infosize = 0;
  isfp->istatus = INFO_NONE;

  isfp->iflag_description = 0;
  isfp->iflag_idlist = 0;
  isfp->iflag_date = 0;

  isfp->iflag_fragment = 0;
  isfp->iflag_organism = 0;
  isfp->iflag_comment = 0;

  isfp->iflag_fragstart = 0;
  isfp->iflag_alphabet = 0;
  isfp->iflag_circular = 0;
}
9614 
9615 
setup_field(INFO * info,int field,int len)9616 static int setup_field(INFO *info, int field, int len)
9617 {
9618   int date_offset, list_offset, desc_offset, comm_offset, org_offset;
9619   int his_offset, fname_offset, db_offset, fmt_offset;
9620   char ch, *t, *a, *b, *ptr, **fieldptr, *istruct, *itop;
9621 
9622   if (info->error || len <= 0)
9623     return 0;
9624 
9625   istruct = (char *) info->info;
9626   itop = istruct + info->count;
9627   switch (field) {
9628   case SEQINFO_FILENAME:     fieldptr = &info->info->filename;  break;
9629   case SEQINFO_DBNAME:       fieldptr = &info->info->dbname;  break;
9630   case SEQINFO_FORMAT:       fieldptr = &info->info->format;  break;
9631   case SEQINFO_IDLIST:       fieldptr = &info->info->idlist;  break;
9632   case SEQINFO_DATE:         fieldptr = &info->info->date;  break;
9633   case SEQINFO_DESCRIPTION:  fieldptr = &info->info->description;  break;
9634   case SEQINFO_COMMENT:      fieldptr = &info->info->comment;  break;
9635   case SEQINFO_ORGANISM:     fieldptr = &info->info->organism;  break;
9636   case SEQINFO_HISTORY:      fieldptr = &info->info->history;  break;
9637   default:
9638     return 0;
9639   }
9640   ptr = *fieldptr;
9641 
9642   if (ptr != NULL && ptr >= istruct && ptr < itop) {
9643     for (t=ptr; *t; t++) ;
9644 
9645     /*
9646      * If the field exists and is in the middle of the strings,
9647      * move it to the end using the three reversals trick, i.e.,
9648      * from the string AB, the string rev(rev(A),rev(B)) is BA.
9649      */
9650     if (t+1 < itop) {
9651       for (a=ptr,b=t; a < b; a++,b--) {
9652         ch = *a;
9653         *a = *b;
9654         *b = ch;
9655       }
9656       for (a=t+1,b=itop-1; a < b; a++,b--) {
9657         ch = *a;
9658         *a = *b;
9659         *b = ch;
9660       }
9661       for (a=ptr,b=itop-1; a < b; a++,b--) {
9662         ch = *a;
9663         *a = *b;
9664         *b = ch;
9665       }
9666 
9667       if (info->info->filename > ptr && info->info->filename < itop)
9668         info->info->filename -= t - ptr + 1;
9669       if (info->info->dbname > ptr && info->info->dbname < itop)
9670         info->info->dbname -= t - ptr + 1;
9671       if (info->info->format > ptr && info->info->format < itop)
9672         info->info->format -= t - ptr + 1;
9673       if (info->info->date > ptr && info->info->date < itop)
9674         info->info->date -= t - ptr + 1;
9675       if (info->info->idlist > ptr && info->info->idlist < itop)
9676         info->info->idlist -= t - ptr + 1;
9677       if (info->info->description > ptr && info->info->description < itop)
9678         info->info->description -= t - ptr + 1;
9679       if (info->info->comment > ptr && info->info->comment < itop)
9680         info->info->comment -= t - ptr + 1;
9681       if (info->info->organism > ptr && info->info->organism < itop)
9682         info->info->organism -= t - ptr + 1;
9683       if (info->info->history > ptr && info->info->history < itop)
9684         info->info->history -= t - ptr + 1;
9685       *fieldptr = itop - (t - ptr + 1);
9686     }
9687   }
9688 
9689   /*
9690    * Now realloc the information if there's not enough space.
9691    */
9692   if (info->count + len + 1 >= info->size) {
9693     date_offset = list_offset = desc_offset = -1;
9694     comm_offset = org_offset = his_offset = -1;
9695     fname_offset = db_offset = fmt_offset = -1;
9696 
9697     if (info->info->filename >= istruct && info->info->filename < itop)
9698       fname_offset = info->info->filename - istruct;
9699     if (info->info->dbname >= istruct && info->info->dbname < itop)
9700       db_offset = info->info->dbname - istruct;
9701     if (info->info->format >= istruct && info->info->format < itop)
9702       fmt_offset = info->info->format - istruct;
9703     if (info->info->date >= istruct && info->info->date < itop)
9704       date_offset = info->info->date - istruct;
9705     if (info->info->idlist >= istruct && info->info->idlist < itop)
9706       list_offset = info->info->idlist - istruct;
9707     if (info->info->description >= istruct && info->info->description < itop)
9708       desc_offset = info->info->description - istruct;
9709     if (info->info->comment >= istruct && info->info->comment < itop)
9710       comm_offset = info->info->comment - istruct;
9711     if (info->info->organism >= istruct && info->info->organism < itop)
9712       org_offset = info->info->organism - istruct;
9713     if (info->info->history >= istruct && info->info->history < itop)
9714       his_offset = info->info->history - istruct;
9715 
9716     info->size += info->size + len + 1;
9717     info->info = (SEQINFO *) realloc(info->info, info->size);
9718     if (info->info == NULL) {
9719       info->error = 1;
9720       return 0;
9721     }
9722     else {
9723       istruct = (char *) info->info;
9724       if (fname_offset != -1) info->info->filename = istruct + fname_offset;
9725       if (db_offset != -1)    info->info->dbname = istruct + db_offset;
9726       if (fmt_offset != -1)   info->info->format = istruct + fmt_offset;
9727       if (date_offset != -1)  info->info->date = istruct + date_offset;
9728       if (list_offset != -1)  info->info->idlist = istruct + list_offset;
9729       if (desc_offset != -1)  info->info->description = istruct + desc_offset;
9730       if (comm_offset != -1)  info->info->comment = istruct + comm_offset;
9731       if (org_offset != -1)   info->info->organism = istruct + org_offset;
9732       if (his_offset != -1)   info->info->history = istruct + his_offset;
9733     }
9734   }
9735 
9736   return 1;
9737 }
9738 
9739 
/*
 * is_idprefix
 *
 * Test whether the text at "s" begins with an identifier prefix, i.e.,
 * two, three or four alphanumeric characters followed by a colon.
 *
 * The characters are examined left to right and the test stops at the
 * first one that rules a prefix out, so at most five characters are
 * read.  They are read as unsigned char, because the <ctype.h>
 * functions are not defined for negative character values.
 *
 * Parameters:  s  -  the text to test
 *
 * Return: 1 if the text begins with such a prefix, 0 otherwise
 */
static int is_idprefix(char *s)
{
  const unsigned char *u = (const unsigned char *) s;

  if (!isalnum(u[0]) || !isalnum(u[1]))
    return 0;

  if (u[2] == ':')
    return 1;
  if (!isalnum(u[2]))
    return 0;

  if (u[3] == ':')
    return 1;

  return (isalnum(u[3]) && u[4] == ':');
}
9747 
9748 
/*
 * ai_list_has_id
 *
 * Test whether "id" already occurs as one of the '|' separated
 * entries of "list".  The comparison ignores case.
 */
static int ai_list_has_id(char *list, char *id)
{
  char *s2, *t2;

  for (s2=list; *s2; ) {
    for (t2=id; *t2 && *s2 && *s2 != '|'; s2++,t2++)
      if (toupper((unsigned char) *t2) != toupper((unsigned char) *s2))
        break;

    if (!*t2 && (!*s2 || *s2 == '|'))
      return 1;

    /* No match:  skip to the start of the next entry. */
    while (*s2 && *s2 != '|') s2++;
    if (*s2) s2++;
  }

  return 0;
}

/*
 * add_id
 *
 * Add the identifier in the text from "s" up to "end" to the idlist
 * field, unless it is already there.  Entries of the list are
 * separated by '|'.
 *
 * Surrounding whitespace and one pair of surrounding quotes are
 * removed first, and the placeholder values "(BELOW)", "UNKNOWN" and
 * "UNKNWN" are ignored.  Text accepted by is_ncbiprefix is handed to
 * parse_ncbi_idlist, and nothing more is done if that call returns
 * anything other than "s".
 *
 * If the text already starts with an identifier prefix (see
 * is_idprefix), that prefix is lowercased and the rest is kept as is.
 * Otherwise, the lowercased "type" and a colon are put in front.
 *
 * The identifier is assembled in a small local buffer.  When it does
 * not fit there, a heap buffer of the required size is used instead.
 * If that allocation fails, the information block is released and
 * "info->error" is set, the same state that a failed enlargement of
 * the block leaves.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              type  -  the prefix to use when the text has none
 *              s     -  the beginning of the identifier text
 *              end   -  the end of the identifier text
 *
 * Return: nothing
 */
static void add_id(INFO *info, char *type, char *s, char *end)
{
  int len, prefixed;
  size_t need;
  char *t, *s2, *t2, *idbuf, buffer[72];

  if (info->error)
    return;

  while (s < end && isspace((unsigned char) *s)) s++;
  while (s < end && isspace((unsigned char) *(end-1))) end--;
  if (s < end && (*s == '\'' || *s == '"') &&
      (*(end-1) == '\'' || *(end-1) == '"')) {
    s++;
    end--;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
  }
  if (s >= end)
    return;

  if ((end - s == 7 && mystreq(s, '(', "(BELOW)")) ||
      (end - s == 7 && mystreq(s, 'U', "UNKNOWN")) ||
      (end - s == 6 && mystreq(s, 'U', "UNKNWN")))
    return;

  if (is_ncbiprefix(s) && parse_ncbi_idlist(info, s, end) != s)
    return;

  /*
   * Work out an upper bound on the size of the assembled identifier
   * (the text, a prefix of at most four characters or the type, the
   * colon and the terminator).  "type" is only looked at when the
   * text carries no prefix of its own.
   */
  prefixed = is_idprefix(s);
  need = (size_t) (end - s) + 8;
  if (!prefixed)
    need += strlen(type);

  idbuf = buffer;
  if (need > sizeof(buffer)) {
    idbuf = (char *) malloc(need);
    if (idbuf == NULL) {
      free(info->info);
      info->info = NULL;
      info->error = 1;
      return;
    }
  }

  s2 = idbuf;
  len = 0;
  if (prefixed) {
    for (t2=s; *t2 != ':'; s2++,t2++,len++)
      *s2 = tolower((unsigned char) *t2);
    for ( ; t2 < end; s2++,t2++,len++)
      *s2 = *t2;
  }
  else {
    for (t2=type; *t2; s2++,t2++,len++)
      *s2 = tolower((unsigned char) *t2);
    *s2++ = ':';
    len++;
    for (t2=s; t2 < end; s2++,t2++,len++)
      *s2 = *t2;
  }
  *s2 = '\0';

  if (setup_field(info, SEQINFO_IDLIST, len + 2)) {
    if (info->info->idlist == NULL)
      t = info->info->idlist = ((char *) info->info) + info->count;
    else if (ai_list_has_id(info->info->idlist, idbuf))
      t = NULL;
    else {
      /*
       * The list is the last string of the block, so its terminator
       * is the last byte in use.  Turn it into the separator.
       */
      t = ((char *) info->info) + info->count - 1;
      *t++ = '|';
    }

    if (t != NULL) {
      strcpy(t, idbuf);
      info->count += len + 1;
    }
  }

  if (idbuf != buffer)
    free(idbuf);
}
9821 
/*
 * set_filename
 *
 * Store a copy of the string "s" as the filename field, replacing any
 * value the field already had.  Nothing is done if an error has been
 * recorded, if "s" is NULL or if setup_field fails.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the NUL-terminated file name
 *
 * Return: nothing
 */
static void set_filename(INFO *info, char *s)
{
  int i, n;
  char *dest;

  if (info->error || s == NULL)
    return;

  n = 0;
  while (s[n] != '\0')
    n++;

  if (!setup_field(info, SEQINFO_FILENAME, n + 1))
    return;

  /*
   * An existing value is overwritten in place.  Otherwise, the string
   * starts at the first unused byte of the block.
   */
  dest = info->info->filename;
  if (dest == NULL) {
    dest = ((char *) info->info) + info->count;
    info->info->filename = dest;
  }

  for (i=0; i < n; i++)
    dest[i] = s[i];
  dest[n] = '\0';

  info->count = (dest + n + 1) - ((char *) info->info);
}
9845 
/*
 * set_dbname
 *
 * Store a copy of the string "s" as the dbname field, replacing any
 * value the field already had.  Nothing is done if an error has been
 * recorded, if "s" is NULL or if setup_field fails.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the NUL-terminated database name
 *
 * Return: nothing
 */
static void set_dbname(INFO *info, char *s)
{
  int i, n;
  char *dest;

  if (info->error || s == NULL)
    return;

  n = 0;
  while (s[n] != '\0')
    n++;

  if (!setup_field(info, SEQINFO_DBNAME, n + 1))
    return;

  /*
   * An existing value is overwritten in place.  Otherwise, the string
   * starts at the first unused byte of the block.
   */
  dest = info->info->dbname;
  if (dest == NULL) {
    dest = ((char *) info->info) + info->count;
    info->info->dbname = dest;
  }

  for (i=0; i < n; i++)
    dest[i] = s[i];
  dest[n] = '\0';

  info->count = (dest + n + 1) - ((char *) info->info);
}
9869 
/*
 * set_format
 *
 * Store a copy of the string "s" as the format field, replacing any
 * value the field already had.  Nothing is done if an error has been
 * recorded, if "s" is NULL or if setup_field fails.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the NUL-terminated format name
 *
 * Return: nothing
 */
static void set_format(INFO *info, char *s)
{
  int i, n;
  char *dest;

  if (info->error || s == NULL)
    return;

  n = 0;
  while (s[n] != '\0')
    n++;

  if (!setup_field(info, SEQINFO_FORMAT, n + 1))
    return;

  /*
   * An existing value is overwritten in place.  Otherwise, the string
   * starts at the first unused byte of the block.
   */
  dest = info->info->format;
  if (dest == NULL) {
    dest = ((char *) info->info) + info->count;
    info->info->format = dest;
  }

  for (i=0; i < n; i++)
    dest[i] = s[i];
  dest[n] = '\0';

  info->count = (dest + n + 1) - ((char *) info->info);
}
9893 
/*
 * set_date
 *
 * Store the text from "s" up to "end" as the date field, replacing any
 * value the field already had.  Surrounding whitespace and one pair of
 * surrounding quotes are removed first.  Empty text and the
 * placeholder date "01-JAN-0000" are ignored.
 *
 * Characters are passed to isspace as unsigned char, because the
 * <ctype.h> functions are not defined for negative character values.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the beginning of the date text
 *              end   -  the end of the date text, or NULL if the text
 *                       is NUL-terminated
 *
 * Return: nothing
 */
static void set_date(INFO *info, char *s, char *end)
{
  char *t;

  if (info->error)
    return;

  if (end == NULL)
    for (end=s; *end; end++)
      ;

  while (s < end && isspace((unsigned char) *s)) s++;
  while (s < end && isspace((unsigned char) *(end-1))) end--;
  if (s < end && (*s == '\'' || *s == '"') &&
      (*(end-1) == '\'' || *(end-1) == '"')) {
    s++;
    end--;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
  }
  if (s >= end || (end - s == 11 && mystreq(s, '0', "01-JAN-0000")))
    return;

  if (!setup_field(info, SEQINFO_DATE, end - s + 1))
    return;

  /*
   * An existing value is overwritten in place.  Otherwise, the string
   * starts at the first unused byte of the block.
   */
  if (info->info->date == NULL)
    t = info->info->date = ((char *) info->info) + info->count;
  else
    t = info->info->date;

  while (s < end)
    *t++ = *s++;
  *t = '\0';
  info->count = t+1 - ((char *) info->info);
}
9930 
/*
 * add_description
 *
 * Append the text from "s" up to "end" to the description field.  If
 * the field already has a value, the new text is joined to it with a
 * single space.
 *
 * Leading whitespace, '>' and ';' characters, trailing whitespace, one
 * pair of surrounding quotes and one trailing '.' or ';' are removed
 * first.  In the stored text, each newline (with the whitespace that
 * follows it) becomes one space and each tab becomes three spaces.
 *
 * Characters are passed to isspace as unsigned char, because the
 * <ctype.h> functions are not defined for negative character values.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the beginning of the description text
 *              end   -  the end of the description text, or NULL if the
 *                       text is NUL-terminated
 *
 * Return: nothing
 */
static void add_description(INFO *info, char *s, char *end)
{
  int count;
  char *t, *s2;

  if (info->error)
    return;

  if (end == NULL)
    for (end=s; *end; end++)
      ;

  while (s < end && (isspace((unsigned char) *s) || *s == '>' || *s == ';'))
    s++;
  while (s < end && isspace((unsigned char) *(end-1))) end--;
  if (s < end && (*s == '\'' || *s == '"') &&
      (*(end-1) == '\'' || *(end-1) == '"')) {
    s++;
    end--;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
  }
  if (s < end && (*(end-1) == '.' || *(end-1) == ';')) end--;
  if (s >= end)
    return;

  /*
   * Each tab is expanded to three spaces, so it needs two extra bytes.
   * The count starts at one for the string terminator.
   */
  for (count=1,s2=s; s2 < end; s2++)
    if (*s2 == '\t')
      count += 2;

  if (!setup_field(info, SEQINFO_DESCRIPTION, end - s + count))
    return;

  if (info->info->description == NULL)
    t = info->info->description = ((char *) info->info) + info->count;
  else {
    /*
     * The existing value is the last string of the block, so its
     * terminator is the last byte in use.  Turn it into a space.
     */
    t = ((char *) info->info) + info->count - 1;
    *t++ = ' ';
  }

  while (s < end) {
    if (*s == '\n') {
      *t++ = ' ';
      while (s < end && isspace((unsigned char) *s)) s++;
    }
    else if (*s == '\t') {
      *t++ = ' ';
      *t++ = ' ';
      *t++ = ' ';
      s++;
    }
    else
      *t++ = *s++;
  }
  *t = '\0';
  info->count = t+1 - ((char *) info->info);
}
9987 
/*
 * add_organism
 *
 * Append the text from "s" up to "end" to the organism field.  If the
 * field already has a value, the new text is joined to it with a
 * single space.
 *
 * Leading whitespace, '>' and ';' characters, trailing whitespace, one
 * pair of surrounding quotes and one trailing ';' are removed first.
 * In the stored text, each newline (with the whitespace that follows
 * it) becomes one space and each tab becomes three spaces.
 *
 * Characters are passed to isspace as unsigned char, because the
 * <ctype.h> functions are not defined for negative character values.
 *
 * Parameters:  info  -  the INFO record being filled in
 *              s     -  the beginning of the organism text
 *              end   -  the end of the organism text, or NULL if the
 *                       text is NUL-terminated
 *
 * Return: nothing
 */
static void add_organism(INFO *info, char *s, char *end)
{
  int count;
  char *t, *s2;

  if (info->error)
    return;

  if (end == NULL)
    for (end=s; *end; end++)
      ;

  while (s < end && (isspace((unsigned char) *s) || *s == '>' || *s == ';'))
    s++;
  while (s < end && isspace((unsigned char) *(end-1))) end--;
  if (s < end && (*s == '\'' || *s == '"') &&
      (*(end-1) == '\'' || *(end-1) == '"')) {
    s++;
    end--;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
  }
  if (s < end && *(end-1) == ';') end--;
  if (s >= end)
    return;

  /*
   * Each tab is expanded to three spaces, so it needs two extra bytes.
   * The count starts at one for the string terminator.
   */
  for (count=1,s2=s; s2 < end; s2++)
    if (*s2 == '\t')
      count += 2;

  if (!setup_field(info, SEQINFO_ORGANISM, end - s + count))
    return;

  if (info->info->organism == NULL)
    t = info->info->organism = ((char *) info->info) + info->count;
  else {
    /*
     * The existing value is the last string of the block, so its
     * terminator is the last byte in use.  Turn it into a space.
     */
    t = ((char *) info->info) + info->count - 1;
    *t++ = ' ';
  }

  while (s < end) {
    if (*s == '\n') {
      *t++ = ' ';
      while (s < end && isspace((unsigned char) *s)) s++;
    }
    else if (*s == '\t') {
      *t++ = ' ';
      *t++ = ' ';
      *t++ = ' ';
      s++;
    }
    else
      *t++ = *s++;
  }
  *t = '\0';
  info->count = t+1 - ((char *) info->info);
}
10044 
/*
 * add_comment
 *
 * Append the text from "s" up to "end" to the comment field, followed
 * by a newline.  If the field already has a value, the new text is
 * placed directly after it.
 *
 * When "stripflag" is set, leading '>' and ';' characters (and leading
 * whitespace, if "stripsize" is non-zero), trailing whitespace and one
 * pair of surrounding quotes are removed first.
 *
 * After each newline inside the text, the beginning of the next line
 * is trimmed according to "stripsize":  -1 skips its leading
 * whitespace, a positive value skips at most that many characters and
 * 0 skips nothing.  With a positive "stripsize", a line that then
 * begins with exactly two spaces is treated as a continuation and is
 * joined to the previous line with one space.
 *
 * Text that is empty after stripping is still added, as an empty line.
 *
 * Characters are passed to isspace as unsigned char, because the
 * <ctype.h> functions are not defined for negative character values.
 *
 * Parameters:  info       -  the INFO record being filled in
 *              s          -  the beginning of the comment text
 *              end        -  the end of the comment text, or NULL if
 *                            the text is NUL-terminated
 *              stripflag  -  non-zero to strip the ends of the text
 *              stripsize  -  how to trim the beginning of each line
 *
 * Return: nothing
 */
static void add_comment(INFO *info, char *s, char *end, int stripflag,
                        int stripsize)
{
  int i;
  char *t;

  if (info->error)
    return;

  if (end == NULL)
    for (end=s; *end; end++)
      ;

  if (stripflag) {
    while (s < end && ((isspace((unsigned char) *s) && stripsize) ||
                       *s == '>' || *s == ';'))
      s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
    if (s < end && (*s == '\'' || *s == '"') &&
        (*(end-1) == '\'' || *(end-1) == '"')) {
      s++;
      end--;
      while (s < end && isspace((unsigned char) *s)) s++;
      while (s < end && isspace((unsigned char) *(end-1))) end--;
    }
  }
  if (s > end)
    return;

  /* Room for the text, the final newline and the terminator. */
  if (!setup_field(info, SEQINFO_COMMENT, end - s + 2))
    return;

  if (info->info->comment == NULL)
    t = info->info->comment = ((char *) info->info) + info->count;
  else
    t = ((char *) info->info) + info->count - 1;

  while (s < end) {
    if (*s == '\n') {
      s++;
      if (stripsize == -1)
        while (s < end && isspace((unsigned char) *s) && *s != '\n') s++;
      else if (stripsize > 0)
        for (i=0; s < end && *s != '\n' && i < stripsize; i++) s++;

      /*
       * When you find a continuation line (two spaces and then a non-space),
       * merge it with previous line.
       */
      if (stripsize > 0 && s + 3 < end &&
          s[0] == ' ' && s[1] == ' ' && s[2] != ' ') {
        s += 2;
        *t++ = ' ';
      }
      else if (s < end)
        *t++ = '\n';
    }
    else
      *t++ = *s++;
  }
  *t++ = '\n';
  *t = '\0';
  info->count = t+1 - ((char *) info->info);
}
10108 
/*
 * add_history
 *
 * Append the text from "s" up to "end" to the history field, followed
 * by a newline.  If the field already has a value, the new text is
 * placed directly after it.
 *
 * Leading '>' and ';' characters (and leading whitespace, if
 * "stripsize" is non-zero), trailing whitespace, one pair of
 * surrounding quotes and one trailing '.' or ';' are removed first.
 *
 * After each newline inside the text, the beginning of the next line
 * is trimmed according to "stripsize":  -1 skips its leading
 * whitespace, a positive value skips at most that many characters and
 * 0 skips nothing.  A line that then begins with exactly two spaces is
 * treated as a continuation and is joined to the previous line with
 * one space.
 *
 * Text that is empty after stripping is still added, as an empty line.
 *
 * Characters are passed to isspace as unsigned char, because the
 * <ctype.h> functions are not defined for negative character values.
 *
 * Parameters:  info       -  the INFO record being filled in
 *              s          -  the beginning of the history text
 *              end        -  the end of the history text, or NULL if
 *                            the text is NUL-terminated
 *              stripsize  -  how to trim the beginning of each line
 *
 * Return: nothing
 */
static void add_history(INFO *info, char *s, char *end, int stripsize)
{
  int i;
  char *t;

  if (info->error)
    return;

  if (end == NULL)
    for (end=s; *end; end++)
      ;

  while (s < end && ((isspace((unsigned char) *s) && stripsize) ||
                     *s == '>' || *s == ';'))
    s++;
  while (s < end && isspace((unsigned char) *(end-1))) end--;
  if (s < end && (*s == '\'' || *s == '"') &&
      (*(end-1) == '\'' || *(end-1) == '"')) {
    s++;
    end--;
    while (s < end && isspace((unsigned char) *s)) s++;
    while (s < end && isspace((unsigned char) *(end-1))) end--;
  }
  if (s < end && (*(end-1) == '.' || *(end-1) == ';')) end--;
  if (s > end)
    return;

  /* Room for the text, the final newline and the terminator. */
  if (!setup_field(info, SEQINFO_HISTORY, end - s + 2))
    return;

  if (info->info->history == NULL)
    t = info->info->history = ((char *) info->info) + info->count;
  else
    t = ((char *) info->info) + info->count - 1;

  while (s < end) {
    if (*s == '\n') {
      s++;
      if (stripsize == -1)
        while (s < end && isspace((unsigned char) *s) && *s != '\n') s++;
      else if (stripsize > 0)
        for (i=0; s < end && *s != '\n' && i < stripsize; i++) s++;

      /*
       * When you find a continuation line (two spaces and then a non-space),
       * merge it with previous line.
       */
      if (s + 3 < end && s[0] == ' ' && s[1] == ' ' && s[2] != ' ') {
        s += 2;
        *t++ = ' ';
      }
      else
        *t++ = '\n';
    }
    else
      *t++ = *s++;
  }
  *t++ = '\n';
  *t = '\0';
  info->count = t+1 - ((char *) info->info);
}
10169 
10170 
/*
 * add_retrieval
 *
 * Add a "SEQIO retrieval from ..." line, ending with the string returned
 * by get_today(), to the history field of the INFO structure.
 *
 * Parameters:  info    -  the INFO structure being built
 *              type    -  selects the wording of the line:
 *                           0  "plain file"  (`string' is not used)
 *                           1  "<string>-format entry"
 *                           2  "<string> database entry"
 *                           3  "<string> output"
 *                         any other value adds nothing
 *              string  -  the format, database or program name
 *
 * The line is built in a fixed-size local buffer, so both variable parts
 * are written with an explicit precision of 100 characters.  The longest
 * fixed wording is 40 characters, giving at most 40 + 100 + 100 + 1 = 241
 * bytes, which always fits.  Any input that fit in the previous 128-byte
 * buffer is shorter than those limits and produces the same text as
 * before; longer input is now truncated instead of overrunning the stack.
 */
static void add_retrieval(INFO *info, int type, char *string)
{
  char buffer[256];

  if (type == 0)
    sprintf(buffer, "SEQIO retrieval from plain file.   %.100s", get_today());
  else if (type == 1)
    sprintf(buffer, "SEQIO retrieval from %.100s-format entry.   %.100s",
            string, get_today());
  else if (type == 2)
    sprintf(buffer, "SEQIO retrieval from %.100s database entry.   %.100s",
            string, get_today());
  else if (type == 3)
    sprintf(buffer, "SEQIO retrieval from %.100s output.   %.100s",
            string, get_today());
  else
    return;

  add_history(info, buffer, NULL, 0);
}
10190 
10191 
10192 /*
10193  * Valid alphabet names:
10194  *    1) A prefix of "ds-", "ss-" or "ms-" may precede the name.
10195  *    2) If it ends with "DNA", it's DNA.
10196  *    3) If it ends with "RNA", it's RNA.
10197  *    4) "PRT", "AA", "Amino...", "Protein...", "Peptide..." are Protein
10198  *       (where "..." means that extra text can follow the string).
10199  */
get_alphabet(char * s,char * end)10200 static int get_alphabet(char *s, char *end)
10201 {
10202   char ch;
10203 
10204   if (end == NULL)
10205     for (end=s; *end; end++)
10206       ;
10207 
10208   while (s < end && isspace(*s)) s++;
10209   while (end > s && (isspace(*(end-1)) || *(end-1) == ',')) end--;
10210 
10211   if (end - s > 3 && s[2] == '-' && toupper(s[1]) == 'S' &&
10212       ((ch = toupper(s[0])) == 'D' || ch == 'S' || ch == 'M'))
10213     s += 3;
10214 
10215   if (end - s < 2)
10216     return UNKNOWN;
10217 
10218   if (end - s >= 2 && toupper(*(end-2)) == 'N' && toupper(*(end-1)) == 'A') {
10219     if (end - s == 2)
10220       return DNA;
10221 
10222     ch = toupper(*(end-3));
10223     if (ch == 'D')
10224       return DNA;
10225     else if (ch == 'R')
10226       return RNA;
10227   }
10228 
10229   ch = toupper(s[0]);
10230   if (ch == 'A') {
10231     if ((end - s == 2 && toupper(s[1]) == 'A') ||
10232         (end - s >= 5 && mystreq(s, 'A', "AMINO")))
10233       return PROTEIN;
10234   }
10235   else if (ch == 'P') {
10236     if ((end - s == 3 && mystreq(s+1, 'R', "RT")) ||
10237         (end - s >= 7 && mystreq(s+1, 'R', "ROTEIN")) ||
10238         (end - s >= 7 && mystreq(s+1, 'E', "EPTIDE")))
10239       return PROTEIN;
10240   }
10241 
10242   return UNKNOWN;
10243 }
10244 
/*
 * set_alphabet
 *
 * Record `alpha' as the alphabet of the entry.  Nothing is stored once
 * info->error has been set.
 */
static void set_alphabet(INFO *info, int alpha)
{
  if (info->error)
    return;

  info->info->alphabet = alpha;
}
10250 
/*
 * set_fragment
 *
 * Record `isfrag' as the fragment flag of the entry.  Nothing is stored
 * once info->error has been set.
 */
static void set_fragment(INFO *info, int isfrag)
{
  if (info->error)
    return;

  info->info->isfragment = isfrag;
}
10256 
/*
 * set_circular
 *
 * Record `iscirc' as the circular flag of the entry.  Nothing is stored
 * once info->error has been set.
 */
static void set_circular(INFO *info, int iscirc)
{
  if (info->error)
    return;

  info->info->iscircular = iscirc;
}
10262 
/*
 * set_fragstart
 *
 * Record `pos' as the fragment start position of the entry.  Nothing is
 * stored once info->error has been set.
 */
static void set_fragstart(INFO *info, int pos)
{
  if (info->error)
    return;

  info->info->fragstart = pos;
}
10268 
10269 
/*
 * parse_comment
 *
 * Split the comment text in [start, end) into its ordinary comment lines
 * and the trailing block of "SEQIO" lines, and store the pieces that
 * `flag' asks for:
 *
 *    - The ordinary comment text is passed to add_comment when `flag' is
 *      SEQINFO_ALL or SEQINFO_COMMENT (for SEQINFO_COMMENT the function
 *      then returns).
 *    - The identifiers on leading "SEQIO REFS: " lines are passed to
 *      add_id when `flag' is SEQINFO_ALL, SEQINFO_ALLINFO or
 *      SEQINFO_IDLIST.
 *    - The SEQIO lines after the REFS lines are passed to add_history
 *      when `flag' is SEQINFO_ALL.
 *
 * Parameters:  info       -  the INFO structure being built
 *              start,end  -  the bounds of the comment text
 *              stripflag  -  passed through to add_comment
 *              stripsize  -  how much to skip at the beginning of every
 *                            line after the first:  0 skips nothing, -1
 *                            skips leading whitespace, a positive value
 *                            skips at most that many characters
 *              flag       -  the SEQINFO_* value being retrieved
 */
static void parse_comment(INFO *info, char *start, char *end, int stripflag,
                          int stripsize, int flag)
{
  int i, allflag;
  char *s, *t, *line, *comend, *first_seqio, *init_seqio;

  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * Find the end of the comments and the beginning of the "SEQIO" lines.
   * They may be separated by a blank line.
   *
   * In this loop, `line' is the true start of the current line and `s'
   * is the start of its text after stripping.  A line that is neither a
   * SEQIO line, a continuation of one, nor blank resets all three
   * markers, so only a block of SEQIO lines at the very end of the text
   * is remembered.
   */
  first_seqio = init_seqio = comend = NULL;
  line = start;
  for (s=start; s < end; ) {
    if (mystreq(s, 'S', "SEQIO") ||
        (first_seqio != NULL && *s == ' ' && s[1] == ' ' && s[2] != ' ')) {
      if (!first_seqio) {
        first_seqio = line;
        init_seqio = s;
      }
    }
    else if (*s == '\n') {
      comend = line;
      first_seqio = init_seqio = NULL;
    }
    else
      first_seqio = init_seqio = comend = NULL;

    while (s < end && *s != '\n') s++;
    if (s+1 >= end)
      break;
    line = ++s;

    if (stripsize) {
      if (stripsize == -1)
        while (s < end && isspace(*s) && *s != '\n') s++;
      else
        for (i=0; s < end && *s != '\n' && i < stripsize; i++) s++;
    }
  }

  if (!comend)
    comend = (first_seqio ? first_seqio : end);
  if (comend > start && (flag == SEQINFO_ALL || flag == SEQINFO_COMMENT)) {
    add_comment(info, start, comend, stripflag, stripsize);
    if (flag == SEQINFO_COMMENT)
      return;
  }

  if (first_seqio && (allflag || flag == SEQINFO_IDLIST)) {
    while (init_seqio < end && mystreq(init_seqio, 'S', "SEQIO REFS: ")) {
      for (s=init_seqio+11; s < end && isspace(*s) && *s != '\n'; s++) ;
      if (is_idprefix(s)) {
        /*
         * The identifiers on the line are separated by '|' or ','.
         */
        while (s < end && !isspace(*s)) {
          for (t=s; s < end && !isspace(*s) && *s != '|' && *s != ','; s++) ;
          add_id(info, "", t, s);
          if (s < end && !isspace(*s)) s++;
        }
      }

      /*
       * Move on to the next line whether or not this line held any
       * identifiers.  (Advancing only for lines with identifiers left
       * init_seqio unchanged on a "SEQIO REFS: " line without a valid
       * identifier prefix, and the loop never ended.)
       */
      while (s < end && *s != '\n') s++;

      if (s == end)
        init_seqio = end;
      else {
        s++;
        if (stripsize) {
          if (stripsize == -1)
            while (s < end && isspace(*s) && *s != '\n') s++;
          else
            for (i=0; s < end && *s != '\n' && i < stripsize; i++) s++;
        }
        init_seqio = s;
      }
    }

    if (init_seqio < end && flag == SEQINFO_ALL)
      add_history(info, init_seqio, end, stripsize);
  }
}
10351 
10352 
/*
 * parse_oneline
 *
 * Parse a single-line entry description in [start, end) and store the
 * pieces selected by `info_flag' in the INFO structure.  If `end' is
 * NULL, the line runs up to the NUL terminator of `start'.  Surrounding
 * whitespace is ignored, and an empty line stores nothing.
 *
 * The line is examined in four steps, each described in more detail
 * below:
 *    1) an optional length section at the end of the line
 *       (", <digits> aa|bp|ch (<words>)."),
 *    2) an optional identifier list at the beginning of the line,
 *    3) a trailing "(fragment)", "(fragments)" or "(tentative sequence)"
 *       keyword when no length section was found,
 *    4) the description and organism, separated by " - ".
 *
 * When `info_flag' names a single field, the function returns as soon as
 * the steps that can supply that field are done.
 */
static void parse_oneline(INFO *info, char *start, char *end, int info_flag)
{
  int i, flag, alphabet, tseqflag, num1, num2, allflag;
  char *s, *t, *alphastart, *alphaend;

  if (end == NULL)
    for (end=start; *end; end++) ;

  while (start < end && isspace(*start)) start++;
  while (end > start && isspace(*(end-1))) end--;
  if (start == end)
    return;

  allflag = (info_flag == SEQINFO_ALL || info_flag == SEQINFO_ALLINFO);

  /*
   * First, find and parse the length section, if it exists.
   * The length section is detected by looking for the following things
   * occurring at the end of the string: 1) a comma, 2) the digits of
   * the length, 3) one of "aa", "bp" or "ch", 4) optionally a "(...)"
   * string and 5) optionally a ".".  If any of 1, 2 or 3 is missing,
   * the text at the end of the string is considered part of the
   * organism string (or possibly the description string).
   *
   * The scan runs backwards from the end of the line, so the pieces are
   * checked in the order 5, 4, 3, 2, 1.
   */
  alphastart = NULL;
  alphabet = 0;
  s = end - 1;

  if (*s == '.')
    for (s--; s >= start && isspace(*s); s--) ;

  /*
   * A trailing "(...)" string:  alphastart is left pointing at its '('.
   */
  if (s >= start && *s == ')') {
    for (s--; s >= start && *s != '('; s--) ;
    if (s >= start) {
      alphastart = s--;
      while (s >= start && isspace(*s)) s--;
    }
  }

  /*
   * The two-letter unit.  The assignments to `i' inside the condition
   * are always non-zero, so they only record which unit matched.
   * `flag' stays 1 for as long as the text still looks like a length
   * section.
   */
  flag = 1;
  if (s - 1 >= start) {
    s--;
    if (((toupper(*s) == 'A' && toupper(s[1]) == 'A' && (i = 1)) ||
         (toupper(*s) == 'B' && toupper(s[1]) == 'P' && (i = 2)) ||
         (toupper(*s) == 'C' && toupper(s[1]) == 'H' && (i = 3)))) {
      alphabet = (i == 1 ? PROTEIN : (i == 2 ? DNA : UNKNOWN));
      for (s--; s >= start && isspace(*s); s--) ;
    }
    else
      flag = 0;
  }

  /*
   * The digits of the length.
   */
  if (flag && s >= start) {
    if (isdigit(*s)) {
      while (s >= start && isdigit(*s)) s--;
      while (s >= start && isspace(*s)) s--;
    }
    else
      flag = 0;
  }

  /*
   * The comma.  Once it is found, the length section is cut off the
   * line (end now points at the comma) and alphaend remembers where the
   * line really ended.
   */
  if (flag && s >= start && *s == ',') {
    alphaend = end;
    end = s;

    if (allflag || info_flag == SEQINFO_ALPHABET ||
        info_flag == SEQINFO_CIRCULAR || info_flag == SEQINFO_FRAGMENT ||
        info_flag == SEQINFO_STARTPOS) {
      if (alphabet != UNKNOWN && (allflag || info_flag == SEQINFO_ALPHABET))
        set_alphabet(info, alphabet);

      /*
       * Look at each whitespace-separated word inside the "(...)"
       * string.  A word may be an alphabet name, "FRAGMENT" or
       * "PARTIAL", "CIRCULAR", or "f." followed by a "num1-num2" range
       * whose first number becomes the fragment start position.
       */
      if (alphastart != NULL) {
        s = alphastart + 1;
        while (s < alphaend && isspace(*s)) s++;
        while (s < alphaend && *s != ')') {
          for (t=s; s < alphaend && *s != ')' && !isspace(*s); s++) ;
          if ((alphabet = get_alphabet(t, s)) != UNKNOWN &&
              (allflag || info_flag == SEQINFO_ALPHABET))
            set_alphabet(info, alphabet);

          if ((allflag || info_flag == SEQINFO_FRAGMENT) &&
              (mystreq(t, 'F', "FRAGMENT") || mystreq(t, 'P', "PARTIAL")))
            set_fragment(info, 1);

          if ((allflag || info_flag == SEQINFO_FRAGMENT ||
               info_flag == SEQINFO_STARTPOS) &&
              toupper(*t) == 'F' && t[1] == '.' && isspace(t[2]) &&
              sscanf(t+2, " %d-%d", &num1, &num2) == 2) {
            if (info_flag != SEQINFO_STARTPOS)
              set_fragment(info, 1);
            if (info_flag != SEQINFO_FRAGMENT)
              set_fragstart(info, num1);
            /* Step over the range that sscanf just read. */
            while (s < alphaend && isspace(*s)) s++;
            while (s < alphaend && !isspace(*s)) s++;
          }

          if ((allflag || info_flag == SEQINFO_CIRCULAR) &&
              mystreq(t, 'C', "CIRCULAR"))
            set_circular(info, 1);

          while (s < alphaend && isspace(*s)) s++;
        }
      }
    }
  }
  if (info_flag == SEQINFO_ALPHABET || info_flag == SEQINFO_CIRCULAR ||
      info_flag == SEQINFO_STARTPOS)
    return;

  /*
   * Next, check for a list of identifiers at the beginning of the string.
   * The identifier list is of the following form:
   *
   *   1) It may begin with a '~', followed by 5-12 alphanumeric
   *      characters.  That string is treated as an accession number.
   *      That string (and a following '|') is skipped before checking
   *      for parts 2a or 2b.
   *   2a) If 2-4 alphanumeric characters are then followed by a ':',
   *       the initial non-whitespace segment is treated as a SEQIO
   *       identifier list.
   *   2b) If 2-3 alphanumeric characters are then followed by a '|'
   *       the initial non-whitespace segment is treated as an NCBI
   *       Search Format identifier list.
   */
  s = start;
  if (*s == '~') {
    /* s - t counts the '~' too, hence 6-13 rather than 5-12. */
    for (t=s++; s < end && isalnum(*s); s++) ;
    if (s - t >= 6 && s - t <= 13) {
      if (allflag || info_flag == SEQINFO_IDLIST)
        add_id(info, "acc", t+1, s);
      if (*s == '|')
        s++;
    }
    else
      s = t;
  }

  if (is_idprefix(s)) {
    /*
     * SEQIO identifier list:  identifiers separated by '|'.  The list
     * stops at the first piece that does not start with an identifier
     * prefix, and the rest of that word is skipped.
     */
    while (s < end && !isspace(*s)) {
      for (t=s; s < end && !isspace(*s) && *s != '|'; s++) ;
      if (allflag || info_flag == SEQINFO_IDLIST)
        add_id(info, "", t, s);
      if (s < end && *s == '|') {
        s++;
        if (!is_idprefix(s)) {
          while (s < end && !isspace(*s)) s++;
          break;
        }
      }
    }
  }
  else if (is_ncbiprefix(s)) {
    /*
     * NCBI identifier list:  whatever parse_ncbi_idlist does not
     * consume of the first word is skipped.
     */
    if (allflag || info_flag == SEQINFO_IDLIST)
      s = parse_ncbi_idlist(info, s, end);
    while (s < end && !isspace(*s)) s++;
  }

  /*
   * Check the end of the line for one of the keywords "(fragment)",
   * "(fragments)" or "(tentative sequence)" in the cases where no
   * length segment was found.  This is the case when alphastart is
   * set and is less than the end of the line.
   *
   * NOTE(review): the new end is alphastart-1, i.e. the character just
   * before the '(' is dropped as well.  That looks like it assumes a
   * space always precedes the keyword -- confirm.
   */
  tseqflag = 0;
  if ((allflag || info_flag == SEQINFO_FRAGMENT ||
       info_flag == SEQINFO_DESCRIPTION || info_flag == SEQINFO_ORGANISM) &&
      alphastart != NULL && alphastart < end) {
    if (mystreq(alphastart, '(', "(FRAGMENT)") ||
        mystreq(alphastart, '(', "(FRAGMENTS)")) {
      if (info_flag != SEQINFO_DESCRIPTION)
        set_fragment(info, 1);
      for (end=alphastart-1; end > start && isspace(*(end-1)); end--) ;
    }
    else if (mystreq(alphastart, '(', "(TENTATIVE SEQUENCE)")) {
      tseqflag = 1;
      for (end=alphastart-1; end > start && isspace(*(end-1)); end--) ;
    }
  }

  if (info_flag == SEQINFO_FRAGMENT)
    return;

  /*
   * Finally, find the description and organism sections in the rest of the
   * line.  The separation between description and organism occurs at
   * the string " - ", and if that doesn't appear, then the rest of the text
   * is considered the description.
   *
   * NOTE(review): s[-1] is read even when s is still at the first
   * character of the remaining text, so a text beginning with '-' looks
   * at the character in front of it -- confirm that this is always
   * inside the caller's buffer.
   */
  for (start=s; start < end && isspace(*start); start++) ;
  s = start;
  while (s + 1 < end && !(*s == '-' && s[-1] == ' ' && s[1] == ' '))
    s++;

  if (s + 1 < end) {
    if (allflag || info_flag == SEQINFO_ORGANISM)
      add_organism(info, s+1, end);
    end = s - 1;
  }

  if (allflag || info_flag == SEQINFO_DESCRIPTION) {
    add_description(info, start, end);
    if (tseqflag)
      add_description(info, "(tentative sequence)", NULL);
  }
}
10558 
10559 
/*
 * The NCBI Search Format identifier tags that parse_ncbi_idlist looks up
 * by table.  ("pat|" and "gnl|" are handled separately there.)
 *
 *    prefix  -  the tag, including its '|'
 *    len     -  the number of characters in prefix
 *    type1   -  the identifier prefix given to the first field after the
 *               tag
 *    type2   -  NULL if the tag has only one field.  Otherwise a second
 *               field must follow, and it is stored under this prefix
 *               unless the prefix begins with '-'.
 */
#define ncbi_idtable_size 13
static struct {
  char *prefix;
  int len;
  char *type1;
  char *type2;
} ncbi_idtable[ncbi_idtable_size] = {
  { "gi|",   3,  "gi",   NULL   },
  { "bbs|",  4,  "bbs",  NULL   },
  { "bbm|",  4,  "bbm",  NULL   },
  { "gb|",   3,  "acc",  "gb"   },
  { "gp|",   3,  "acc",  "gp"   },
  { "emb|",  4,  "acc",  "embl" },
  { "pir|",  4,  "acc",  "pir"  },
  { "sp|",   3,  "acc",  "sp"   },
  { "dbj|",  4,  "acc",  "ddbj" },
  { "prf|",  4,  "acc",  "prf"  },
  { "pdb|",  4,  "pdb",  "-"    },
  { "oth|",  4,  "acc",  "oth"  },
  { "lcl|",  4,  "oth",  NULL   }
};
10580 
/*
 * is_ncbiprefix
 *
 * Return 1 if `s' begins with two or three alphabetic characters
 * followed by a '|', and 0 otherwise.  The characters are examined left
 * to right and the test stops at the first one that does not fit, so
 * nothing past the terminator of a shorter string is read.
 */
static int is_ncbiprefix(char *s)
{
  if (!isalpha(s[0]) || !isalpha(s[1]))
    return 0;

  if (s[2] == '|')
    return 1;

  return (isalpha(s[2]) && s[3] == '|');
}
10586 
/*
 * parse_ncbi_idlist
 *
 * Parse the '|'-separated NCBI Search Format identifiers at the
 * beginning of [s, end) and pass each one to add_id.  Parsing stops at
 * the first whitespace character, at the end of the text, when five or
 * fewer characters remain, or at the first tag that is not recognized or
 * is missing one of its fields.
 *
 * Three kinds of tags are recognized:
 *    - the tags in ncbi_idtable, which are followed by one or two fields,
 *    - "pat|", which is followed by three fields, and
 *    - "gnl|", which is followed by two fields.
 *
 * Returns a pointer to where parsing stopped.  When a tag is missing a
 * field, that is the start of the tag's first field.
 *
 * For "pat|" and "gnl|", the '|' characters between the fields are
 * overwritten while add_id is called and restored afterwards, so the
 * text must be writable.
 */
static char *parse_ncbi_idlist(INFO *info, char *s, char *end)
{
  int i;
  char *t, *t2, *t3, *pref1, *pref2;

  while (s < end && !isspace(*s)) {
    if (end - s <= 5)
      return s;

    for (i=0; i < ncbi_idtable_size; i++)
      if (myncasecmp(s, ncbi_idtable[i].prefix, ncbi_idtable[i].len) == 0)
        break;

    if (i < ncbi_idtable_size) {
      pref1 = ncbi_idtable[i].type1;
      pref2 = ncbi_idtable[i].type2;

      /*
       * [t, s) becomes the first field.  It must not be empty, and when
       * the tag takes two fields it must be followed by a '|'.
       */
      s += ncbi_idtable[i].len;
      for (t=s; s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t == s || (pref2 != NULL && (s == end || isspace(*s))))
        return t;

      if (pref2 == NULL)
        add_id(info, pref1, t, s);
      else {
        /*
         * [t2, s) becomes the second field.  It is added before the
         * first field, and not at all when pref2 begins with '-'.
         * t2-1 is the '|' that ends the first field.
         */
        for (t2=++s; s < end && !isspace(*s) && *s != '|'; s++) ;
        if (t2 == s)
          return t;

        if (pref2 != NULL && pref2[0] != '-')
          add_id(info, pref2, t2, s);
        add_id(info, pref1, t, t2-1);
      }

      if (s < end && *s == '|')
        s++;
    }
    else if (mystreq(s, 'P', "PAT|")) {
      /*
       * Here t2 and t3 point at the '|' characters in front of the
       * second and third fields, not at the fields themselves.
       *
       * NOTE(review): because s is advanced in the same expression that
       * sets t2 (and t3), the tests "t2 == s" and "t3 == s" below can
       * never be true, so an empty second or third field is not
       * rejected -- confirm whether that is intended.
       */
      for (t=(s+=4); s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t == s || s == end || isspace(*s))
        return t;

      for (t2=s++; s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t2 == s || s == end || isspace(*s))
        return t;

      for (t3=s++; s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t3 == s)
        return t;

      /*
       * Join the three fields with '.' for add_id, then put the '|'
       * characters back.
       *
       * NOTE(review): unlike the table branch above, a '|' after the
       * last field is not skipped here or in the "gnl|" branch, so an
       * identifier following one of these tags does not appear to be
       * parsed -- confirm whether that is intended.
       */
      *t2 = *t3 = '.';
      add_id(info, "pat", t, s);
      *t2 = *t3 = '|';
    }
    else if (mystreq(s, 'G', "GNL|")) {
      /*
       * t2 points at the '|' in front of the second field.
       */
      for (t=(s+=4); s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t == s || s == end || isspace(*s))
        return t;

      for (t2=s++; s < end && !isspace(*s) && *s != '|'; s++) ;
      if (t2 == s)
        return t;

      /*
       * Join the two fields with ':' for add_id, then put the '|' back.
       */
      *t2 = ':';
      add_id(info, "oth", t, s);
      *t2 = '|';
    }
    else
      return s;
  }

  return s;
}
10660 
10661 
/*
 * parse_gcg_oneline
 *
 * Scan the whitespace-separated words of a GCG information line and
 * store what they describe in the INFO structure.
 *
 * Parameters:  info  -  the INFO structure being built
 *              line  -  the text of the line.  Every scanning loop stops
 *                       only at a newline, so the line must contain one.
 *              flag  -  the SEQINFO_* value being retrieved
 *
 * The words are handled as follows:
 *    "MSF: ", "Length: ", "Check: "
 *          -  the keyword and the value after it are skipped.
 *    "Type: "
 *          -  a value beginning with 'N' sets the alphabet to DNA and
 *             one beginning with 'P' sets it to PROTEIN (when `flag' is
 *             SEQINFO_ALL, SEQINFO_ALLINFO or SEQINFO_ALPHABET).
 *    a month name (as recognized by isamonth) followed by "DD, YYYY"
 *          -  stored as the date in the form "DD-Mon-YYYY" (when `flag'
 *             is SEQINFO_ALL, SEQINFO_ALLINFO or SEQINFO_DATE and no
 *             date has been set yet).
 *    any other word, if it is the first word on the line
 *          -  stored as the description, if no description has been set
 *             yet.
 */
void parse_gcg_oneline(INFO *info, char *line, int flag)
{
  int i, gcgflag, month;
  char *s, *t, date[16];

  for (s=line; *s != '\n' && isspace(*s); s++) ;

  /*
   * Each pass through the loop handles one word.  The branches step over
   * a keyword, and the code at the bottom of the loop then steps over
   * the word that follows it.  gcgflag is 0 only for the first word.
   */
  gcgflag = 0;
  while (*s != '\n') {
    if (mystreq(s, 'M', "MSF: ")) {
      while (*s != '\n' && !isspace(*s)) s++;
      while (*s != '\n' && isspace(*s)) s++;
    }
    else if (mystreq(s, 'L', "LENGTH: ")) {
      while (*s != '\n' && !isspace(*s)) s++;
      while (*s != '\n' && isspace(*s)) s++;
    }
    else if (mystreq(s, 'T', "TYPE: ")) {
      while (*s != '\n' && !isspace(*s)) s++;
      while (*s != '\n' && isspace(*s)) s++;
      if (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
          flag == SEQINFO_ALPHABET) {
        if (toupper(*s) == 'N')
          set_alphabet(info, DNA);
        else if (toupper(*s) == 'P')
          set_alphabet(info, PROTEIN);
      }
    }
    else if (mystreq(s, 'C', "CHECK: ")) {
      while (*s != '\n' && !isspace(*s)) s++;
      while (*s != '\n' && isspace(*s)) s++;
    }
    else if ((month = isamonth(s)) != 0) {
      while (*s != '\n' && !isspace(*s)) s++;
      while (*s != '\n' && isspace(*s)) s++;

      /*
       * The month name must be followed by exactly "DD, YYYY" and a
       * whitespace character.
       */
      if ((flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
           flag == SEQINFO_DATE) && !info->error && info->info->date == NULL &&
          isdigit(*s) && isdigit(s[1]) && s[2] == ',' &&
          s[3] == ' ' && isdigit(s[4]) && isdigit(s[5]) &&
          isdigit(s[6]) && isdigit(s[7]) && isspace(s[8])) {
        date[0] = *s;
        date[1] = s[1];
        date[2] = '-';
        date[3] = months[month][0];
        date[4] = months[month][1];
        date[5] = months[month][2];
        date[6] = '-';
        date[7] = s[4];
        date[8] = s[5];
        date[9] = s[6];
        date[10] = s[7];
        date[11] = '\0';
        set_date(info, date, date + 11);
      }

      /*
       * Step over the eight characters of "DD, YYYY", but never past the
       * newline.  The text after a month name is not always a complete
       * date, and an unconditional jump of eight characters could land
       * beyond the end of the line, after which no loop here would stop.
       */
      for (i=0; i < 8 && *s != '\n'; i++) s++;
      while (*s != '\n' && isspace(*s)) s++;
    }
    else if (gcgflag == 0) {
      if (!info->error && info->info->description == NULL) {
        for (t=s; !isspace(*t); t++) ;
        add_description(info, s, t);
      }
    }

    while (*s != '\n' && !isspace(*s)) s++;
    while (*s != '\n' && isspace(*s)) s++;
    gcgflag = 1;
  }
}
10731 
10732 
10733 
/*
 * A small line reader used while extracting information from an entry
 * held in memory.
 *
 *    gistring    -  the beginning of the text given to gi_startline
 *    giptr       -  the beginning of the next line to return
 *    giend       -  the end of the text
 *    gilastline  -  the line handed back with gi_ungetline, or NULL
 */
static char *gistring;
static char *giptr;
static char *giend;
static char *gilastline;

/*
 * gi_startline
 *
 * Begin reading lines from the `len' characters starting at `s'.
 */
static void gi_startline(char *s, int len)
{
  gistring = s;
  giptr = s;
  giend = s + len;
  gilastline = NULL;
}

/*
 * gi_getline
 *
 * Get the next line of the text.  On return, *line_out points to the
 * beginning of the line and *end_out points to the newline that ends it
 * (or to the end of the text, for a last line without a newline).
 *
 * If `flag' is non-zero, every following line that begins with a
 * whitespace character is treated as a continuation and is included in
 * the line returned.
 *
 * Returns 1 if a line was found and 0 if the text is used up.
 */
static int gi_getline(char **line_out, char **end_out, int flag)
{
  char *p;

  /*
   * A line handed back with gi_ungetline is read again first.
   */
  if (gilastline != NULL) {
    giptr = gilastline;
    gilastline = NULL;
  }

  if (giptr >= giend)
    return 0;

  /*
   * Each pass finds the next newline.  Moving on to the character after
   * it (the p++ of the for statement) only happens when that character
   * starts a continuation line.
   */
  for (p=giptr; ; p++) {
    while (p < giend && *p != '\n')
      p++;

    if (!flag || p+1 >= giend || !isspace(p[1]))
      break;
  }

  *line_out = giptr;
  *end_out = p;
  giptr = (p < giend ? p + 1 : p);

  return 1;
}

/*
 * gi_ungetline
 *
 * Hand `line' back, so that the next call to gi_getline starts reading
 * there.
 */
static void gi_ungetline(char *line)
{
  gilastline = line;
}
10776 
10777 
10778 
10779 
/*
 * raw_getinfo
 *
 * Build the INFO structure for the current entry of a file whose entries
 * carry no header information.  Everything stored comes from the
 * INTSEQFILE structure itself:  the file, database and format names, the
 * entry and sequence counts, the database alphabet, the sequence lengths
 * and, as the description, the file name without its directory path.
 *
 * Parameters:  isfp   -  the open sequence file
 *              entry  -  the text of the entry (not used)
 *              len    -  the length of the entry (not used)
 *              flag   -  the SEQINFO_* value being retrieved
 *
 * Returns STATUS_OK on success.  If the lengths have to be computed and
 * the format's getseq function returns anything other than STATUS_OK or
 * STATUS_WARNING, that status is returned.  The memory_error call at the
 * end is given `return STATUS_FATAL' for the case where info.error is
 * set.
 */
static int raw_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int status;
  char *s, *t;
  INFO info;

  start_info(&info, isfp, flag);

  if (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
       flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the lengths of the sequence, if they are wanted and are not
   * known yet.
   */
  if (((flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
        flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
        flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);

    /*
     * Give up only when the status is neither STATUS_OK nor
     * STATUS_WARNING.  (Joining the two tests with || is true for every
     * status, which made the function return here even on success.)
     */
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto RAW_GI_END;
  }

  /*
   * Store the name of the file (minus any path information) as the
   * description of the entry.  Only '/' is treated as a path separator.
   */
  if (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO ||
      flag == SEQINFO_DESCRIPTION) {
    for (s=t=isfp->filename; *s; s++)
      if (*s == '/')
        t = s + 1;
    add_description(&info, t, s);
  }

  /*
   * Finish the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 0, NULL);
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

RAW_GI_END:
  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
10850 
10851 
/*
 * genbank_getinfo
 *
 * Build the INFO structure for the GenBank entry held in the `len'
 * characters starting at `entry', storing the fields selected by `flag'.
 *
 * Parameters:  isfp   -  the open sequence file
 *              entry  -  the text of the entry, which must begin with
 *                        "LOCUS "
 *              len    -  the length of the entry
 *              flag   -  the SEQINFO_* value being retrieved
 *
 * Returns STATUS_OK on success.  The raise_error calls are given
 * `return STATUS_ERROR' for an entry that does not begin with "LOCUS "
 * or that ends before an "ORIGIN" line is found.  If the lengths have to
 * be computed and the format's getseq function returns anything other
 * than STATUS_OK or STATUS_WARNING, that status is returned.  The
 * memory_error call at the end is given `return STATUS_FATAL' for the
 * case where info.error is set.
 *
 * When `flag' names a single field, the function jumps to
 * GENBANK_GI_END as soon as the line supplying that field has been
 * handled.
 */
static int genbank_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int status, allflag;
  char *s, *t, *line, *end;
  INFO info;

  if (!mystreq(entry, 'L', "LOCUS ")) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of GenBank "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of GenBank "
                              "entry.\n"));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the lengths of the sequence, if they are wanted and are not
   * known yet.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto GENBANK_GI_END;
  }

  /*
   * The LOCUS line contains an identifier, the alphabet type, a "circular"
   * flag and the date.
   * The ACCESSION line contains the accession number(s).
   * The DEFINITION line contains the description string.
   * The ORGANISM line of the SOURCE structure contains the organism name.
   * The COMMENT line contains any or all of an identifier list, comment
   * lines and history lines.
   *
   * gi_getline is called with its flag set, so each "line" handled below
   * is a keyword line together with the continuation lines (the lines
   * beginning with whitespace) that follow it.  The loop ends at the
   * ORIGIN line, or with status 0 if the text runs out first.
   */
  gi_startline(entry, len);
  while ((status = gi_getline(&line, &end, 1)) &&
         (line[0] != 'O' || strncmp(line, "ORIGIN", 6) != 0)) {
    switch (line[0]) {
    case 'A':
      if (strncmp(line, "ACCESSION", 9) == 0) {
        if (allflag || flag == SEQINFO_IDLIST) {
          /* Every whitespace-separated word is an accession number. */
          s = line + 12;
          while (s < end && isspace(*s)) s++;
          while (s < end) {
            for (t=s; s < end && !isspace(*s); s++) ;
            add_id(&info, "acc", t, s);
            while (s < end && isspace(*s)) s++;
          }
        }
      }
      break;

    case 'C':
      if (strncmp(line, "COMMENT", 7) == 0) {
        if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_COMMENT) {
          /*
           * NOTE(review): the text is taken from line+7, right after the
           * keyword, while the other keyword lines use line+12 and the
           * continuation lines are stripped of 12 characters -- confirm
           * that the offset of 7 is intended.
           */
          parse_comment(&info, line+7, end, 1, 12, flag);
          if (flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST)
            goto GENBANK_GI_END;
        }
      }
      break;

    case 'D':
      if (strncmp(line, "DEFINITION", 10) == 0) {
        if (allflag || flag == SEQINFO_DESCRIPTION) {
          add_description(&info, line+12, end);
          if (flag == SEQINFO_DESCRIPTION)
            goto GENBANK_GI_END;
        }
      }
      break;

    case 'L':
      if (strncmp(line, "LOCUS", 5) == 0) {
        /*
         * The fields of the LOCUS line are read from fixed offsets:
         * the identifier from 12 to 22, the alphabet from 36 to 40,
         * "circular" at 42 and the date from 62 to 73.
         *
         * NOTE(review): the offsets are not checked against `end', so
         * this presumably relies on the LOCUS line always being long
         * enough -- confirm.
         */
        if (allflag || flag == SEQINFO_IDLIST)
          add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "gb"),
                 line+12, line+22);

        if (allflag || flag == SEQINFO_ALPHABET) {
          set_alphabet(&info, get_alphabet(line+36, line+40));
          if (flag == SEQINFO_ALPHABET)
            goto GENBANK_GI_END;
        }

        if ((allflag || flag == SEQINFO_CIRCULAR) &&
            !strncmp(line+42, "circular", 8)) {
          set_circular(&info, 1);
          if (flag == SEQINFO_CIRCULAR)
            goto GENBANK_GI_END;
        }

        if (allflag || flag == SEQINFO_DATE) {
          set_date(&info, line+62, line+73);
          if (flag == SEQINFO_DATE)
            goto GENBANK_GI_END;
        }
      }
      break;

    case 'N':
    case 'P':
      /* "NID" and "PID" lines hold identifiers of type "nid" and "pid". */
      if (toupper(line[1]) == 'I' && toupper(line[2]) == 'D' &&
          isspace(line[3]) && (allflag || flag == SEQINFO_IDLIST))
        add_id(&info, (line[0] == 'N' ? "nid" : "pid"), line+12, end);
      break;

    case 'S':
      if (strncmp(line, "SOURCE", 6) == 0) {
        if (allflag || flag == SEQINFO_ORGANISM) {
          /*
           * The organism name is the rest of the "  ORGANISM" line found
           * among the continuation lines of the SOURCE line.
           */
          for (s=line+6; s < end; s++) {
            if (*s == '\n' && strncmp(s, "\n  ORGANISM", 11) == 0) {
              s += 11;
              for (t=s; s < end && *s != '\n'; s++) ;
              add_organism(&info, t, s);
              break;
            }
          }
          if (flag == SEQINFO_ORGANISM)
            goto GENBANK_GI_END;
        }
      }
      break;
    }
  }

  /*
   * The text ran out before an ORIGIN line was found.
   *
   * NOTE(review): unlike the getseq failure above, this path returns
   * without calling finish_info on the INFO structure that start_info
   * set up -- confirm whether anything needs to be released here.
   */
  if (status == 0) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of GenBank "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of GenBank "
                              "entry.\n"));
    }
  }

  /*
   * Add the complete header as a comment if SEQINFO_ALLINFO is specified.
   * At this point, `line' is the ORIGIN line, so the header runs from the
   * beginning of the entry through the end of that line.
   */
  if (flag == SEQINFO_ALLINFO) {
    for (s=line; s < end && *s != '\n'; s++) ;
    add_comment(&info, entry, s+1, 0, 0);
  }

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

GENBANK_GI_END:
  /*
   * Finish the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "GenBank");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
11051 
11052 
/*
 * pir_getinfo
 *
 * Extract header information from a PIR-format entry into an INFO
 * structure (set up by start_info and completed by finish_info).
 *
 * Parameters:  isfp   -  the sequence file structure the entry belongs to
 *              entry  -  the text of the entry (must begin with "ENTRY ")
 *              len    -  the length of the entry text
 *              flag   -  a SEQINFO_* value; SEQINFO_ALL and SEQINFO_ALLINFO
 *                        request every field, any other value requests a
 *                        single field and lets the function stop as soon
 *                        as that field has been handled
 *
 * Returns:  STATUS_OK normally.  The raise_error calls are handed
 *           "return STATUS_ERROR" for a badly formed entry, memory_error
 *           is handed "return STATUS_FATAL" when info.error is set, and
 *           a failing status from the getseq function is passed through.
 */
static int pir_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int status, allflag;
  char *s, *t, *line, *end;
  INFO info;

  if (!mystreq(entry, 'E', "ENTRY ")) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of PIR "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of PIR "
                              "entry.\n"));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * Fields that come from the file structure rather than the entry text.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL) {
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));
    if (flag == SEQINFO_ALPHABET)
      goto PIR_GI_END;
  }

  /*
   * Compute the sequence lengths only if they are wanted and have not
   * already been computed for this entry.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[FORMAT_PIR].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto PIR_GI_END;
  }

  /*
   * The ENTRY line contains the idnum number.  The ACCESSION line contains
   * the accession number.  The ORGANISM line contains the organism name
   * in the "#formal_name" sub-field.  The TITLE line contains the
   * description.
   */
  gi_startline(entry, len);
  while ((status = gi_getline(&line, &end, 1)) &&
          (line[0] != 'S' || strncmp(line, "SEQUENCE", 8) != 0)) {
    switch (line[0]) {
    case 'A':
      if (strncmp(line, "ACCESSION", 9) == 0) {
        if (allflag || flag == SEQINFO_IDLIST) {
          /* Accession numbers are separated by whitespace and/or ';'. */
          s = line + 10;
          while (s < end && isspace(*s)) s++;
          while (s < end) {
            for (t=s; s < end && !isspace(*s) && *s != ';'; s++) ;
            add_id(&info, "acc", t, s);
            while (s < end && (isspace(*s) || *s == ';')) s++;
          }
        }
      }
      break;

    case 'C':
      if (strncmp(line, "COMMENT", 7) == 0) {
        if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_COMMENT)
          parse_comment(&info, line+7, end, 1, 11, flag);
      }
      break;

    case 'D':
      if (strncmp(line, "DATE", 4) == 0) {
        if (allflag || flag == SEQINFO_DATE) {
          for (s=line+4; s < end && isspace(*s); s++) ;

          /*
           * A date occupies 11 characters.  Use "<=" so that a line whose
           * text ends exactly at the end of a single date is accepted.
           */
          if (s + 11 <= end) {
            /*
             * Each '#' starts a sub-field keyword followed by another
             * date; the last one that still has 11 characters available
             * replaces the date chosen so far.
             */
            for (t=s; t < end; t++) {
              if (*t == '#') {
                while (t < end && !isspace(*t)) t++;
                while (t < end && isspace(*t)) t++;
                if (t+11 <= end)
                  s = t;
              }
            }
            set_date(&info, s, s+11);
          }
          if (flag == SEQINFO_DATE)
            goto PIR_GI_END;
        }
      }
      break;

    case 'E':
      if (strncmp(line, "ENTRY", 5) == 0) {
        if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_FRAGMENT) {
          for (s=line+5; s < end && isspace(*s); s++) ;
          for (t=s; s < end && !isspace(*s); s++) ;

          add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "pir"), t, s);

          /* Scan the '#' sub-fields for "#type fragment". */
          while (1) {
            while (s < end && *s != '#') s++;
            if (s == end)
              break;

            if (strncmp(s, "#type ", 6) == 0) {
              for (s+=6; s < end && isspace(*s); s++) ;
              if (s < end && strncmp(s, "fragment", 8) == 0)
                set_fragment(&info, 1);
              if (flag == SEQINFO_FRAGMENT)
                goto PIR_GI_END;
              break;
            }

            s++;
          }
        }
      }
      break;

    case 'O':
      if (strncmp(line, "ORGANISM", 8) == 0 &&
          (allflag || flag == SEQINFO_ORGANISM) && !info.error) {
        /* Clear any organism recorded earlier before adding this one. */
        if (info.info->organism != NULL)
          info.info->organism = NULL;

        for (s=line+8; s < end; s++)
          if (strncmp(s, "#formal_name", 12) == 0)
            break;

        /*
         * Only step over the keyword when it was actually found;
         * otherwise s is already at the end of the line and advancing it
         * by 12 would form a pointer beyond the line.
         */
        if (s < end) {
          for (t=s+=12; s < end && *s != '#'; s++) ;
          add_organism(&info, t, s);
        }

        if (flag == SEQINFO_ORGANISM)
          goto PIR_GI_END;
      }
      break;

    case 'T':
      if (strncmp(line, "TITLE", 5) == 0) {
        if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_ORGANISM ||
            flag == SEQINFO_FRAGMENT || flag == SEQINFO_CIRCULAR ||
            flag == SEQINFO_DESCRIPTION) {
          parse_oneline(&info, line+6, end, flag);

          if (flag == SEQINFO_DESCRIPTION || flag == SEQINFO_FRAGMENT ||
              flag == SEQINFO_CIRCULAR)
            goto PIR_GI_END;
        }
      }
      break;
    }
  }

  /*
   * Add the complete header as a comment if SEQINFO_ALLINFO is specified.
   *
   * NOTE(review): no error is raised here when the loop above ended
   * because gi_getline returned 0 (no SEQUENCE line seen); line and end
   * then hold whatever the last successful call left -- confirm that this
   * is the intended handling.
   */
  if (flag == SEQINFO_ALLINFO) {
    for (s=line; s < end && *s != '\n'; s++) ;
    add_comment(&info, entry, s+1, 0, 0);
  }

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

PIR_GI_END:
  /*
   * Finish the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "PIR");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
11255 
11256 /*
11257  * This is commented out until I decide how to handle the reference id's.
11258  *
11259 
11260 #define dr_table_size 24
11261 static struct {
11262   char *name, *type1, *type2;
11263 } dr_table[dr_table_size] = {
11264   { "AARHUS/GHENT-2DPAGE",  "ag2d",  NULL },
11265   { "AGIS",                 "acc",   NULL },
11266   { "CPGISLE",              "cpg",   NULL },
11267   { "DICTYDB",              "acc",   "ddb" },
11268   { "ECO2DBASE",            "e2d",   NULL },
11269   { "ECOGENE",              "acc",   "eco" },
11270   { "EMBL",                 "acc",   "embl" },
11271   { "EPD",                  "epd",   NULL },
11272   { "FLYBASE",              "fly",   NULL },
11273   { "GCRDB",                "gcr",   NULL },
11274   { "GENBANK",              "acc",   "gb" },
11275   { "HIV",                  "acc",   "hiv" },
11276   { "IMGT/LIGM",            "acc",   NULL },
11277   { "MAIZEDB",              "mdb",   NULL },
11278   { "MIM",                  "mim",   NULL },
11279   { "PDB",                  "pdb",   NULL },
11280   { "PIR",                  "acc",   "pir" },
11281   { "PROSITE",              "acc",   "pros" },
11282   { "REBASE",               "reb",   NULL },
11283   { "SWISS-2DPAGE",         "acc",   NULL },
11284   { "SWISS-PROT",           "acc",   "sp" },
11285   { "TRANSFAC",             "acc",   "tfd" },
11286   { "WORMPEP",              NULL,    "wpep" },
11287   { "YEPD",                 "yepd",  NULL }
11288 };
11289 *
11290 */
11291 
/*
 * embl_getinfo
 *
 * Extract header information from an EMBL or Swiss-Prot style entry
 * (two-letter line codes followed by three blanks) into an INFO structure
 * (set up by start_info and completed by finish_info).
 *
 * Parameters:  isfp   -  the sequence file structure the entry belongs to
 *              entry  -  the text of the entry (must begin with "ID   ")
 *              len    -  the length of the entry text
 *              flag   -  a SEQINFO_* value; SEQINFO_ALL and SEQINFO_ALLINFO
 *                        request every field, any other value requests a
 *                        single field
 *
 * Returns:  STATUS_OK normally.  The raise_error calls are handed
 *           "return STATUS_ERROR" for a badly formed entry, memory_error
 *           is handed "return STATUS_FATAL" when info.error is set, and
 *           a failing status from the getseq function is passed through.
 */
static int embl_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int alpha, period, count, os_flag, status, allflag;
  char *s, *t, *line, *end, *lastline, *prefix;
  INFO info;

  if (!mystreq(entry, 'I', "ID   ")) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of "
                              "EMBL/Swiss-Prot entry.\n", isfp->filename,
                              isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of "
                              "EMBL/Swiss-Prot entry.\n"));
    }
  }

  prefix = NULL;
  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * Fields that come from the file structure rather than the entry text.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  /*
   * No early exit for SEQINFO_ALPHABET here:  the ID line is still
   * examined below and may set the alphabet again.
   */
  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they are wanted and have not
   * already been computed for this entry.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto EMBL_GI_END;
  }

  /*
   * The ID line contains an identifier and possibly the alphabet and whether
   * it is circular.
   * The AC line(s) contain accession numbers.
   * The NI and PI lines contain NID and PID identifiers.
   * The DT line(s) contain dates (we assume the last date is the latest).
   * The DE line(s) contain the description.
   * The OS line(s) contain the organism name.
   * The CC and XX lines contain comments.
   * The DR lines may contain cross-references (see dr_table above).
   */
  gi_startline(entry, len);
  os_flag = 0;
  while ((status = gi_getline(&line, &end, 0)) &&
         !(line[0] == '/' && line[1] == '/') && !mystreq(line, 'S', "SQ   ")) {
    /* Only lines of the form "XX   text" are examined. */
    if (!isspace(line[2]) || !isspace(line[3]) || !isspace(line[4]))
      continue;

    switch (line[0]) {
    case 'A':
      if (line[1] == 'C' && (allflag || flag == SEQINFO_IDLIST)) {
        /* Accession numbers are separated by whitespace and/or ';'. */
        for (s=line+5; s < end && isspace(*s); s++) ;
        while (s < end) {
          for (t=s; s < end && !isspace(*s) && *s != ';'; s++) ;
          add_id(&info, "acc", t, s);
          while (s < end && (isspace(*s) || *s == ';')) s++;
        }
      }
      break;

    case 'C':
      if (line[1] == 'C' &&
          (allflag || flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST)) {
        for (s=line+5; s < end && isspace(*s); s++) ;
        if (s < end) {
          /*
           * Gather the following CC lines, reading through interleaved
           * XX lines, and remember where the last CC line ended.
           */
          lastline = end;
          while (gi_getline(&line, &end, 0)) {
            if (mystreq(line, 'C', "CC   "))
              lastline = end;
            else if (!mystreq(line, 'X', "XX"))
              break;
          }
          gi_ungetline(line);
          parse_comment(&info, s, lastline, 1, 5, flag);
        }
      }
      break;

    case 'D':
      if (line[1] == 'E' &&
          (allflag || flag == SEQINFO_DESCRIPTION ||
           flag == SEQINFO_FRAGMENT)) {
        add_description(&info, line+5, end);
        while (gi_getline(&line, &end, 0) && mystreq(line, 'D', "DE   "))
          add_description(&info, line+5, end);
        gi_ungetline(line);

        /*
         * A trailing "(FRAGMENT)" or "(FRAGMENTS)" marks the sequence as
         * a fragment and is removed from the stored description.
         */
        if (!info.error && info.info->description) {
          for (t=s=info.info->description; *s; s++) ;
          while (s > t && isspace(*(s-1))) s--;

          /*
           * The 11-character "(FRAGMENTS)" test is only made when the
           * description holds at least 11 characters, so that s-11 never
           * points before the start of the string.
           */
          if (s - t >= 10 && *(s-1) == ')' &&
              (mystreq(s-10, '(', "(FRAGMENT)") ||
               (s - t >= 11 && mystreq(s-11, '(', "(FRAGMENTS)")))) {
            set_fragment(&info, 1);
            s -= (*(s-10) == '(' ? 10 : 11);
            while (s > t && isspace(*(s-1))) s--;
            if (s > t)
              *s = '\0';
            else
              info.info->description = NULL;
            if (flag == SEQINFO_FRAGMENT)
              goto EMBL_GI_END;
          }
        }

        if (flag == SEQINFO_DESCRIPTION)
          goto EMBL_GI_END;
      }
      else if (line[1] == 'T' &&
               (allflag || flag == SEQINFO_DATE)) {
        /*
         * Every DT line overwrites the date, so the last one is kept.
         * The test uses allflag so that SEQINFO_ALLINFO also receives the
         * date, as every other field in this function does.
         */
        for (s=line+5; s < end && isspace(*s); s++) ;
        for (t=s; s < end && !isspace(*s) && *s != ';' && *s != '.'; s++) ;
        set_date(&info, t, s);
      }

     /*
      * Like above, this is commented out until I can figure out
      * how to handle reference id's.
      *
      else if (line[1] == 'R' && (allflag || flag == SEQINFO_IDLIST)) {
        for (s=line+5; s < end && isspace(*s); s++) ;
        if (s < end) {
          for (i=0; i < dr_table_size; i++) {
            if (!myncasecmp(s, dr_table[i].name, strlen(dr_table[i].name))) {
              while (s < end && !isspace(*s) && *s != ';') s++;
              while (s < end && (isspace(*s) || *s == ';')) s++;

              for (t=s; s < end && !isspace(*s) && *s != ';'; s++) ;
              t1 = t;
              s1 = s;

              while (s < end && (isspace(*s) || *s == ';')) s++;
              for (t=s; s < end && !isspace(*s) && *s != ';' && *s != '.'; s++)
                ;
              if (t != s && dr_table[i].type2 != NULL)
                add_id(&info, dr_table[i].type2, t, s);

              if (t1 != s1 && dr_table[i].type1 != NULL)
                add_id(&info, dr_table[i].type1, t1, s1);
              break;
            }
          }
        }
      }
      *
      */
      break;

    case 'I':
      if (line[1] == 'D' &&
          (allflag || flag == SEQINFO_IDLIST ||
           flag == SEQINFO_ALPHABET || flag == SEQINFO_CIRCULAR)) {
        if (allflag || flag == SEQINFO_IDLIST) {
          if (isfp->db_idprefix != NULL)
            prefix = isfp->db_idprefix;
          else {
            /*
             * Pick the identifier prefix from the shape of the ID line:
             * the number of ';' characters, whether a '.' occurs, and the
             * three characters in front of the last ';'.
             */
            t = NULL;
            period = count = 0;
            for (s=line+5; s < end; s++) {
              if (*s == ';') {
                count++;
                t = s;
              }
              else if (*s == '.')
                period = 1;
            }

            if (count == 3 && period)
              prefix = (mystreq(t-3, 'E', "EPD;") ? "epd" : "embl");
            else if (count == 2 && period && mystreq(t-3, 'P', "PRT;"))
              prefix = "sp";
            else
              prefix = "oth";
          }
        }

        for (s=line+5; s < end && isspace(*s); s++) ;
        for (t=s; s < end && !isspace(*s) && *s != ';'; s++) ;

        if (t != s && (allflag || flag == SEQINFO_IDLIST))
          add_id(&info, prefix, t, s);

        /*
         * Look through the remaining words of the ID line for "circular"
         * and for a word that get_alphabet recognizes.
         */
        if (allflag || flag == SEQINFO_ALPHABET || flag == SEQINFO_CIRCULAR) {
          for ( ; s < end && (isspace(*s) || *s == ';'); s++) ;
          while (s < end) {
            for (t=s; s < end && !isspace(*s) && *s != ';' && *s != '.'; s++);
            if (s - t == 8 && strncmp(t, "circular", 8) == 0 &&
                (allflag || flag == SEQINFO_CIRCULAR)) {
              set_circular(&info, 1);
              if (flag == SEQINFO_CIRCULAR)
                goto EMBL_GI_END;
            }
            else if (s > t && (alpha = get_alphabet(t, s)) != UNKNOWN &&
                     (allflag || flag == SEQINFO_ALPHABET)) {
              set_alphabet(&info, alpha);
              if (flag == SEQINFO_ALPHABET)
                goto EMBL_GI_END;
              else
                break;
            }
            while (s < end && (isspace(*s) || *s == ';' || *s == '.')) s++;
          }
        }
      }
      break;

    case 'N':
    case 'P':
      if (line[1] == 'I' && (allflag || flag == SEQINFO_IDLIST))
        add_id(&info, (line[0] == 'N' ? "nid" : "pid"), line+5, end);
      break;

    case 'O':
      /* Only the first OS line is used (os_flag records that it was seen). */
      if (line[1] == 'S' && !os_flag &&
          (allflag || flag == SEQINFO_ORGANISM)) {
        /* Drop trailing whitespace and one trailing period. */
        for (s=end; s > line + 5 && isspace(*(s-1)); s--) ;
        if (*(s-1) == '.') s--;
        add_organism(&info, line + 5, s);
        os_flag = 1;
        if (flag == SEQINFO_ORGANISM)
          goto EMBL_GI_END;
      }
      break;

    case 'X':
      if (line[1] == 'X' &&
          (allflag || flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST)) {
        /* An XX line that carries text starts a comment block. */
        for (s=line+5; s < end && isspace(*s); s++) ;
        if (s < end) {
          while (gi_getline(&line, &end, 0))
            if (!mystreq(line, 'X', "XX   "))
              break;
          gi_ungetline(line);
          parse_comment(&info, s, line, 1, 5, flag);
        }
      }
      break;
    }
  }


  /*
   * Add the complete header as a comment if SEQINFO_ALLINFO is specified.
   * If no terminating line was found (status == 0), the whole entry is
   * the header.
   */
  if (flag == SEQINFO_ALLINFO) {
    if (status == 0)
      add_comment(&info, entry, entry + len, 0, 0);
    else
      add_comment(&info, entry, end+1, 0, 0);
  }

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);


EMBL_GI_END:
  /*
   * Finish the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL) {
      if (prefix != NULL && prefix[0] == 's' && prefix [1] == 'p')
        add_retrieval(&info, 1, "Swiss-Prot");
      else
        add_retrieval(&info, 1, "EMBL");
    }
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
11598 
11599 
/*
 * nbrf_getinfo
 *
 * Extract header information from an NBRF-format entry into an INFO
 * structure (set up by start_info and completed by finish_info).
 *
 * Parameters:  isfp   -  the sequence file structure the entry belongs to
 *              entry  -  the text of the entry (must begin with '>')
 *              len    -  the length of the entry text
 *              flag   -  a SEQINFO_* value; SEQINFO_ALL and SEQINFO_ALLINFO
 *                        request every field, any other value requests a
 *                        single field and lets the function stop as soon
 *                        as that field has been handled
 *
 * Returns:  STATUS_OK normally.  The raise_error calls are handed
 *           "return STATUS_ERROR" for a badly formed entry, memory_error
 *           is handed "return STATUS_FATAL" when info.error is set, and
 *           a failing status from the getseq function is passed through.
 */
static int nbrf_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int status, allflag;
  char ch, *s, *t, *line, *start, *end;
  INFO info;

  if (*entry != '>') {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of NBRF "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of NBRF "
                              "entry.\n"));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * Fields that come from the file structure rather than the entry text.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they are wanted and have not
   * already been computed for this entry.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto NBRF_GI_END;
  }

  /*
   * The first header line contains an identifier after the semi-colon
   * (which appears at position 4).  The next line is the one-line
   * description.
   */
  gi_startline(entry, len);

  /*
   * Without a first line there is nothing to parse, and line/end would
   * be used uninitialized below.
   */
  if (!gi_getline(&line, &end, 0))
    goto NBRF_GI_END;

  /*
   * The two characters after '>' give the sequence type:  "P1"/"F1" are
   * treated as protein (F marking a fragment), "D?"/"R?" as DNA/RNA with
   * a 'C' in the second position marking a circular sequence.
   */
  ch = toupper(line[1]);
  if ((ch == 'P' || ch == 'F') && line[2] == '1') {
    if (allflag || flag == SEQINFO_ALPHABET) {
      set_alphabet(&info, PROTEIN);
      if (flag == SEQINFO_ALPHABET)
        goto NBRF_GI_END;
    }
    if (allflag || flag == SEQINFO_FRAGMENT) {
      set_fragment(&info, ch == 'F');
      if (flag == SEQINFO_FRAGMENT)
        goto NBRF_GI_END;
    }
  }
  else if (ch == 'D' || ch == 'R') {
    if (allflag || flag == SEQINFO_ALPHABET) {
      set_alphabet(&info, (ch == 'D' ? DNA : RNA));
      if (flag == SEQINFO_ALPHABET)
        goto NBRF_GI_END;
    }
    if (allflag || flag == SEQINFO_CIRCULAR) {
      set_circular(&info, toupper(line[2]) == 'C');
      if (flag == SEQINFO_CIRCULAR)
        goto NBRF_GI_END;
    }
  }

  /* The identifier field may hold several ids separated by '|'. */
  if (allflag || flag == SEQINFO_IDLIST) {
    s = line + 4;
    while (s < end && !isspace(*s)) {
      for (t=s; s < end && !isspace(*s) && *s != '|'; s++) ;
      add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"), t, s);
      if (*s == '|') s++;
    }
  }

  /*
   * Parse the description line only if one was actually read; otherwise
   * line/end still describe the first header line.
   */
  if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_DESCRIPTION ||
      flag == SEQINFO_ORGANISM || flag == SEQINFO_FRAGMENT ||
      flag == SEQINFO_CIRCULAR || flag == SEQINFO_STARTPOS) {
    if (gi_getline(&line, &end, 0))
      parse_oneline(&info, line, end, flag);
  }

  if (flag == SEQINFO_ALLINFO)
    add_comment(&info, entry, end+1, 0, 0);

  /*
   * The rest of the information occurs in the header lines at the end
   * of the entry.  Look for a Date line, Accession lines and Comment
   * lines.
   */
  if (allflag || flag == SEQINFO_IDLIST ||
      flag == SEQINFO_COMMENT || flag == SEQINFO_DATE) {
    start = NULL;
    if (isfp->filename && isfp->filename[0]) {
      /*
       * Use the recorded position of the trailing header lines; with a
       * GCG infoline present the header stops at that line.
       */
      if (isfp->nbrf_header != NULL) {
        if (isfp->format != FORMAT_GCG || !isfp->gcg_infoline)
          gi_startline(isfp->nbrf_header,
                       isfp->fp_entryend - isfp->nbrf_header);
        else
          gi_startline(isfp->nbrf_header,
                       isfp->gcg_infoline - isfp->nbrf_header);
        start = isfp->nbrf_header;
      }
    }
    else {
      /* No file name:  search the entry text for the first "X;" line. */
      while ((status = gi_getline(&line, &end, 0)))
        if (line != end && line[1] == ';')
          break;

      if (status == 1) {
        gi_startline(line, (entry + len) - line);
        start = line;
      }
    }

    if (start != NULL) {
      while (gi_getline(&line, &end, 0)) {
        if (!(line[0] == 'C' && line[1] == ';'))
          continue;

        if (mystreq(line+2, 'A', "ACCESSION: ")) {
          if (allflag || flag == SEQINFO_IDLIST) {
            for (s=line+13; s < end && isspace(*s); s++) ;
            while (s < end) {
              for (t=s; s < end && !isspace(*s) && *s != ';'; s++) ;
              add_id(&info, "acc", t, s);
              while (s < end && (isspace(*s) || *s == ';')) s++;
            }
          }
        }
        else if (mystreq(line+2, 'C', "COMMENT: ")) {
          if (allflag || flag == SEQINFO_IDLIST || flag == SEQINFO_COMMENT) {
            for (s=line+10; s < end && isspace(*s); s++) ;
            if (s < end) {
              /* Collect the run of consecutive comment lines. */
              while ((status = gi_getline(&line, &end, 0)))
                if (!mystreq(line, 'C', "C;COMMENT: "))
                  break;

              /*
               * NOTE(review): when the lines run out, the comment is
               * ended at isfp->fp_entryend even if the header range was
               * set up from entry+len or the GCG infoline -- confirm.
               */
              if (status == 0)
                parse_comment(&info, s, isfp->fp_entryend, 1, 11, flag);
              else {
                gi_ungetline(line);
                parse_comment(&info, s, line, 1, 11, flag);
              }
            }
          }
        }
        else if (mystreq(line+2, 'D', "DATE: ")) {
          if (allflag || flag == SEQINFO_DATE) {
            for (s=line+7; s < end && isspace(*s); s++) ;
            if (s + 11 <= end) {
              /*
               * Each '#' starts a sub-field keyword followed by another
               * date; the last one that still has 11 characters
               * available replaces the date chosen so far.
               */
              for (t=s; t + 11 <= end; t++) {
                if (*t == '#') {
                  while (t < end && !isspace(*t)) t++;
                  while (t < end && isspace(*t)) t++;
                  if (t+11 <= end)
                    s = t;
                }
              }
              set_date(&info, s, s+11);
            }
            if (flag == SEQINFO_DATE)
              goto NBRF_GI_END;
          }
        }
      }

      if (flag == SEQINFO_ALLINFO)
        add_comment(&info, start, end+1, 0, 0);
    }
  }

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

NBRF_GI_END:
  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "NBRF");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
11817 
11818 
/*
 * fasta_getinfo
 *
 * Extract header information from a FASTA-format entry into an INFO
 * structure (set up by start_info and completed by finish_info).
 *
 * Parameters:  isfp   -  the sequence file structure the entry belongs to
 *              entry  -  the text of the entry (must begin with '>')
 *              len    -  the length of the entry text
 *              flag   -  a SEQINFO_* value; SEQINFO_ALL and SEQINFO_ALLINFO
 *                        request every field, any other value requests a
 *                        single field
 *
 * Returns:  STATUS_OK normally.  The raise_error calls are handed
 *           "return STATUS_ERROR" for a badly formed entry, memory_error
 *           is handed "return STATUS_FATAL" when info.error is set, and
 *           a failing status from the getseq function is passed through.
 */
static int fasta_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int allflag, status;
  char *s, *t, *end;
  INFO info;

  if (*entry != '>') {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of FASTA "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of FASTA "
                              "entry.\n"));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * Fields that come from the file structure rather than the entry text.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they are wanted and have not
   * already been computed for this entry.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto FASTA_GI_END;
  }

  /*
   * The first header line is a oneline description.  All other
   * header lines are comments.
   */
  s = entry + 1;
  end = entry + len;

  for (t=s; s < end && *s != '\n'; s++) ;
  if (t != s && (allflag || flag == SEQINFO_IDLIST ||
                 flag == SEQINFO_DESCRIPTION || flag == SEQINFO_ORGANISM ||
                 flag == SEQINFO_FRAGMENT || flag == SEQINFO_CIRCULAR ||
                 flag == SEQINFO_STARTPOS)) {
    parse_oneline(&info, t, s, flag);
  }

  /*
   * Further header lines begin with '>' or ';'.  The character after the
   * newline is only examined while it lies inside the entry, so that
   * whatever follows the entry in memory is never taken for a comment
   * line.
   */
  if (s + 1 < end && (s[1] == '>' || s[1] == ';') &&
      (allflag || flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST)) {
    s += 2;
    for (t=s; s < end && (*s != '\n' ||
                          (s + 1 < end && (s[1] == '>' || s[1] == ';')));
         s++) ;

    /* Skip an empty first comment line. */
    if (t + 2 < s && *t == '\n' && (t[1] == '>' || t[1] == ';'))
      t += 2;
    parse_comment(&info, t, s, 0, 1, flag);
  }

  /*
   * For SEQINFO_ALLINFO the header (including its final newline, if
   * there is one) is added as a comment; the end is clamped to the end
   * of the entry.
   */
  if (flag == SEQINFO_ALLINFO)
    add_comment(&info, entry, (s < end ? s+1 : end), 0, 0);

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

FASTA_GI_END:
  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "FASTA");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
11924 
11925 
/*
 * stanford_getinfo
 *
 * Gather the information selected by `flag' from an IG/Stanford entry
 * and hand it to the INFO machinery (start_info ... finish_info).
 *
 * Parameters:
 *   isfp   - the sequence file structure the entry belongs to
 *   entry  - text of the entry; its first character must be ';'
 *   len    - number of characters in `entry'
 *   flag   - one of the SEQINFO_* request codes
 *
 * Returns:  STATUS_OK normally.  raise_error() is handed
 *           `return STATUS_ERROR' when the entry does not begin with ';',
 *           a status other than STATUS_OK/STATUS_WARNING from the
 *           format's getseq_fn is returned unchanged, and memory_error()
 *           is handed `return STATUS_FATAL' together with info.error.
 */
static int stanford_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int allflag, status, need_truelen, need_rawlen, want_oneline, want_comment;
  char *p, *line, *limit, *com_first, *com_last;
  INFO info;

  /*
   * The entry has to open with a ';' comment line.  The wording of the
   * error depends on whether a filename is available to report.
   */
  if (*entry != ';') {
    if (!(isfp->filename && isfp->filename[0])) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of IG/Stanford "
                              "entry.\n"));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of IG/Stanford "
                              "entry.\n", isfp->filename, isfp->entry_count));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * File name, database name, format and the entry counters are only
   * filled in for the two "all" requests.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->numseqs = isfp->entry_numseqs;
      info.info->seqno = isfp->entry_seqno;
      info.info->entryno = isfp->entry_count;
    }
  }

  if (isfp->db_alpha != NULL && (allflag || flag == SEQINFO_ALPHABET))
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Have the format's getseq function compute the lengths when one is
   * wanted and its iflag_* marker is not yet set.
   */
  need_truelen = ((allflag || flag == SEQINFO_TRUELEN) &&
                  !isfp->iflag_truelen);
  need_rawlen = ((allflag || flag == SEQINFO_RAWLEN) &&
                 !isfp->iflag_rawlen);
  if (need_truelen || need_rawlen) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (!(status == STATUS_OK || status == STATUS_WARNING)) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->rawlen = isfp->entry_rawlen;
      info.info->truelen = isfp->entry_truelen;
    }

    /* A pure length request needs nothing from the header text. */
    if (flag == SEQINFO_TRUELEN || flag == SEQINFO_RAWLEN)
      goto STANFORD_GI_END;
  }

  want_oneline = (allflag || flag == SEQINFO_IDLIST ||
                  flag == SEQINFO_DESCRIPTION || flag == SEQINFO_ORGANISM ||
                  flag == SEQINFO_FRAGMENT || flag == SEQINFO_CIRCULAR ||
                  flag == SEQINFO_STARTPOS);
  want_comment = (allflag || flag == SEQINFO_COMMENT ||
                  flag == SEQINFO_IDLIST);

  /*
   * The header lines are comments.  The comment text starts right after
   * the leading ';' and runs to the first newline that is not followed
   * by another ';'.  The line after the comments is the oneline
   * description.
   */
  limit = entry + len;
  com_first = entry + 1;
  p = com_first;
  while (p < limit && !(*p == '\n' && p[1] != ';'))
    p++;
  com_last = p;

  if (want_oneline) {
    line = ++p;
    while (p < limit && *p != '\n')
      p++;
    parse_oneline(&info, line, p, flag);
  }

  /* For SEQINFO_ALLINFO the header text scanned so far is the comment. */
  if (flag == SEQINFO_ALLINFO)
    add_comment(&info, entry, p + 1, 0, 0);

  if (want_comment && com_first != com_last)
    parse_comment(&info, com_first, com_last, 0, 1, flag);

  /*
   * Look at the last non-whitespace character of the entry:  a '2'
   * there marks the sequence as circular.
   */
  if (allflag || flag == SEQINFO_CIRCULAR) {
    p = limit;
    while (p > entry && isspace(p[-1]))
      p--;
    if (p[-1] == '2')
      set_circular(&info, 1);
  }

  /*
   * Check the GCG infoline for information about the date, alphabet and
   * description.
   */
  if (isfp->format == FORMAT_GCG && isfp->gcg_infoline &&
      (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
       flag == SEQINFO_DESCRIPTION))
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

STANFORD_GI_END:
  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name != NULL)
      add_retrieval(&info, 2, isfp->db_name);
    else
      add_retrieval(&info, 1, "IG/Stanford");
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12034 
12035 
/*
 * gcg_getinfo
 *
 * Gather the information selected by `flag' from a GCG entry.  When a
 * GCG sub-format is recorded in isfp->gcg_subformat, the whole request
 * is forwarded to that format's getinfo function instead.
 *
 * Parameters:
 *   isfp   - the sequence file structure the entry belongs to
 *   entry  - text of the entry
 *   len    - number of characters in `entry'
 *   flag   - one of the SEQINFO_* request codes
 *
 * Returns:  STATUS_OK normally (or whatever the sub-format's getinfo
 *           function returns).  `return STATUS_ERROR' is handed to
 *           program_error()/error_test() for a missing info line, a
 *           status other than STATUS_OK/STATUS_WARNING from getseq_fn is
 *           returned unchanged, and memory_error() is handed
 *           `return STATUS_FATAL' together with info.error.
 */
static int gcg_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int comlen, allflag, status, need_truelen, need_rawlen;
  char *p, *limit, *infoline;
  INFO info;

  if (isfp->gcg_subformat != FORMAT_UNKNOWN) {
    return
      (*file_table[isfp->gcg_subformat].getinfo_fn)(isfp, entry, len, flag);
  }

  /*
   * Locate the GCG info line.  Without a filename (the error text below
   * names seqfparseent for that case) it is found by scanning the entry
   * for the first line that ends in "..";  otherwise the pointer stored
   * in isfp->gcg_infoline is used.
   */
  if (!(isfp->filename && isfp->filename[0])) {
    limit = entry + len;
    p = entry;
    while (p < limit && !(p[0] == '.' && p[1] == '.' && p[2] == '\n'))
      p++;

    error_test(p >= limit, E_PARSEERROR, return STATUS_ERROR,
               print_error("seqfparseent:  Invalid format of GCG entry.\n"));

    /* Back up to the first character of the line holding the "..". */
    while (p > entry && p[-1] != '\n')
      p--;
    infoline = p;
  }
  else {
    program_error(isfp->gcg_infoline == NULL, return STATUS_ERROR,
                  print_error("   gcg_infoline not set by GCG read "
                              "function\n"));

    infoline = isfp->gcg_infoline;
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * File name, database name, format and the entry counters are only
   * filled in for the two "all" requests.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->numseqs = isfp->entry_numseqs;
      info.info->seqno = isfp->entry_seqno;
      info.info->entryno = isfp->entry_count;
    }
  }

  if (isfp->db_alpha != NULL && (allflag || flag == SEQINFO_ALPHABET))
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Have the format's getseq function compute the lengths when one is
   * wanted and its iflag_* marker is not yet set.
   */
  need_truelen = ((allflag || flag == SEQINFO_TRUELEN) &&
                  !isfp->iflag_truelen);
  need_rawlen = ((allflag || flag == SEQINFO_RAWLEN) &&
                 !isfp->iflag_rawlen);
  if (need_truelen || need_rawlen) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (!(status == STATUS_OK || status == STATUS_WARNING)) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->rawlen = isfp->entry_rawlen;
      info.info->truelen = isfp->entry_truelen;
    }
  }

  /*
   * The beginning lines up to the info line are comments.  The length
   * of that region is measured from isfp->fp_entrystart and then applied
   * to `entry'.
   */
  if (allflag || flag == SEQINFO_COMMENT) {
    comlen = infoline - isfp->fp_entrystart;
    add_comment(&info, entry, entry + comlen, 0, 0);
  }

  /*
   * The info line itself carries the date, alphabet and description.
   */
  if (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
      flag == SEQINFO_DESCRIPTION)
    parse_gcg_oneline(&info, infoline, flag);

  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name != NULL)
      add_retrieval(&info, 2, isfp->db_name);
    else
      add_retrieval(&info, 1, "GCG");
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12129 
12130 
/*
 * msf_getinfo
 *
 * Gather the information selected by `flag' for the sequence numbered
 * isfp->entry_seqno of an MSF entry.  As a side effect, isfp->fp_seqstart
 * and isfp->malign_seqno are advanced until malign_seqno reaches
 * entry_seqno.
 *
 * Parameters:
 *   isfp   - the sequence file structure the entry belongs to
 *   entry  - text of the entry
 *   len    - number of characters in `entry' (not used in this function)
 *   flag   - one of the SEQINFO_* request codes
 *
 * Returns:  STATUS_OK normally.  program_error() is handed
 *           `return STATUS_ERROR' when isfp->gcg_infoline is NULL, a
 *           status other than STATUS_OK/STATUS_WARNING from getseq_fn is
 *           returned unchanged, and memory_error() is handed
 *           `return STATUS_FATAL' together with info.error.
 */
static int msf_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int comlen, allflag, status, need_truelen, need_rawlen;
  char *p, *word;
  INFO info;

  program_error(isfp->gcg_infoline == NULL, return STATUS_ERROR,
                print_error("   gcg_infoline not set by GCG read "
                            "function\n"));

  /*
   * Step fp_seqstart forward one line at a time.  The sequence counter
   * only goes up when the line now at fp_seqstart is not blank.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    p = isfp->fp_seqstart;
    while (*p != '\n')
      p++;
    p++;
    isfp->fp_seqstart = p;

    while (*p != '\n' && isspace(*p))
      p++;
    if (*p != '\n')
      isfp->malign_seqno++;
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  /*
   * File name, database name, format and the entry counters are only
   * filled in for the two "all" requests.
   */
  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->numseqs = isfp->entry_numseqs;
      info.info->seqno = isfp->entry_seqno;
      info.info->entryno = isfp->entry_count;
    }
  }

  if (isfp->db_alpha != NULL && (allflag || flag == SEQINFO_ALPHABET))
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Have the format's getseq function compute the lengths when one is
   * wanted and its iflag_* marker is not yet set.
   */
  need_truelen = ((allflag || flag == SEQINFO_TRUELEN) &&
                  !isfp->iflag_truelen);
  need_rawlen = ((allflag || flag == SEQINFO_RAWLEN) &&
                 !isfp->iflag_rawlen);
  if (need_truelen || need_rawlen) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (!(status == STATUS_OK || status == STATUS_WARNING)) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->rawlen = isfp->entry_rawlen;
      info.info->truelen = isfp->entry_truelen;
    }
  }

  /*
   * The beginning lines up to the gcg_infoline are comments.  Note that
   * SEQINFO_ALLINFO is not handled here;  its comment is added further
   * down, once the end of the header has been located.
   */
  if (flag == SEQINFO_ALL || flag == SEQINFO_COMMENT) {
    comlen = isfp->gcg_infoline - isfp->fp_entrystart;
    add_comment(&info, entry, entry + comlen, 0, 0);
  }

  /*
   * The gcg_infoline carries the date, alphabet and description.
   */
  if (allflag || flag == SEQINFO_DATE || flag == SEQINFO_ALPHABET ||
      flag == SEQINFO_DESCRIPTION)
    parse_gcg_oneline(&info, isfp->gcg_infoline, flag);

  /*
   * The sequence header line contains the identifier:  it is the second
   * whitespace-separated word on the line at fp_seqstart.
   */
  if (allflag || flag == SEQINFO_IDLIST) {
    p = isfp->fp_seqstart;
    while (isspace(*p))                    /* leading whitespace */
      p++;
    while (!isspace(*p))                   /* first word on the line */
      p++;
    while (*p != '\n' && isspace(*p))      /* gap before the identifier */
      p++;

    word = p;
    while (*p != '\n' && !isspace(*p))
      p++;
    add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"), word, p);

    /*
     * If SEQINFO_ALLINFO is set, find the "//" line that ends the header
     * and add all of the text before it as a comment.
     */
    if (flag == SEQINFO_ALLINFO) {
      while (!(*p == '\n' && p[1] == '/' && p[2] == '/'))
        p++;
      add_comment(&info, entry, p + 1, 0, 0);
    }
  }

  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name != NULL)
      add_retrieval(&info, 2, isfp->db_name);
    else
      add_retrieval(&info, 1, "MSF");
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12232 
12233 
/*
 * fastaout_getinfo
 *
 * Gather the information selected by `flag' for one of the two sequences
 * (isfp->entry_seqno == 1 selects the first, anything else the second)
 * of an alignment entry.  The information comes partly from the text of
 * `entry' and partly from the fout_* fields of `isfp', which are filled
 * in elsewhere.
 *
 * Parameters:
 *   isfp   - the sequence file structure the entry belongs to
 *   entry  - text of the entry
 *   len    - number of characters in `entry'
 *   flag   - one of the SEQINFO_* request codes
 *
 * Returns:  STATUS_OK normally.  A status other than
 *           STATUS_OK/STATUS_WARNING from the format's getseq_fn is
 *           returned unchanged, and memory_error() is handed
 *           `return STATUS_FATAL' together with info.error.
 *
 * NOTE(review):  the sprintf() calls that format isfp->fout_id*,
 *   isfp->fout_descr* and isfp->fout_progname into descr1/descr2/buffer
 *   are not length-limited;  whether those strings are bounded (and
 *   non-NULL) depends on the code that sets them -- confirm there.
 */
static int fastaout_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int mode, slen, slen2, count, alpha, alpha1, alpha2;
  int allflag, status, seqlen, totallen;
  char ch, *s, *t, *s2, *t2, *line, *end, *entryend, *bufend, buffer[512];
  char *id1, *idend1, *id2, *idend2, descr1[128], descr2[128];
  INFO info;

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  /*
   * The true length is also needed for the fragment test below, so
   * SEQINFO_FRAGMENT triggers the length computation as well.
   */
  if (((allflag || flag == SEQINFO_TRUELEN ||
        flag == SEQINFO_FRAGMENT) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto FOUT_GI_END;
  }

  /*
   * The sequence is flagged as a fragment when the total length recorded
   * for it (fout_len1/fout_len2) is non-zero and smaller than the
   * entry's true length.
   */
  if (allflag || flag == SEQINFO_FRAGMENT) {
    totallen = (isfp->entry_seqno == 1 ? isfp->fout_len1 : isfp->fout_len2);
    if (isfp->entry_truelen && totallen && totallen < isfp->entry_truelen)
      set_fragment(&info, 1);

    if (flag == SEQINFO_FRAGMENT)
      goto FOUT_GI_END;
  }

  /*
   * Scan through the entry, and the information collected from the
   * beginning of the file, to construct the needed info.
   */
  entryend = entry + len;
  mode = isfp->fout_mode;
  line = end = NULL;

  /* In FASTA_MODE, `line'..`end' delimit the first line of the entry. */
  if (mode == FASTA_MODE)
    for (line=end=entry; *end != '\n'; end++) ;

  /*
   * For MARKX10 output, locate the two "\n>" sub-headers in the entry.
   * The word after each '>' is that sequence's identifier, and a
   * "; sq_type: " line inside a section gives its alphabet ('P' for
   * protein, 'D' for DNA).  The identifiers are discarded if the scan
   * runs off the end of the entry.
   */
  id1 = idend1 = id2 = idend2 = NULL;
  alpha1 = alpha2 = UNKNOWN;
  if (isfp->fout_markx == MARKX10 &&
      (allflag || flag == SEQINFO_IDLIST ||
       flag == SEQINFO_ALPHABET || flag == SEQINFO_COMMENT)) {
    for (s=entry; s < entryend && (*s != '\n' || s[1] != '>'); s++) ;
    s++;
    for (id1=++s; s < entryend && !isspace(*s); s++) ;
    for (idend1=s++; s < entryend && (*s != '\n' || s[1] != '>'); s++) {
      if (*s == '\n' && s[1] == ';' && !strncmp(s+2, " sq_type: ", 10)) {
        for (s+=11; s < entryend && *s != '\n' && isspace(*s); s++) ;
        if (toupper(*s) == 'P')
          alpha1 = PROTEIN;
        else if (toupper(*s) == 'D')
          alpha1 = DNA;
      }
    }
    s++;
    for (id2=++s; s < entryend && !isspace(*s); s++) ;
    idend2 = s;
    if (s >= entryend)
      id1 = idend1 = id2 = idend2 = NULL;
    else {
      for (s++; s < entryend && (*s != '\n' || s[1] != '>'); s++) {
        if (*s == '\n' && s[1] == ';' && !strncmp(s+2, " sq_type: ", 10)) {
          for (s+=11; s < entryend && *s != '\n' && isspace(*s); s++) ;
          if (toupper(*s) == 'P')
            alpha2 = PROTEIN;
          else if (toupper(*s) == 'D')
            alpha2 = DNA;
        }
      }
    }
  }

  if (isfp->entry_seqno == 1) {
    /*
     * First sequence:  in LFASTA_MODE with MARKX10 everything comes
     * from the entry text, otherwise from the fout_*1 fields.
     */
    if (mode == LFASTA_MODE && isfp->fout_markx == MARKX10) {
      if (id1 && (allflag || flag == SEQINFO_IDLIST))
        add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"),
               id1, idend1);

      if (alpha1 != UNKNOWN && (allflag || flag == SEQINFO_ALPHABET)) {
        set_alphabet(&info, alpha1);
        if (flag == SEQINFO_ALPHABET)
          goto FOUT_GI_END;
      }
    }
    else {
      if (isfp->fout_id1 && (allflag || flag == SEQINFO_IDLIST)) {
        for (s=isfp->fout_id1; *s; s++) ;
        add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"),
               isfp->fout_id1, s);
      }

      if (isfp->fout_descr1 && (allflag || flag == SEQINFO_DESCRIPTION)) {
        for (s=isfp->fout_descr1; *s; s++) ;
        add_description(&info, isfp->fout_descr1, s);
        if (flag == SEQINFO_DESCRIPTION)
          goto FOUT_GI_END;
      }

      if (allflag || flag == SEQINFO_ALPHABET) {
        if (isfp->db_alpha != NULL)
          set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));
        else if (isfp->fout_alpha1 != UNKNOWN)
          set_alphabet(&info, isfp->fout_alpha1);
        else if (isfp->fout_markx == MARKX10 && alpha1 != UNKNOWN)
          set_alphabet(&info, alpha1);

        if (flag == SEQINFO_ALPHABET)
          goto FOUT_GI_END;
      }
    }
  }
  else {
    /*
     * Second sequence:  taken from the MARKX10 sub-header (LFASTA_MODE),
     * from the first line of the entry (FASTA_MODE), or from the
     * fout_*2 fields (any other mode).
     */
    if (mode == LFASTA_MODE && isfp->fout_markx == MARKX10) {
      if (id2 && (allflag || flag == SEQINFO_IDLIST))
        add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"),
               id2, idend2);

      if (alpha2 != UNKNOWN && (allflag || flag == SEQINFO_ALPHABET)) {
        set_alphabet(&info, alpha2);
        if (flag == SEQINFO_ALPHABET)
          goto FOUT_GI_END;
      }
    }
    else {
      if (mode == FASTA_MODE) {
        /*
         * The first line is "[>>]ID DESCRIPTION";  unless the output is
         * MARKX10 it ends in a parenthesized length whose last two
         * characters ("aa" or "nt") give the alphabet.
         */
        s = (line[0] == '>' && line[1] == '>' ? line + 2 : line);
        for (t=s; !isspace(*s); s++) ;
        if (allflag || flag == SEQINFO_IDLIST)
          add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"), t, s);

        while (isspace(*s) && *s != '\n') s++;
        if (isfp->fout_markx != MARKX10)
          for (t=s,s=end-1; s > t && *s != ')'; s--) ;
        else {
          t = s;
          s = end;
        }

        if (s > t) {
          if (isfp->fout_markx != MARKX10 &&
              (allflag || flag == SEQINFO_ALPHABET)) {
            if (isfp->db_alpha != NULL)
              set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));
            else if (*(s-2) == 'a' && *(s-1) == 'a')
              set_alphabet(&info, PROTEIN);
            else if (*(s-2) == 'n' && *(s-1) == 't')
              set_alphabet(&info, DNA);

            if (flag == SEQINFO_ALPHABET)
              goto FOUT_GI_END;
          }

          /* Back up over the "(...)" so it is not part of the description. */
          if (isfp->fout_markx != MARKX10)
            for (s--; s > t && *s != '('; s--) ;
          if (s > t && (allflag || flag == SEQINFO_DESCRIPTION)) {
            add_description(&info, t, s);
          }
        }
        if (isfp->fout_markx == MARKX10 && alpha2 != UNKNOWN &&
            (allflag || flag == SEQINFO_ALPHABET)) {
          set_alphabet(&info, alpha2);
          if (flag == SEQINFO_ALPHABET)
            goto FOUT_GI_END;
        }
      }
      else {
        if (isfp->fout_id2 && (allflag || flag == SEQINFO_IDLIST)) {
          for (s=isfp->fout_id2; *s; s++) ;
          add_id(&info, (isfp->db_idprefix ? isfp->db_idprefix : "oth"),
                 isfp->fout_id2, s);
        }

        if (isfp->fout_descr2 && (allflag || flag == SEQINFO_DESCRIPTION)) {
          for (s=isfp->fout_descr2; *s; s++) ;
          add_description(&info, isfp->fout_descr2, s);
          if (flag == SEQINFO_DESCRIPTION)
            goto FOUT_GI_END;
        }

        if (allflag || flag == SEQINFO_ALPHABET) {
          if (isfp->db_alpha != NULL)
            set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));
          else if (isfp->fout_alpha2 != UNKNOWN)
            set_alphabet(&info, isfp->fout_alpha2);
          else if (isfp->fout_markx == MARKX10 && alpha2 != UNKNOWN)
            set_alphabet(&info, alpha2);

          if (flag == SEQINFO_ALPHABET)
            goto FOUT_GI_END;
        }
      }
    }
  }

  /*
   * Build the comment:  a generated "From PROGRAM output alignment of"
   * banner naming both sequences (descr1 and descr2), followed by text
   * copied from the entry.  Lengths taken from the entry text are
   * clamped so that the copies always fit in descr1/descr2.
   */
  if (allflag || flag == SEQINFO_COMMENT) {
    if (mode == LFASTA_MODE && isfp->fout_markx == MARKX10) {
      descr1[0] = '>';
      if (id1) {
        slen = idend1 - id1;
        /* leave room for the leading '>' and the terminator */
        if (slen > (int) sizeof(descr1) - 2)
          slen = (int) sizeof(descr1) - 2;
        memcpy(descr1+1, id1, slen);
      }
      else
        slen = 0;
      descr1[slen+1] = '\0';
    }
    else
      sprintf(descr1, ">%s %s, %d %s", isfp->fout_id1, isfp->fout_descr1,
              isfp->fout_len1,
              (isfp->fout_alpha1 == DNA ? "nt"
                 : (isfp->fout_alpha1 == PROTEIN ? "aa" : "ch")));

    if (mode == LFASTA_MODE && isfp->fout_markx == MARKX10) {
      descr2[0] = '>';
      if (id2) {
        slen = idend2 - id2;
        /* leave room for the leading '>' and the terminator */
        if (slen > (int) sizeof(descr2) - 2)
          slen = (int) sizeof(descr2) - 2;
        memcpy(descr2+1, id2, slen);
      }
      else
        slen = 0;
      descr2[slen+1] = '\0';
    }
    else if (mode == FASTA_MODE) {
      if (isfp->fout_markx == MARKX10) {
        /*
         * Copy the first line minus its two leading characters.  Room
         * is kept for the '>', for the ", <len> <aa|nt>" suffix that may
         * be appended below (at most 15 characters for a positive int)
         * and for the terminator.
         */
        descr2[0] = '>';
        slen = end - line - 2;
        if (slen < 0)
          slen = 0;
        if (slen > (int) sizeof(descr2) - 17)
          slen = (int) sizeof(descr2) - 17;
        memcpy(descr2+1, line+2, slen);

        /*
         * Skip to the second "\n>" sub-header and read the "; sq_len: "
         * and "; sq_type: " lines that follow it.
         */
        for (s=end; s < entryend && (*s != '\n' || s[1] != '>'); s++) ;
        for (s++; s < entryend && (*s != '\n' || s[1] != '>'); s++) ;
        count = 0;
        seqlen = 0;
        alpha = UNKNOWN;
        for (s++; s < entryend && count < 2; s++) {
          if (*s == '\n') {
            if (s[1] != ';')
              break;
            else if (strncmp(s+1, "; sq_len: ", 10) == 0) {
              for (s+=11; s < entryend && isspace(*s); s++) ;
              if (s < entryend && isdigit(*s)) {
                seqlen = *s - '0';
                for (s++; isdigit(*s); s++) {
                  seqlen *= 10;
                  seqlen += *s - '0';
                }
              }
              s--;
              count++;
            }
            else if (strncmp(s+1, "; sq_type: ", 11) == 0) {
              for (s+=12; s < entryend && isspace(*s); s++) ;
              ch = toupper(*s);
              if (ch == 'P')
                alpha = PROTEIN;
              else if (ch == 'D')
                alpha = DNA;
              else
                alpha = UNKNOWN;
              s--;
              count++;
            }
          }
        }

        if (seqlen > 0 && alpha != UNKNOWN)
          sprintf(descr2+slen+1, ", %d %s", seqlen,
                  (alpha == PROTEIN ? "aa" : "nt"));
        else
          descr2[slen+1] = '\0';
      }
      else {
        /*
         * Rebuild ">ID DESCRIPTION, LENGTH" from the first line, where
         * LENGTH is the text between the last '(' ... ')' pair.
         */
        t = (line[0] == '>' && line[1] == '>' ? line + 2 : line);
        for (s=end-1; s > t && *s != ')'; s--) ;
        for (s2=s,s--; s > t && *s != '('; s--) ;
        for (t2=s; s > t && isspace(*(s-1)); s--) ;
        if (s > t) {
          descr2[0] = '>';
          slen = s - t;
          /* room for '>', ", " and the terminator */
          if (slen > (int) sizeof(descr2) - 4)
            slen = (int) sizeof(descr2) - 4;
          memcpy(descr2+1, t, slen);
          descr2[slen+1] = ',';
          descr2[slen+2] = ' ';
          slen2 = s2 - t2 - 1;
          if (slen2 > (int) sizeof(descr2) - 4 - slen)
            slen2 = (int) sizeof(descr2) - 4 - slen;
          memcpy(descr2+slen+3, t2+1, slen2);
          descr2[slen+3+slen2] = '\0';
        }
        else
          descr2[0] = '\0';
      }
    }
    else {
      sprintf(descr2, ">%s %s, %d %s", isfp->fout_id2, isfp->fout_descr2,
              isfp->fout_len2,
              (isfp->fout_alpha2 == DNA ? "nt"
                : (isfp->fout_alpha2 == PROTEIN ? "aa" : "ch")));
    }

    sprintf(buffer, "From %s output alignment of:\n   %s\nand\n   %s\n\n",
            isfp->fout_progname, descr1, descr2);
    for (bufend=buffer; *bufend; bufend++) ;
    add_comment(&info, buffer, bufend, 0, 0);

    if (isfp->fout_markx == MARKX10) {
      /*
       * Add the run of ';' lines that follows the first line of the
       * entry.  The bound here must be the end of the entry text.
       */
      for (s=entry; s < entryend && *s != '\n'; s++) ;
      s++;
      if (s < entryend && *s == ';') {
        for (t=s; s < entryend && (*s != '\n' || s[1] == ';'); s++) ;
        if (s < entryend)
          add_comment(&info, t, s+1, 0, 0);
      }
    }
    else {
      /*
       * Skip the first line unless it starts with whitespace, then add
       * the text up to the next blank line or line beginning with '>'.
       */
      s = entry;
      if (!isspace(*s)) {
        for (s=entry; s < entryend && *s != '\n'; s++) ;
        s++;
      }
      if (*s != '\n') {
        for (t=s; s < entryend; s++)
          if (*s == '\n' && (s[1] == '\n' || s[1] == '>'))
            break;
        if (s < entryend)
          add_comment(&info, t, s+1, 0, 0);
      }
    }
  }

FOUT_GI_END:
  /*
   * Finish the last fields in the INFO structure.
   */
  if (allflag)
    add_retrieval(&info, 3, isfp->fout_progname);

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12602 
12603 
/*
 * blastout_getinfo
 *
 * Gather the information selected by `flag' for one of the two sequences
 * (isfp->entry_seqno == 1 selects the one searched for with "\nQuery:",
 * anything else the one searched for with "\nSbjct:") of an alignment
 * entry.  The information comes partly from the text of `entry' and
 * partly from the fout_* fields of `isfp', which are filled in elsewhere.
 *
 * Parameters:
 *   isfp   - the sequence file structure the entry belongs to
 *   entry  - text of the entry
 *   len    - number of characters in `entry'
 *   flag   - one of the SEQINFO_* request codes
 *
 * Returns:  STATUS_OK normally.  A status other than
 *           STATUS_OK/STATUS_WARNING from the format's getseq_fn is
 *           returned unchanged, and memory_error() is handed
 *           `return STATUS_FATAL' together with info.error.
 */
static int blastout_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int i, allflag, status, startflag, size, pos, maxsize, fragstart, totallen;
  char *s, *s2, *t, *end, *pattern, buffer[512];
  INFO info;

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  /*
   * The true length is also needed for the fragment test below, so
   * SEQINFO_FRAGMENT triggers the length computation as well.
   */
  if (((allflag || flag == SEQINFO_TRUELEN ||
        flag == SEQINFO_FRAGMENT) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto BOUT_GI_END;
  }

  /*
   * Scan through the entry, and the information collected from the
   * beginning of the file, to construct the needed info.
   */
  if (isfp->fout_alpha1 != UNKNOWN && (allflag || flag == SEQINFO_ALPHABET)) {
    set_alphabet(&info, isfp->fout_alpha1);
    if (flag == SEQINFO_ALPHABET)
      goto BOUT_GI_END;
  }

  /*
   * The stored description of the selected sequence, up to its first
   * '>' (if any), is parsed as a oneline description.
   */
  if (allflag || flag == SEQINFO_IDLIST ||
      flag == SEQINFO_DESCRIPTION || flag == SEQINFO_ORGANISM ||
      flag == SEQINFO_FRAGMENT || flag == SEQINFO_CIRCULAR) {
    t = (isfp->entry_seqno == 1 ? isfp->fout_descr1 : isfp->fout_descr2);
    if (t != NULL) {
      for (s=t; *s && *s != '>'; s++) ;
      parse_oneline(&info, t, s, flag);
    }
  }

  /*
   * The number following the first "Query:"/"Sbjct:" line marker is the
   * start position of the aligned region.  The sequence is treated as a
   * fragment when that position is past 1, or when the recorded total
   * length is non-zero and exceeds the entry's true length.
   */
  if (allflag || flag == SEQINFO_FRAGMENT || flag == SEQINFO_STARTPOS) {
    pattern = (isfp->entry_seqno == 1 ? "\nQuery:" : "\nSbjct:");
    for (s=entry; s < entry + len; s++)
      if (strncmp(s, pattern, 7) == 0)
        break;

    if (s < entry + len) {
      totallen = (isfp->entry_seqno == 1 ? isfp->fout_len1 : isfp->fout_len2);
      fragstart = myatoi(s+7, 10 ,'0');
      if ((allflag || flag == SEQINFO_FRAGMENT) &&
          (fragstart > 1 || (totallen && isfp->entry_truelen < totallen)))
        set_fragment(&info, 1);

      if ((allflag || flag == SEQINFO_STARTPOS) &&
          (fragstart > 1 || (totallen && isfp->entry_truelen < totallen)))
        set_fragstart(&info, fragstart);
    }
  }

  if (allflag || flag == SEQINFO_COMMENT) {
    /*
     * The %s precisions below are the largest values for which the
     * formatted text, including its terminator, still fits in the
     * 512 byte buffer, so text that fits is never shortened.
     */
    sprintf(buffer, "From %.484s output alignment of:\n",
            isfp->fout_progname);
    for (end=buffer; *end; end++) ;
    add_comment(&info, buffer, end, 0, 0);

    if (isfp->fout_descr1 != NULL)
      sprintf(buffer, "   >%.503s\nand", isfp->fout_descr1);
    else
      strcpy(buffer, "   >Unknown sequence\nand");

    for (end=buffer; *end; end++) ;
    add_comment(&info, buffer, end, 0, 0);

    /*
     * The second description may hold several '>'-separated parts.
     * Each part starts a new line prefixed with "   >" (at most 70
     * characters of text);  continuation lines are prefixed with 14
     * blanks and broken at whitespace within 60 characters.  The three
     * leading blanks of `buffer' are left over from the text formatted
     * just above.
     */
    if (isfp->fout_descr2 != NULL) {
      s = isfp->fout_descr2;
      startflag = 1;
      while (*s) {
        if (*s == '>') s++;
        if (!*s)                 /* description ended with a bare '>' */
          break;
        maxsize = (startflag ? 70 : 60);
        for (t=s++,i=1; *s && i < maxsize && *s != '>'; s++,i++) ;
        if (startflag) {
          buffer[3] = '>';
          pos = 4;
        }
        else {
          strcpy(buffer+3, "           ");
          pos = 14;
        }

        if (!*s || *s == '>') {
          /* The rest of this part fits on the current line. */
          size = s - t;
          startflag = 1;
        }
        else {
          /*
           * Break at the last whitespace at or before column 60, or cut
           * at 60 characters when the text has no whitespace at all.
           */
          for (s=t+60; s > t && !isspace(*s); s--) ;
          for (s2=s; *s2 && isspace(*s2); s2++) ;
          while (s > t && isspace(*(s-1))) s--;
          if (s == t) {
            s = t + 60;
            size = 60;
          }
          else {
            size = s - t;
            s = s2;
          }
          startflag = 0;
        }

        memcpy(buffer+pos, t, size);
        buffer[pos+size] = '\0';
        add_comment(&info, buffer, buffer + pos + size, 0, 0);
      }
    }

    /* A single blank as a separator, then the entry's own header text. */
    add_comment(&info, buffer, buffer + 1, 0, 0);

    add_comment(&info, isfp->fp_entrystart, isfp->fp_seqstart, 0, 0);
  }

  /*
   * Finish the last fields in the INFO structure.
   */
BOUT_GI_END:
  if (allflag)
    add_retrieval(&info, 3, isfp->fout_progname);

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12752 
12753 
/*
 * phyint_getinfo
 *
 * Fill in the information fields selected by `flag' for the current
 * sequence of a "PHYLIP-Int" entry.
 *
 * Parameters:   isfp   -  the open sequence file structure
 *               entry  -  the text of the current entry
 *               len    -  the number of characters in `entry'
 *               flag   -  one of the SEQINFO_* selectors
 *
 * Returns:  STATUS_OK on success, the status of the format's getseq
 *           function if the length computation fails, or whatever the
 *           memory_error macro returns when info.error is set
 *           (STATUS_FATAL is the value passed to it here).
 */
static int phyint_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int allflag, status;
  char *s, *entryend;
  INFO info;

  /*
   * Move fp_seqstart forward, one line per sequence, until it reaches
   * the line of the sequence numbered entry_seqno.
   *
   * NOTE(review): this scan is not bounded by the end of the entry;
   * it relies on every skipped line ending with a newline.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    for (s=isfp->fp_seqstart; isspace((unsigned char) *s); s++) ;
    for (s++; *s != '\n'; s++) ;
    isfp->fp_seqstart = s + 1;
    isfp->malign_seqno++;
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they were asked for and have
   * not been computed already.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }
  }

  /*
   * The description is the first 10 characters of the sequence's line.
   */
  if (allflag || flag == SEQINFO_DESCRIPTION)
    add_description(&info, isfp->fp_seqstart, isfp->fp_seqstart+10);

  /*
   * The first line of the entry is kept as a comment.  The scan stops
   * at the end of the entry if that line has no newline.
   */
  if (flag == SEQINFO_ALLINFO) {
    entryend = entry + len;
    for (s=entry; s < entryend && *s != '\n'; s++) ;
    add_comment(&info, entry, s, 0, 0);
  }

  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "PHYLIP-Int");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12818 
12819 
/*
 * physeq_getinfo
 *
 * Fill in the information fields selected by `flag' for the current
 * sequence of a "PHYLIP-Seq" entry.
 *
 * Parameters:   isfp   -  the open sequence file structure
 *               entry  -  the text of the current entry
 *               len    -  the number of characters in `entry'
 *               flag   -  one of the SEQINFO_* selectors
 *
 * Returns:  STATUS_OK on success, the status of the format's getseq
 *           function if the length computation fails, or whatever the
 *           memory_error macro returns when info.error is set
 *           (STATUS_FATAL is the value passed to it here).
 */
static int physeq_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int seqpos, seqlen, allflag, status;
  char *s, *end, *entryend;
  INFO info;

  /*
   * Move fp_seqstart forward, one sequence at a time, until it reaches
   * the sequence numbered entry_seqno.  A sequence is skipped by
   * stepping over its first 10 characters, counting entry_seqlen
   * characters that are neither whitespace nor digits, and then moving
   * to the start of the following line.
   *
   * NOTE(review): the final scan for the newline is not bounded by
   * fp_entryend; it relies on a newline following the sequence.
   */
  while (isfp->malign_seqno < isfp->entry_seqno) {
    s = isfp->fp_seqstart+10;
    end = isfp->fp_entryend;
    seqlen = isfp->entry_seqlen;
    for (seqpos=0; s < end && seqpos < seqlen; s++)
      if (!(isspace((unsigned char) *s) || isdigit((unsigned char) *s)))
        seqpos++;
    for ( ; *s != '\n'; s++) ;
    isfp->fp_seqstart = s + 1;
    isfp->malign_seqno++;
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they were asked for and have
   * not been computed already.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }
  }

  /*
   * The description is the first 10 characters of the sequence.
   */
  if (allflag || flag == SEQINFO_DESCRIPTION)
    add_description(&info, isfp->fp_seqstart, isfp->fp_seqstart+10);

  /*
   * The first line of the entry is kept as a comment.  The scan stops
   * at the end of the entry if that line has no newline.
   */
  if (flag == SEQINFO_ALLINFO) {
    entryend = entry + len;
    for (s=entry; s < entryend && *s != '\n'; s++) ;
    add_comment(&info, entry, s, 0, 0);
  }

  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "PHYLIP-Seq");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12889 
12890 
/*
 * clustal_getinfo
 *
 * Fill in the information fields selected by `flag' for the current
 * sequence of a "Clustalw" entry.  The `entry' and `len' parameters
 * are not used; the entry is reached through `isfp'.
 *
 * Returns:  STATUS_OK on success, the status of the format's getseq
 *           function if the length computation fails, or whatever the
 *           memory_error macro returns when info.error is set
 *           (STATUS_FATAL is the value passed to it here).
 */
static int clustal_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int wantall, need_lengths, rc;
  char *cursor, *limit;
  INFO info;

  /*
   * Step over one line per sequence until the line of the sequence
   * numbered entry_seqno is reached, then remember that position.
   */
  if (isfp->malign_seqno < isfp->entry_seqno) {
    cursor = isfp->fp_seqstart;
    limit = isfp->fp_entryend;
    do {
      while (cursor < limit && *cursor != '\n')
        cursor++;
      cursor++;
      isfp->malign_seqno++;
    } while (isfp->malign_seqno < isfp->entry_seqno);
    isfp->fp_seqstart = cursor;
  }

  start_info(&info, isfp, flag);
  wantall = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (wantall) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if (isfp->db_alpha != NULL && (wantall || flag == SEQINFO_ALPHABET))
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * The lengths are computed only when one of them was requested and
   * has not been computed yet.
   */
  need_lengths = 0;
  if (!isfp->iflag_truelen && (wantall || flag == SEQINFO_TRUELEN))
    need_lengths = 1;
  else if (!isfp->iflag_rawlen && (wantall || flag == SEQINFO_RAWLEN))
    need_lengths = 1;

  if (need_lengths) {
    rc = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (rc != STATUS_OK && rc != STATUS_WARNING) {
      finish_info(&info, isfp);
      return rc;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }
  }

  /*
   * The description is the first 15 characters of the sequence's line.
   */
  if (wantall || flag == SEQINFO_DESCRIPTION)
    add_description(&info, isfp->fp_seqstart, isfp->fp_seqstart+15);

  if (flag == SEQINFO_ALL) {
    if (isfp->db_name != NULL)
      add_retrieval(&info, 2, isfp->db_name);
    else
      add_retrieval(&info, 1, "Clustalw");
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
12955 
12956 
/*
 * asn_getinfo
 *
 * Fill in the information fields selected by `flag' from the text of
 * one ASN.1 "seq" entry.  The entry is split with asn_parse into its
 * "id", "descr" and "inst" sub-records, and only the sub-records that
 * the requested fields need are examined.
 *
 * Parameters:   isfp   -  the open sequence file structure
 *               entry  -  the text of the current entry
 *               len    -  the number of characters in `entry'
 *               flag   -  one of the SEQINFO_* selectors
 *
 * Returns:  STATUS_OK on success, STATUS_ERROR (through raise_error,
 *           with E_PARSEERROR) when the entry cannot be parsed, the
 *           status of the format's getseq function if the length
 *           computation fails, or whatever the memory_error macro
 *           returns when info.error is set (STATUS_FATAL is the value
 *           passed to it here).
 *
 * NOTE(review): the parse-error returns that follow start_info do not
 * call finish_info, unlike the getseq failure path -- confirm that
 * this is intended.
 */
static int asn_getinfo(INTSEQFILE *isfp, char *entry, int len, int flag)
{
  int status, exists, alpha, oldpe, allflag;
  int gotyear, gotmonth, gotday, monthno;
  char *s, *t, datestr[12];
  char *idstr, *idend, *destr, *deend, *pirname, *pnend, *spname, *spend;
  char *gbname, *gbend, *emname, *emend, *otname, *otend, *piracc, *paend;
  char *spacc, *spaend, *gbacc, *gbaend, *emacc, *emaend, *otacc, *otaend;
  char *pdbmol, *pmend, *gistr, *gisend, *giim, *giimend, *name, *nameend;
  char *title, *titleend, *org, *orgend, *common, *commonend, *pirorg, *dend;
  char *poend, *pdbcomp, *pcend, *pdbsrc, *psend, *comment, *cmend, *date;
  char *cdate, *cdend, *udate, *udend, *gbdate, *gbdend, *gbedate, *gbeend;
  char *emcdate, *emcend, *emudate, *emuend, *pirdate, *pirend, *spcdate;
  char *spcend, *spudate, *spuend, *spadate, *pdbdate, *pdbend, *dbjacc;
  char *gibbs, *gibbsend, *gibbm, *gibbmend, *dbjname, *dbjend, *dbjaend;
  char *prfname, *prfend, *prfacc, *prfaend, *molstr, *molend, *dnastr;
  char *rnastr, *partstr, *topstr, *topend, *inststr, *instend;
  INFO info;

  /*
   * The first non-blank text of the entry must match "SEQ ".
   */
  for (s=entry; s < entry + len && isspace((unsigned char) *s); s++) ;
  if (!mystreq(s, 'S', "SEQ ")) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of ASN.1 "
                              "Bioseq-set.seq-set.seq entry.\n",
                              isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of ASN.1 "
                              "Bioseq-set.seq-set.seq entry.\n"));
    }
  }

  start_info(&info, isfp, flag);
  allflag = (flag == SEQINFO_ALL || flag == SEQINFO_ALLINFO);

  if (allflag) {
    set_filename(&info, isfp->filename);
    if (isfp->db_name != NULL)
      set_dbname(&info, isfp->db_name);
    set_format(&info, seqfformat(isfp, 0));

    if (!info.error) {
      info.info->entryno = isfp->entry_count;
      info.info->seqno = isfp->entry_seqno;
      info.info->numseqs = isfp->entry_numseqs;
    }
  }

  if ((allflag || flag == SEQINFO_ALPHABET) && isfp->db_alpha != NULL)
    set_alphabet(&info, get_alphabet(isfp->db_alpha, NULL));

  /*
   * Compute the sequence lengths only if they were asked for and have
   * not been computed already.  A request for just a length needs
   * nothing else from the entry.
   */
  if (((allflag || flag == SEQINFO_TRUELEN) && !isfp->iflag_truelen) ||
      ((allflag || flag == SEQINFO_RAWLEN) && !isfp->iflag_rawlen)) {
    status = (*file_table[isfp->format].getseq_fn)(isfp, GETSEQ_LENGTHS);
    if (status != STATUS_OK && status != STATUS_WARNING) {
      finish_info(&info, isfp);
      return status;
    }
    if (!info.error) {
      info.info->truelen = isfp->entry_truelen;
      info.info->rawlen = isfp->entry_rawlen;
    }

    if (flag == SEQINFO_RAWLEN || flag == SEQINFO_TRUELEN)
      goto ASN_GI_END;
  }

  /*
   * Get the major pieces of the "seq" record, namely the "id",
   * "descr" and "inst" sub-records.  Error printing is switched off
   * around the asn_parse call and restored afterwards.
   */
  idstr = destr = inststr = NULL;
  oldpe = pe_flag;
  pe_flag = PE_NONE;
  status = asn_parse(entry, entry+len,
                     "seq.id", &idstr, &idend,
                     "seq.descr", &destr, &deend,
                     "seq.inst", &inststr, &instend,
                     NULL);
  pe_flag = oldpe;
  if (status == -1) {
    if (isfp->filename && isfp->filename[0]) {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("%s, entry %d:  Invalid format of ASN.1 "
                              "Bioseq-set.seq-set.seq entry.\n",
                              isfp->filename, isfp->entry_count));
    }
    else {
      raise_error(E_PARSEERROR, return STATUS_ERROR,
                  print_error("seqfparseent:  Invalid format of ASN.1 "
                              "Bioseq-set.seq-set.seq entry.\n"));
    }
  }

  /*
   * Parse the information in the `id' sub-record.
   */
  if (idstr != NULL && (allflag || flag == SEQINFO_IDLIST)) {
    pirname = spname = gbname = emname = otname = prfname = dbjname = NULL;
    piracc = spacc = gbacc = emacc = otacc = prfacc = dbjacc = NULL;
    pdbmol = gistr = giim = gibbs = gibbm = NULL;

    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(idstr, idend,
                       "id.pir.name", &pirname, &pnend,
                       "id.swissprot.name", &spname, &spend,
                       "id.genbank.name", &gbname, &gbend,
                       "id.embl.name", &emname, &emend,
                       "id.ddbj.name", &dbjname, &dbjend,
                       "id.prf.name", &prfname, &prfend,
                       "id.other.name", &otname, &otend,
                       "id.pir.accession", &piracc, &paend,
                       "id.swissprot.accession", &spacc, &spaend,
                       "id.genbank.accession", &gbacc, &gbaend,
                       "id.embl.accession", &emacc, &emaend,
                       "id.ddbj.accession", &dbjacc, &dbjaend,
                       "id.prf.accession", &prfacc, &prfaend,
                       "id.other.accession", &otacc, &otaend,
                       "id.pdb.mol", &pdbmol, &pmend,
                       "id.gi", &gistr, &gisend,
                       "id.giim.id", &giim, &giimend,
                       "id.gibbsq", &gibbs, &gibbsend,
                       "id.gibbmt", &gibbm, &gibbmend,
                       NULL);
    pe_flag = oldpe;
    if (status == -1) {
      if (isfp->filename && isfp->filename[0]) {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("%s, entry %d:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n",
                                isfp->filename, isfp->entry_count));
      }
      else {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("seqfparseent:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n"));
      }
    }

    /*
     * Set the id and accession values.  The offset added to each
     * pointer is the length of the field's own name ("name",
     * "accession", "mol", "gi", "id", "gibbsq", "gibbmt").
     */
    if (gbname != NULL)
      add_id(&info, "gb", gbname+4, gbend);
    if (gbacc != NULL)
      add_id(&info, "acc", gbacc+9, gbaend);

    if (pirname != NULL)
      add_id(&info, "pir", pirname+4, pnend);
    if (piracc != NULL)
      add_id(&info, "acc", piracc+9, paend);

    if (emname != NULL)
      add_id(&info, "embl", emname+4, emend);
    if (emacc != NULL)
      add_id(&info, "acc", emacc+9, emaend);

    if (spname != NULL)
      add_id(&info, "sp", spname+4, spend);
    if (spacc != NULL)
      add_id(&info, "acc", spacc+9, spaend);

    if (dbjname != NULL)
      add_id(&info, "ddbj", dbjname+4, dbjend);
    if (dbjacc != NULL)
      add_id(&info, "acc", dbjacc+9, dbjaend);

    if (prfname != NULL)
      add_id(&info, "prf", prfname+4, prfend);
    if (prfacc != NULL)
      add_id(&info, "acc", prfacc+9, prfaend);

    if (otname != NULL)
      add_id(&info, "oth", otname+4, otend);
    if (otacc != NULL)
      add_id(&info, "acc", otacc+9, otaend);

    if (pdbmol != NULL)
      add_id(&info, "pdb", pdbmol+3, pmend);

    if (gistr != NULL)
      add_id(&info, "gi", gistr+2, gisend);
    if (giim != NULL)
      add_id(&info, "giim", giim+2, giimend);
    if (gibbs != NULL)
      add_id(&info, "bbs", gibbs+6, gibbsend);
    if (gibbm != NULL)
      add_id(&info, "bbm", gibbm+6, gibbmend);
  }

  if (destr != NULL && (allflag || flag == SEQINFO_DATE)) {
    /*
     * Look for just the date.  (spaend is reused here as the end of
     * the "descr.sp.annotupd" field; its use for the swissprot
     * accession above is already finished.)
     */
    cdate = udate = gbdate = gbedate = emcdate = emudate = NULL;
    pirdate = spcdate = spudate = spadate = pdbdate = NULL;

    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(destr, deend,
                       "descr.create-date", &cdate, &cdend,
                       "descr.update-date", &udate, &udend,
                       "descr.genbank.date", &gbdate, &gbdend,
                       "descr.genbank.entry-date", &gbedate, &gbeend,
                       "descr.embl.creation-date", &emcdate, &emcend,
                       "descr.embl.update-date", &emudate, &emuend,
                       "descr.pir.date", &pirdate, &pirend,
                       "descr.sp.created", &spcdate, &spcend,
                       "descr.sp.sequpd", &spudate, &spuend,
                       "descr.sp.annotupd", &spadate, &spaend,
                       "descr.pdb.deposition", &pdbdate, &pdbend,
                       NULL);
    pe_flag = oldpe;
    if (status == -1) {
      if (isfp->filename && isfp->filename[0]) {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("%s, entry %d:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n",
                                isfp->filename, isfp->entry_count));
      }
      else {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("seqfparseent:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n"));
      }
    }

    /*
     * Pick the first date field present, in this order of preference.
     * The pir and genbank.date fields are plain strings and are
     * handled separately (exists == 0).
     */
    exists = 1;
    date = dend = NULL;
    if (cdate != NULL)  { date = cdate; dend = cdend; }
    else if (udate != NULL)  { date = udate; dend = udend; }
    else if (gbedate != NULL)  { date = gbedate; dend = gbeend; }
    else if (emudate != NULL)  { date = emudate; dend = emuend; }
    else if (emcdate != NULL)  { date = emcdate; dend = emcend; }
    else if (spadate != NULL)  { date = spadate; dend = spaend; }
    else if (spudate != NULL)  { date = spudate; dend = spuend; }
    else if (spcdate != NULL)  { date = spcdate; dend = spcend; }
    else if (pdbdate != NULL)  { date = pdbdate; dend = pdbend; }
    else {
      exists = 0;
      if (pirdate != NULL) {
        /*
         * Use the 11 characters at the start of the quoted string, or
         * the 11 characters following the last "#..." token in it
         * that leaves room for them.
         */
        for (s=pirdate+4; s < pirend && *s != '"'; s++) ;
        for (t=++s; t < pirend && *t != '"'; t++) {
          if (*t == '#') {
            while (t < pirend && !isspace((unsigned char) *t)) t++;
            while (t < pirend && isspace((unsigned char) *t)) t++;
            if (t+11 < pirend)
              s = t;
          }
        }
        set_date(&info, s, s+11);
      }
      else if (gbdate != NULL)
        set_date(&info, gbdate+4, gbdend);
    }

    if (exists) {
      /*
       * Skip the field name and the blanks after it.  The value is
       * then either `str "..."' or `std { year .., month .., day .. }'.
       */
      for (s=date; s < dend && !isspace((unsigned char) *s); s++) ;
      for ( ; s < dend && isspace((unsigned char) *s); s++) ;
      if (s + 3 < dend && *s == 's' && s[1] == 't') {
        if (s[2] == 'r') {
          for (s+=3; s < dend && *s != '"'; s++) ;
          for (t=++s; t < dend && *t != '"'; t++) ;
          if (t < dend)
            set_date(&info, s, t);
        }
        else if (s[2] == 'd') {
          /*
           * Build "DD-MON-YYYY" in datestr.  Each of the three parts
           * is tracked separately so that the date is only set once
           * all of them have really been filled in.
           */
          s += 3;
          gotyear = gotmonth = gotday = 0;
          while (s < dend && !(gotyear && gotmonth && gotday)) {
            while (s < dend && !isalpha((unsigned char) *s)) s++;
            if (s == dend)
              break;

            if (*s == 'y' && strncmp(s, "year ", 5) == 0) {
              for (s+=5; s < dend && isspace((unsigned char) *s); s++) ;
              if (s + 4 < dend && isdigit((unsigned char) *s) &&
                  isdigit((unsigned char) s[1]) &&
                  isdigit((unsigned char) s[2]) &&
                  isdigit((unsigned char) s[3])) {
                datestr[7] = s[0];
                datestr[8] = s[1];
                datestr[9] = s[2];
                datestr[10] = s[3];
                gotyear = 1;
              }
            }
            else if (*s == 'm' && strncmp(s, "month ", 6) == 0) {
              for (s+=6; s < dend && isspace((unsigned char) *s); s++) ;
              if (s + 2 < dend && isdigit((unsigned char) *s) &&
                  (isspace((unsigned char) s[1]) ||
                   isdigit((unsigned char) s[1]))) {
                /*
                 * Compute the real month number from one or two
                 * digits.  months[] is assumed to hold the month
                 * abbreviations at indices 1 to 12 -- TODO confirm
                 * against its definition.
                 */
                monthno = s[0] - '0';
                if (isdigit((unsigned char) s[1]))
                  monthno = monthno * 10 + (s[1] - '0');
                if (monthno >= 1 && monthno <= 12) {
                  t = months[monthno];
                  datestr[3] = t[0];
                  datestr[4] = t[1];
                  datestr[5] = t[2];
                  gotmonth = 1;
                }
              }
            }
            else if (*s == 'd' && strncmp(s, "day ", 4) == 0) {
              for (s+=4; s < dend && isspace((unsigned char) *s); s++) ;
              if (s + 2 < dend && isdigit((unsigned char) *s) &&
                  (isspace((unsigned char) s[1]) ||
                   isdigit((unsigned char) s[1]))) {
                if (isspace((unsigned char) s[1])) {
                  datestr[0] = '0';
                  datestr[1] = s[0];
                }
                else {
                  datestr[0] = s[0];
                  datestr[1] = s[1];
                }
                gotday = 1;
              }
            }
            else {
              /*
               * Any other word: step over it, so that the scan always
               * makes progress.
               */
              while (s < dend && isalpha((unsigned char) *s)) s++;
            }
          }
          if (gotyear && gotmonth && gotday) {
            datestr[2] = datestr[6] = '-';
            datestr[11] = '\0';
            set_date(&info, datestr, datestr+11);
          }
        }
      }
    }
  }

  /*
   * Get the description, organism and comment information.
   */
  if (destr != NULL && (allflag || flag == SEQINFO_DESCRIPTION ||
                        flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST ||
                        flag == SEQINFO_ORGANISM || flag == SEQINFO_FRAGMENT ||
                        flag == SEQINFO_ALPHABET)) {
    name = title = org = comment = common = pirorg = pdbcomp = NULL;
    pdbsrc = molstr = dnastr = rnastr = partstr = NULL;

    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(destr, deend,
                       "descr.name", &name, &nameend,
                       "descr.comment", &comment, &cmend,
                       "descr.title", &title, &titleend,
                       "descr.org.taxname", &org, &orgend,
                       "descr.org.common", &common, &commonend,
                       "descr.pir.source", &pirorg, &poend,
                       "descr.pdb.compound", &pdbcomp, &pcend,
                       "descr.pdb.source", &pdbsrc, &psend,
                       "descr.mol-type", &molstr, &molend,
                       "descr.modif.dna", &dnastr, NULL,
                       "descr.modif.rna", &rnastr, NULL,
                       "descr.modif.partial", &partstr, NULL,
                       NULL);
    pe_flag = oldpe;
    if (status == -1) {
      if (isfp->filename && isfp->filename[0]) {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("%s, entry %d:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n",
                                isfp->filename, isfp->entry_count));
      }
      else {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("seqfparseent:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n"));
      }
    }

    /*
     * Check the alphabet and isfragment information.
     */
    if (partstr != NULL && (allflag || flag == SEQINFO_FRAGMENT))
      set_fragment(&info, 1);

    if (allflag || flag == SEQINFO_ALPHABET) {
      if (dnastr != NULL)
        set_alphabet(&info, DNA);
      else if (rnastr != NULL)
        set_alphabet(&info, RNA);
      else if (molstr != NULL) {
        for (s=molstr+8; s < molend && isspace((unsigned char) *s); s++) ;
        for (t=s; s < molend && !isspace((unsigned char) *s) && *s != ',';
             s++) ;
        if ((alpha = get_alphabet(t, s)) != UNKNOWN)
          set_alphabet(&info, alpha);
      }
    }

    /*
     * Add the description, if it's there.
     */
    if (allflag || flag == SEQINFO_DESCRIPTION) {
      if (title != NULL)
        add_description(&info, title+5, titleend);
      else if (pdbcomp != NULL) {
        for (s=pdbcomp+8; s < pcend && *s != '"'; s++) ;
        if (s < pcend) {
          for (t=++s; s < pcend && *s != '"'; s++) ;
          if (s < pcend)
            add_description(&info, t, s);
        }
      }
      else if (name != NULL)
        add_description(&info, name+4, nameend);
    }

    /*
     * Add the organism name, if it's there.
     */
    if (allflag || flag == SEQINFO_ORGANISM) {
      if (org != NULL)
        add_organism(&info, org+7, orgend);
      else if (common != NULL)
        add_organism(&info, common+6, commonend);
      else if (pirorg != NULL)
        add_organism(&info, pirorg+6, poend);
      else if (pdbsrc != NULL) {
        for (s=pdbsrc+8; s < psend && *s != '"'; s++) ;
        if (s < psend) {
          for (t=++s; s < psend && *s != '"'; s++) ;
          if (s < psend)
            add_organism(&info, t, s);
        }
      }
    }

    /*
     * Hand the text of every "comment" field to parse_comment, with
     * the surrounding blanks and quotes trimmed off.  The closing
     * quote is only trimmed while text remains, so the end never
     * moves in front of the start.
     */
    if (allflag || flag == SEQINFO_COMMENT || flag == SEQINFO_IDLIST) {
      if (comment != NULL) {
        for (s=comment+7; s < cmend && isspace((unsigned char) *s); s++) ;
        if (*s == '"') s++;
        for (t=cmend; t > s && isspace((unsigned char) *(t-1)); t--) ;
        if (t > s && *(t-1) == '"') t--;

        parse_comment(&info, s, t, 1, 1, flag);

        while (asn_parse(cmend+1, deend,
                         "comment", &comment, &cmend, NULL) == 1) {
          for (s=comment+7; s < cmend && isspace((unsigned char) *s); s++) ;
          if (*s == '"') s++;
          for (t=cmend; t > s && isspace((unsigned char) *(t-1)); t--) ;
          if (t > s && *(t-1) == '"') t--;

          parse_comment(&info, s, t, 1, 1, flag);
        }
      }
    }
  }

  /*
   * Check the information in the "inst" record.
   */
  if (inststr != NULL &&
      (allflag || flag == SEQINFO_ALPHABET || flag == SEQINFO_CIRCULAR)) {
    molstr = topstr = NULL;

    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(inststr, instend,
                       "inst.mol", &molstr, &molend,
                       "inst.topology", &topstr, &topend,
                       NULL);
    pe_flag = oldpe;
    if (status == -1) {
      if (isfp->filename && isfp->filename[0]) {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("%s, entry %d:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n",
                                isfp->filename, isfp->entry_count));
      }
      else {
        raise_error(E_PARSEERROR, return STATUS_ERROR,
                    print_error("seqfparseent:  Invalid format of ASN.1 "
                                "Bioseq-set.seq-set.seq entry.\n"));
      }
    }

    if (molstr != NULL && (allflag || flag == SEQINFO_ALPHABET)) {
      for (s=molstr+3; s < molend && isspace((unsigned char) *s); s++) ;
      for (t=s; s < molend && !isspace((unsigned char) *s) && *s != ',';
           s++) ;
      if ((alpha = get_alphabet(t, s)) != UNKNOWN)
        set_alphabet(&info, alpha);
    }

    if (topstr != NULL && (allflag || flag == SEQINFO_CIRCULAR)) {
      for (s=topstr+8; s < topend && isspace((unsigned char) *s); s++) ;
      if (mystreq(s, 'C', "CIRCULAR"))
        set_circular(&info, 1);
    }
  }

ASN_GI_END:
  /*
   * Finish the last fields in the INFO structure.
   */
  if (flag == SEQINFO_ALL) {
    if (isfp->db_name == NULL)
      add_retrieval(&info, 1, "ASN.1");
    else
      add_retrieval(&info, 2, isfp->db_name);
  }

  finish_info(&info, isfp);
  memory_error(info.error, return STATUS_FATAL);
  return STATUS_OK;
}
13458 
13459 
13460 
13461 
13462 /*
13463  *
13464  *
13465  * Sequence Output Section
13466  *
13467  *
13468  *
13469  */
13470 
/*
 * Extra return values of guessalpha, for text that is taken to be
 * neither a nucleotide nor a protein sequence:  PLAIN when the text
 * contains newlines, RAW when it does not.
 */
#define PLAIN -1
#define RAW -2
13473 
guessalpha(char * seq,int seqlen,int * align_out,int * truelen_out)13474 static int guessalpha(char *seq, int seqlen, int *align_out, int *truelen_out)
13475 {
13476   int i, dna, u, t, others, newlines, gaps, truelen;
13477   char ch;
13478 
13479   dna = u = t = gaps = others = newlines = truelen = 0;
13480   for (i=0; i < seqlen; i++) {
13481     if (isalpha(seq[i]))
13482       truelen++;
13483 
13484     ch = toupper(seq[i]);
13485     if (ch == 'A' || ch == 'G' || ch == 'C' || ch == 'T' || ch == 'U') {
13486       dna++;
13487       if (ch == 'U')
13488         u++;
13489       else if (ch == 'T')
13490         t++;
13491     }
13492     else if (ch == '-' || ch == '.' || ch == '(' || ch == ')' || ch == ',')
13493       gaps++;
13494     else if (ch == '\n')
13495       newlines++;
13496     else if (!isalpha(ch) && !isspace(ch))
13497       others++;
13498   }
13499 
13500   if (truelen_out)
13501     *truelen_out = truelen;
13502 
13503   if (newlines == 0 && ((float) dna / (float) seqlen) >= 0.85) {
13504     if (align_out)
13505       *align_out = (others + gaps > 0);
13506     return (t == 0 && u != 0 ? RNA : DNA);
13507   }
13508   else if (others + newlines == 0) {
13509     if (align_out)
13510       *align_out = (gaps > 0);
13511     return PROTEIN;
13512   }
13513   else
13514     return (newlines ? PLAIN : RAW);
13515 }
13516 
/*
 * putline
 *
 * Write `string' to `fp', breaking it into lines.
 *
 * A newline inside `string' always starts a new output line, and a
 * newline that is the last character of `string' is dropped.  Every
 * new output line begins with `filler'.
 *
 * If `width' is 0, no wrapping is done.  Otherwise, lines are broken
 * at whitespace so that they do not run past column `width'.  A word
 * too long to fit on a line of its own is written whole.
 *
 * Parameters:   fp           -  the output stream
 *               flag         -  non-zero if text is already on the
 *                               current line, in which case a blank
 *                               (or, when there is no room left, a
 *                               line break) is written first
 *               pos          -  the current column on the output line
 *               string       -  the text to write
 *               width        -  the maximum line width, or 0
 *               filler       -  the text that starts each new line
 *               addcontflag  -  non-zero to indent by two more blanks
 *                               the lines created by wrapping (and,
 *                               while `flag' is still set, the line
 *                               after a newline in `string')
 *
 * Returns:  the column after the last character written, or 0 if
 *           `width' is 0.
 */
static int putline(FILE *fp, int flag, int pos, char *string, int width,
                   char *filler, int addcontflag)
{
  int len, flen, contflag;
  char *s, *t;

  if (width == 0) {
    if (flag)
      fputc(' ', fp);

    for (s=string; *s; s++) {
      for (t=s; *s && *s != '\n'; s++) ;
      if (t != s)
        fwrite(t, 1, s - t, fp);

      /*
       * Stop at the end of the string, or at a final newline.
       */
      if (!*s || !s[1])
        break;

      fputc('\n', fp);
      fputs(filler, fp);
    }

    return 0;
  }
  else {
    flen = strlen(filler);
    if (flag) {
      if (pos+1 >= width) {
        fputc('\n', fp);
        fputs(filler, fp);
        pos = flen;
        if (addcontflag) {
          fputs("  ", fp);
          pos += 2;
        }
      }
      else {
        fputc(' ', fp);
        pos++;
      }
    }

    s = string;
    while (*s) {
      /*
       * Scan forward to the end of the input line, or to the
       * character that would land on column `width'.
       */
      for (t=s,len=pos; *s && *s != '\n'; s++, len++)
        if (len == width)
          break;

      /*
       * If the scan stopped inside the line, back up to the end of the
       * last word that fits.  If no word fits and nothing else is on
       * the output line, take the whole word anyway.
       */
      if (*s && *s != '\n') {
        if (isspace((unsigned char) *s))
          while (s > t && isspace((unsigned char) *(s-1))) s--;
        else {
          while (s > t && !isspace((unsigned char) *(s-1))) s--;
          if (s == t && !flag)
            while (*s && !isspace((unsigned char) *s)) s++;
          else
            while (s > t && isspace((unsigned char) *(s-1))) s--;
        }
      }

      if (t != s) {
        fwrite(t, 1, s - t, fp);
        pos += s - t;
      }

      /*
       * Skip the blanks at the break.  A line break caused by a
       * newline in the string is only indented while `flag' is still
       * set; a break caused by wrapping is always indented (if
       * `addcontflag' is set).
       */
      contflag = (addcontflag ? flag : 0);
      while (*s && isspace((unsigned char) *s) && *s != '\n') s++;
      if (!*s)
        break;
      else if (*s == '\n') {
        if (!s[1])
          break;
        s++;
      }
      else
        contflag = addcontflag;

      fputc('\n', fp);
      fputs(filler, fp);
      pos = flen;
      flag = 0;
      if (contflag) {
        fputs("  ", fp);
        pos += 2;
      }
    }

    return pos;
  }
}
13607 
13608 
/*
 * put_oneline
 *
 * Write a one-line summary of an entry to `fp`:  the identifier list,
 * the description, " - " plus the organism and, when `truelen` is
 * positive, the length with its unit ("bp", "aa" or "ch") followed by an
 * optional parenthesized note (circular / RNA / fragment) and a period.
 * No newline is written.
 */
static void put_oneline(FILE *fp, SEQINFO *info, int truelen, int alpha,
                        char *idlist)
{
  int need_sep;
  const char *units;

  need_sep = (idlist && idlist[0]);
  if (need_sep)
    fputs(idlist, fp);

  if (info->description && info->description[0])
    fprintf(fp, "%s%s", (need_sep ? " " : ""), info->description);

  if (info->organism && info->organism[0])
    fprintf(fp, " - %s", info->organism);

  if (truelen <= 0)
    return;

  if (alpha == DNA || alpha == RNA)
    units = "bp";
  else if (alpha == PROTEIN)
    units = "aa";
  else
    units = "ch";
  fprintf(fp, ", %d %s", truelen, units);

  if (alpha == RNA || info->iscircular || info->isfragment) {
    fputs(" (", fp);

    /* From here on need_sep tracks whether the note already has text. */
    need_sep = 0;
    if (info->iscircular) {
      fputs("circular", fp);
      need_sep = 1;
    }
    if (alpha == RNA) {
      fputs((need_sep ? " RNA" : "RNA"), fp);
      need_sep = 1;
    }
    if (info->isfragment) {
      if (info->fragstart > 0)
        fprintf(fp, "%sf. %d-%d", (need_sep ? ", " : ""), info->fragstart,
                info->fragstart + truelen - 1);
      else
        fputs((need_sep ? " fragment" : "fragment"), fp);
    }

    fputc(')', fp);
  }

  fputc('.', fp);
}
13669 
simple_putseq(FILE * fp,char * seq,int seqlen,int alpha,int align_flag,int prettyflag)13670 static void simple_putseq(FILE *fp, char *seq, int seqlen, int alpha,
13671                           int align_flag, int prettyflag)
13672 {
13673   int i, j;
13674 
13675   if (seqlen == 0)
13676     return;
13677 
13678   if (!align_flag && (alpha == DNA || alpha == RNA || alpha == PROTEIN)) {
13679     i = j = 0;
13680     if (prettyflag != 1)
13681       fputs("  ", fp);
13682     while (1) {
13683       fputc(seq[i++], fp);
13684       if (i == seqlen)
13685         break;
13686 
13687       if (++j == 60) {
13688         fputc('\n', fp);
13689         if (prettyflag != 1)
13690           fputs("  ", fp);
13691         j = 0;
13692       }
13693       else if (prettyflag != 1 && j % 10 == 0)
13694         fputc(' ', fp);
13695     }
13696   }
13697   else if (align_flag || alpha == RAW) {
13698     i = j = 0;
13699     while (1) {
13700       fputc(seq[i++], fp);
13701       if (i == seqlen)
13702         break;
13703 
13704       if (++j == 60) {
13705         fputc('\n', fp);
13706         j = 0;
13707       }
13708     }
13709   }
13710   else if (alpha == PLAIN) {
13711     fwrite(seq, 1, seqlen, fp);
13712   }
13713 }
13714 
13715 
/*
 * gcg_checksum
 *
 * Compute the GCG-style checksum of the first `seqlen` characters of
 * `seq`:  each character is upper-cased ('-' counts as '.'), multiplied
 * by a weight cycling through 1..57, and added to `seed`; the result is
 * the sum modulo 10000.
 *
 * The running sum is reduced whenever it reaches 10000 so that it cannot
 * overflow an int on long sequences.  Characters are converted to
 * unsigned char before being handed to toupper(), as <ctype.h> requires.
 */
static int gcg_checksum(char *seq, int seqlen, int seed)
{
  int i, ch, checksum;

  checksum = seed;
  for (i=0; i < seqlen; i++) {
    /*
     * Reducing only a value that is already >= 10000 (hence positive)
     * leaves the final remainder unchanged.
     */
    if (checksum >= 10000)
      checksum %= 10000;

    ch = (seq[i] == '-' ? '.' : toupper((unsigned char) seq[i]));
    checksum += (i % 57 + 1) * ch;
  }

  return (checksum % 10000);
}
13729 
13730 
/*
 * gcg_checksum2
 *
 * Same computation as the GCG checksum of `seq`, except that the
 * sequence is treated as if it were padded with '.' characters from
 * position `seqlen` up to `maxlen`.
 *
 * The running sum is reduced whenever it reaches 10000 so that it cannot
 * overflow an int on long sequences.  Characters are converted to
 * unsigned char before being handed to toupper(), as <ctype.h> requires.
 */
static int gcg_checksum2(char *seq, int seqlen, int maxlen, int seed)
{
  int i, ch, checksum;

  checksum = seed;
  for (i=0; i < seqlen; i++) {
    /*
     * Reducing only a value that is already >= 10000 (hence positive)
     * leaves the final remainder unchanged.
     */
    if (checksum >= 10000)
      checksum %= 10000;

    ch = (seq[i] == '-' ? '.' : toupper((unsigned char) seq[i]));
    checksum += (i % 57 + 1) * ch;
  }

  /* Account for the implied padding. */
  for ( ; i < maxlen; i++) {
    if (checksum >= 10000)
      checksum %= 10000;

    checksum += (i % 57 + 1) * '.';
  }

  return (checksum % 10000);
}
13746 
13747 
/*
 * putgcgseq
 *
 * Write the GCG portion of an entry to `fp`:  a blank line, the info
 * line (one-line description, "Length:", today's date, "Type:" and
 * "Check:" fields, terminated by ".."), and then the sequence in lines
 * of five 10-character groups, each line preceded by the position of its
 * first character and followed by an empty line.  Any '-' in the
 * sequence is written as '.'.
 */
static void putgcgseq(FILE *fp, char *seq, int seqlen, SEQINFO *info)
{
  int i, j, k, alpha, align_flag, count;
  char *date, *s;

  /*
   * A full sequence line needs 9 characters for the position field,
   * 5 * 11 for the groups, two newlines and the terminator (67 bytes),
   * so the buffer must be comfortably larger than that.
   */
  char buffer[128];

  fputc('\n', fp);

  /*
   * Print the infoline.
   */
  seqfoneline(info, buffer, 20, 1);
  fputs("  ", fp);
  fputs(buffer, fp);
  fputs("  ", fp);
  fprintf(fp, "Length: %d  ", seqlen);

  /*
   * The date fields are picked out of the string returned by get_today()
   * by fixed offset (day at 0, month name at 3, then date+7 and date+12).
   * NOTE(review): the exact layout of that string is defined elsewhere;
   * confirm against get_today().
   */
  date = get_today();
  for (i=1; i <= 12; i++)
    if (myncasecmp(date+3, months[i], 3) == 0)
      break;
  if (i <= 12)
    fprintf(fp, "%s %c%c, %s %s  ", gcg_full_months[i], date[0], date[1],
            date+7, date+12);

  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;
  else
    alpha = guessalpha(seq, seqlen, &align_flag, NULL);

  if (alpha == RNA || alpha == DNA)
    fputs("Type: N  ", fp);
  else if (alpha == PROTEIN)
    fputs("Type: P  ", fp);

  fprintf(fp, "Check: %d  ..\n\n", gcg_checksum(seq, seqlen, 0));

  /*
   * Print the gcg sequence lines.
   */
  for (i=0,count=1; i < seqlen; count+=50) {
    /*
     * Continue right after the position field, wherever it ends, so a
     * position wider than eight digits is not partly overwritten.
     */
    s = buffer + sprintf(buffer, "%8d ", count);
    for (j=0; i < seqlen && j < 5; j++) {
      for (k=0; i < seqlen && k < 10; k++,i++)
        *s++ = (seq[i] == '-' ? '.' : seq[i]);
      *s++ = ' ';
    }
    *s++ = '\n';
    *s++ = '\n';
    *s = '\0';
    fputs(buffer, fp);
  }
}
13800 
13801 
13802 
/*
 * raw_putseq
 *
 * Write the sequence to the output file exactly as given, with nothing
 * added.  The text is written with fputs(), so `seq` must be
 * NUL-terminated; `seqlen` and `info` are not used.
 *
 * Always returns STATUS_OK.
 */
static int raw_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  FILE *out = isfp->output_fp;

  fputs(seq, out);

  return STATUS_OK;
}
13808 
/*
 * plain_putseq
 *
 * Write the sequence verbatim to the output file and make sure the
 * output ends with a newline.  `info` is not used.
 *
 * Always returns STATUS_OK.
 */
static int plain_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  simple_putseq(isfp->output_fp, seq, seqlen, PLAIN, 0, isfp->prettyflag);

  /*
   * Only look at the last character when there is one; an empty
   * sequence must not read seq[-1].
   */
  if (seqlen <= 0 || seq[seqlen-1] != '\n')
    fputc('\n', isfp->output_fp);
  return STATUS_OK;
}
13816 
/*
 * genbank_putseq
 *
 * Write one entry to the output file using GenBank-style line types:
 * LOCUS, DEFINITION, ACCESSION, NID, SOURCE/ORGANISM, COMMENT,
 * BASE COUNT and ORIGIN, followed by the sequence and "//".  When the
 * file's format is FORMAT_GCG, the sequence part is written by
 * putgcgseq() instead.
 *
 * The identifiers are taken from info->idlist, a '|'-separated list
 * whose items may carry prefixes such as "GB:", "ACC:" and "NID:".
 *
 * Always returns STATUS_OK.
 */
static int genbank_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                          SEQINFO *info)
{
  int i, j, k, a, c, g, t2, u, o, count, alpha, flag, flag2;
  char *s, *t, *id, *idend, buffer[128];
  char *gbid, *gbend, *shortid, *shend, *nid_start, *nid_end;
  FILE *fp = isfp->output_fp;

  /*
   * Look for either a "gb" identifier or a short, non-accession identifier.
   * If found, use that as the entry ID.  Otherwise, use the string
   * "(below)" if there are other non-accession identifiers (they get
   * listed in the comment section), or "Unknown" if there are none.
   *
   * If an ID was used, remember it so that it won't be duplicated when
   * the "SEQIO Refs:" line is printed.
   */
  id = NULL;
  if (info->idlist && info->idlist[0]) {
    gbid = gbend = shortid = shend = NULL;
    flag = 0;
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (!gbid && mystreq(t, 'G', "GB:")) {
        gbid = t;
        gbend = s;
        break;
      }
      if (!mystreq(t, 'A', "ACC:") && !mystreq(t, 'N', "NID:")) {
        flag = 1;
        if (!shortid && s - t <= 10) {
          shortid = t;
          shend = s;
        }
      }
      if (*s) s++;
    }
    if (gbid) {
      id = gbid + 3;
      idend = gbend;
    }
    else if (shortid) {
      id = shortid;
      idend = shend;
    }
    else if (flag) {
      id = "(below)";
      idend = id + 7;
    }
    else {
      id = "Unknown";
      idend = id + 7;
    }
  }
  else {
    id = "Unknown";
    idend = id + 7;
  }

  /*
   * Print the ID using an explicit length.  The ID is either a slice of
   * the idlist or a string literal, so it must not be terminated by
   * writing a '\0' into it (string literals may be read-only).
   */
  fprintf(fp, "LOCUS       %-10.*s", (int) (idend - id), id);

  /*
   * Print the rest of the LOCUS line.
   */
  alpha = (info->alphabet != UNKNOWN
             ? info->alphabet : guessalpha(seq, seqlen, NULL, NULL));

  fprintf(fp, "%7d %s %3s%-4s  %-10s%3s       %-11s\n",
          seqlen,                                         /* seq. length */
          (alpha == PROTEIN ? "aa" : "bp"),               /* char. type */
          "",                                             /* strand type */
          (alpha == RNA ? "RNA" :                         /* seq. type */
            (alpha == DNA ? "DNA" :
              (alpha == PROTEIN ? "PRT" : ""))),
          (info->iscircular ? "circular" : ""),           /* linear/circular */
          "UNC",                                          /* category */
          (info->date && info->date[0] ? info->date
                                       : "01-JAN-0000")); /* date */

  /*
   * Print the DEFINITION, ACCESSION, NID and ORGANISM lines.
   */
  if (info->description && info->description[0]) {
    fputs("DEFINITION  ", fp);
    putline(fp, 0, 12, info->description, 80, "            ", 0);
    fputc('.', fp);
    fputc('\n', fp);
  }

  /*
   * Here `flag` is the current output column of the ACCESSION line
   * (0 until the first accession is printed).
   */
  flag = 0;
  nid_start = nid_end = NULL;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'A', "ACC:")) {
        if (!flag) {
          fputs("ACCESSION   ", fp);
          flag = 12;
        }
        else if (flag + (s - t - 4 + 1) > 80) {
          fputs("\n            ", fp);
          flag = 12;
        }
        else {
          fputc(' ', fp);
          flag++;
        }

        fwrite(t+4, 1, s - t - 4, fp);
        flag += s - t - 4;
      }
      else if (mystreq(t, 'N', "NID:")) {
        nid_start = t;
        nid_end = s;
      }
      if (*s) s++;
    }
  }
  if (flag)
    fputc('\n', fp);

  if (nid_start != NULL) {
    fputs("NID         ", fp);
    fwrite(nid_start + 4, 1, nid_end - nid_start - 4, fp);
    fputc('\n', fp);
  }

  if (info->organism && info->organism[0]) {
    fputs("SOURCE      .\n  ORGANISM  ", fp);
    putline(fp, 0, 12, info->organism, 80, "            ", 0);
    fputc('\n', fp);
  }

  /*
   * Print the comment section:  actual comments, Cross-References, history.
   *
   * `flag` records whether the COMMENT keyword has been printed yet and
   * `flag2` is the output column of the current "SEQIO Refs:" line
   * (0 if no such line has been started).
   */
  flag = 0;
  if (info->comment && info->comment[0]) {
    fputs("COMMENT     ", fp);
    putline(fp, 0, 12, info->comment, 80, "            ", 1);
    fputc('\n', fp);
    flag = 1;
  }

  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;

      /*
       * Skip the identifier used on the LOCUS line (with or without its
       * three-character "GB:" prefix) and the accession and NID items.
       */
      if (!(id && (t == id || t+3 == id)) &&
          !mystreq(t, 'A', "ACC:") && !mystreq(t, 'N', "NID:")) {
        if (!flag2) {
          if (!flag) {
            fputs("COMMENT     SEQIO Refs: ", fp);
            flag = 1;
          }
          else
            fputs("            \n            SEQIO Refs: ", fp);
          flag2 = 24;
        }
        else if (flag2 + (s - t) + 1 >= 80) {
          fputs("\n            SEQIO Refs: ", fp);
          flag2 = 24;
        }
        else {
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;
      }
      if (*s) s++;
    }
  }
  if (flag2)
    fputc('\n', fp);

  if (info->history && info->history[0]) {
    if (!flag) {
      fputs("COMMENT     ", fp);
      putline(fp, 0, 12, info->history, 80, "            ", 1);
      fputc('\n', fp);
      flag = 1;
    }
    else {
      if (!flag2)
        fputs("            \n", fp);
      fputs("            ", fp);
      putline(fp, 0, 12, info->history, 80, "            ", 1);
      fputc('\n', fp);
    }
  }

  /*
   * Count the bases, and possibly print a base count.
   */
  if (alpha == DNA || alpha == RNA) {
    a = c = g = t2 = u = o = 0;
    for (i=0; i < seqlen; i++) {
      switch (seq[i]) {
      case 'a': case 'A':  a++; break;
      case 'c': case 'C':  c++; break;
      case 'g': case 'G':  g++; break;
      case 't': case 'T':  t2++; break;
      case 'u': case 'U':  u++; break;
      default:             o++;
      }
    }

    /*
     * An RNA sequence reports its U count, unless it contains T's and
     * no U's at all.
     */
    if (alpha == RNA && !(u == 0 && t2 != 0))
      fprintf(fp, "BASE COUNT  %7d a%7d c%7d g%7d u", a, c, g, u);
    else
      fprintf(fp, "BASE COUNT  %7d a%7d c%7d g%7d t", a, c, g, t2);
    if (o)
      fprintf(fp, "%7d others", o);
    fputc('\n', fp);
  }

  /*
   * Print the sequence.
   */
  fputs("ORIGIN\n", fp);

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    for (i=0,count=1; i < seqlen; count+=60) {
      /*
       * The position is right-justified in a 9-column field.  Continue
       * right after it, wherever it ends, so that positions above
       * 999999 are not partly overwritten by the first group.
       */
      s = buffer + sprintf(buffer, "%9d", count);
      for (j=0; i < seqlen && j < 6; j++) {
        *s++ = ' ';
        for (k=0; i < seqlen && k < 10; k++)
          *s++ = seq[i++];
      }
      *s++ = '\n';
      *s = '\0';
      fputs(buffer, fp);
    }
    fputs("//\n", fp);
  }

  return STATUS_OK;
}
14058 
14059 
/*
 * pir_putseq
 *
 * Write one entry to the output file using PIR-style line types:
 * ENTRY, TITLE, ORGANISM, DATE, ACCESSIONS, COMMENT, SUMMARY and
 * SEQUENCE, followed by the sequence and "///".  When the file's format
 * is FORMAT_GCG, the sequence part is written by putgcgseq() instead.
 *
 * The identifiers are taken from info->idlist, a '|'-separated list
 * whose items may carry prefixes such as "PIR:" and "ACC:".
 *
 * Always returns STATUS_OK.
 */
static int pir_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int i, j, count, pos, flag, flag2;
  char *s, *t, *id, *idend;
  FILE *fp = isfp->output_fp;

  /*
   * Print a non-accession identifier (a "pir" ID if possible) and
   * whether the sequence is complete or a fragment.
   */
  id = idend = NULL;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'P', "PIR:")) {
        id = t+4;
        idend = s;
        break;
      }
      else if (!id && !mystreq(t, 'A', "ACC:")) {
        id = t;
        idend = s;
      }
      if (*s) s++;
    }

    if (!id) {
      id = "UNKNWN";
      idend = id + 6;
    }
  }
  else {
    id = "UNKNWN";
    idend = id + 6;
  }

  /*
   * Print the ID using an explicit length.  The ID is either a slice of
   * the idlist or a string literal, so it must not be terminated by
   * writing a '\0' into it (string literals may be read-only).
   */
  fprintf(fp, "ENTRY            %.*s       #type %s\n",
          (int) (idend - id), id,
          (info->isfragment ? "fragment" : "complete"));

  if (info->description && info->description[0]) {
    fputs("TITLE            ", fp);
    pos = putline(fp, 0, 17, info->description, 80, "                 ", 1);
    if (info->organism && info->organism[0]) {
      pos = putline(fp, 1, pos, "-", 80, "                 ", 1);
      pos = putline(fp, 1, pos, info->organism, 80, "                 ", 1);
    }
    if (info->isfragment)
      pos = putline(fp, 1, pos, "(fragment)", 80, "                 ", 1);
    fputc('\n', fp);
  }

  if (info->organism && info->organism[0]) {
    fputs("ORGANISM         #formal_name ", fp);
    putline(fp, 0, 30, info->organism, 80, "                 ", 1);
    fputc('\n', fp);
  }

  if (info->date && info->date[0])
    fprintf(fp, "DATE             %s\n", info->date);

  /*
   * Here `flag` is the current output column of the ACCESSIONS line
   * (0 until the first accession is printed).
   */
  flag = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'A', "ACC:")) {
        if (!flag) {
          fputs("ACCESSIONS       ", fp);
          flag = 17;
        }
        else if (flag + (s - t - 4 + 3) > 80) {
          fputs(";\n                   ", fp);
          flag = 19;
        }
        else {
          fputs("; ", fp);
          flag += 2;
        }

        fwrite(t+4, 1, s - t - 4, fp);
        flag += s - t - 4;
      }
      if (*s) s++;
    }
  }
  if (flag)
    fputc('\n', fp);

  /*
   * Print the comment section:  actual comments, Cross-References, history.
   *
   * `flag` records whether the COMMENT keyword has been printed yet and
   * `flag2` is the output column of the current "SEQIO Refs:" line
   * (0 if no such line has been started).
   */
  flag = 0;
  if (info->comment && info->comment[0]) {
    fputs("COMMENT    ", fp);
    putline(fp, 0, 11, info->comment, 80, "           ", 1);
    fputc('\n', fp);
    flag = 1;
  }

  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;

      /*
       * Skip the identifier used on the ENTRY line (with or without its
       * four-character "PIR:" prefix) and the accession items.
       */
      if (!(id && (t == id || t+4 == id)) && !mystreq(t, 'A', "ACC:")) {
        if (!flag2) {
          if (!flag) {
            fputs("COMMENT    SEQIO Refs: ", fp);
            flag = 1;
          }
          else
            fputs("           \n           SEQIO Refs: ", fp);
          flag2 = 23;
        }
        else if (flag2 + (s - t) + 1 >= 80) {
          fputs("\n           SEQIO Refs: ", fp);
          flag2 = 23;
        }
        else {
          /*
           * Account for the separator.  The column must keep growing
           * here, otherwise the line is never wrapped.
           */
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;
      }
      if (*s) s++;
    }
  }
  if (flag2)
    fputc('\n', fp);

  if (info->history && info->history[0]) {
    if (!flag) {
      fputs("COMMENT    ", fp);
      putline(fp, 0, 11, info->history, 80, "           ", 1);
      fputc('\n', fp);
      flag = 1;
    }
    else {
      if (!flag2)
        fputs("           \n", fp);
      fputs("           ", fp);
      putline(fp, 0, 11, info->history, 80, "           ", 1);
      fputc('\n', fp);
    }
  }

  fprintf(fp, "SUMMARY          #length %d\n", seqlen);
  fputs("SEQUENCE\n", fp);

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    fputs("                5        10        15"
          "        20        25        30\n", fp);

    /*
     * 30 residues per line, each preceded by a blank, with the position
     * of the first residue at the start of the line.
     */
    for (i=0,count=1; i < seqlen; count+=30) {
      fprintf(fp, "%7d", count);
      for (j=0; i < seqlen && j < 30; j++) {
        fputc(' ', fp);
        fputc(seq[i++], fp);
      }
      fputc('\n', fp);
    }
    fputs("///\n", fp);
  }

  return STATUS_OK;
}
14229 
/*
 * embl_putseq
 *
 * Write one entry to the output file using EMBL-style line codes:
 * ID, AC, NI, DT, DE, OS, CC and SQ (with "XX" spacer lines), followed
 * by the sequence and "//".  When the file's format is FORMAT_GCG, the
 * sequence part is written by putgcgseq() instead.
 *
 * The identifiers are taken from info->idlist, a '|'-separated list
 * whose items may carry prefixes such as "EMBL:", "EPD:", "ACC:" and
 * "NID:".
 *
 * Always returns STATUS_OK.
 */
static int embl_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int i, j, k, a, c, g, t2, u, o, alpha, count, flag, flag2, pos, epd_flag;
  char *s, *t, *id, *idend, *nid_start, *nid_end;
  FILE *fp = isfp->output_fp;

  /*
   * Print a non-accession identifier (an "embl" ID if possible) and
   * whether the sequence is complete or a fragment.
   */
  epd_flag = 0;
  id = idend = NULL;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'E', "EMBL:")) {
        id = t+5;
        idend = s;
        break;
      }
      else if (mystreq(t, 'E', "EPD:")) {
        epd_flag = 1;
        id = t+4;
        idend = s;
        break;
      }
      else if (!id && !mystreq(t, 'A', "ACC:") && !mystreq(t, 'N', "NID:")) {
        id = t;
        idend = s;
      }
      if (*s) s++;
    }
    if (!id) {
      id = "Unknown";
      idend = id + 7;
    }
  }
  else {
    id = "Unknown";
    idend = id + 7;
  }

  /*
   * Print the ID using an explicit length.  The ID is either a slice of
   * the idlist or a string literal, so it must not be terminated by
   * writing a '\0' into it (string literals may be read-only).
   */
  fprintf(fp, "ID   %-10.*s", (int) (idend - id), id);

  alpha = (info->alphabet != UNKNOWN
             ? info->alphabet : guessalpha(seq, seqlen, NULL, NULL));

  fputs(" converted; ", fp);
  if (info->iscircular)
    fputs("circular ", fp);
  fputs((alpha == RNA ? "RNA"
                      : (alpha == DNA ? "DNA"
                                      : (alpha == PROTEIN ? "PRT" : "UNK"))),
        fp);
  fprintf(fp, "; %s; %d %s.\n", (!epd_flag ? "UNC" : "EPD"), seqlen,
          (alpha == DNA || alpha == RNA ? "BP"
                                        : (alpha == PROTEIN ? "AA" : "CH")));
  fputs("XX\n", fp);

  /*
   * Here `flag` is the current output column of the AC line (0 until
   * the first accession is printed).
   */
  flag = 0;
  nid_start = nid_end = NULL;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'A', "ACC:")) {
        if (!flag) {
          fputs("AC   ", fp);
          flag = 5;
        }
        else if (flag + (s - t - 4 + 2) > 80) {
          fputs("\nAC   ", fp);
          flag = 5;
        }
        else {
          fputc(' ', fp);
          flag++;
        }

        fwrite(t+4, 1, s - t - 4, fp);
        fputc(';', fp);
        flag += (s - t - 4) + 1;
      }
      else if (mystreq(t, 'N', "NID:")) {
        nid_start = t;
        nid_end = s;
      }
      if (*s) s++;
    }
  }
  if (flag)
    fputs("\nXX\n", fp);

  if (nid_start != NULL) {
    fputs("NI   ", fp);
    fwrite(nid_start + 4, 1, nid_end - nid_start - 4, fp);
    fputs("\nXX\n", fp);
  }

  if (info->date && info->date[0])
    fprintf(fp, "DT   %s\nXX\n", info->date);

  if (info->description && info->description[0]) {
    fputs("DE   ", fp);
    pos = putline(fp, 0, 5, info->description, 80, "DE   ", 0);
    if (info->isfragment)
      putline(fp, 1, pos, "(fragment)", 80, "DE   ", 0);
    fputs("\nXX\n", fp);
  }

  if (info->organism && info->organism[0]) {
    fputs("OS   ", fp);
    putline(fp, 0, 5, info->organism, 80, "OS   ", 0);
    fputs("\nXX\n", fp);
  }

  /*
   * Print the comment section:  actual comments, Cross-References, history.
   *
   * `flag` records whether any CC line has been printed yet and `flag2`
   * is the output column of the current "SEQIO Refs:" line (0 if no
   * such line has been started).
   */
  flag = 0;
  if (info->comment && info->comment[0]) {
    fputs("CC   ", fp);
    putline(fp, 0, 5, info->comment, 80, "CC   ", 1);
    fputc('\n', fp);
    flag = 1;
  }

  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;

      /*
       * Skip the identifier used on the ID line (with or without its
       * "EPD:" or "EMBL:" prefix) and the accession and NID items.
       */
      if (!(id && (t == id || t+4 == id || t+5 == id)) &&
          !mystreq(t, 'A', "ACC:") && !mystreq(t, 'N', "NID:")) {
        if (!flag2) {
          if (!flag) {
            fputs("CC   SEQIO Refs: ", fp);
            flag = 1;
          }
          else
            fputs("CC   \nCC   SEQIO Refs: ", fp);
          flag2 = 17;
        }
        else if (flag2 + (s - t) + 1 >= 80) {
          fputs("\nCC   SEQIO Refs: ", fp);
          flag2 = 17;
        }
        else {
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;
      }
      if (*s) s++;
    }
  }
  if (flag2)
    fputc('\n', fp);

  if (info->history && info->history[0]) {
    if (!flag) {
      fputs("CC   ", fp);
      putline(fp, 0, 5, info->history, 80, "CC   ", 1);
      fputc('\n', fp);
      flag = 1;
    }
    else {
      if (!flag2)
        fputs("CC   \n", fp);
      fputs("CC   ", fp);
      putline(fp, 0, 5, info->history, 80, "CC   ", 1);
      fputc('\n', fp);
    }
  }
  if (flag)
    fputs("XX\n", fp);

  /*
   * Print the SQ line, with a base count for nucleotide sequences.
   */
  fprintf(fp, "SQ   Sequence %d %s;", seqlen,
          (alpha == DNA || alpha == RNA ? "BP"
             : (alpha == PROTEIN ? "AA" : "CH")));
  if (alpha != DNA && alpha != RNA)
    fputc('\n', fp);
  else {
    a = c = g = t2 = u = o = 0;
    for (i=0; i < seqlen; i++) {
      switch (seq[i]) {
      case 'a': case 'A':  a++; break;
      case 'c': case 'C':  c++; break;
      case 'g': case 'G':  g++; break;
      case 't': case 'T':  t2++; break;
      case 'u': case 'U':  u++; break;
      default:             o++;
      }
    }

    /*
     * An RNA sequence reports its U count, unless it contains T's and
     * no U's at all.
     */
    if (alpha == RNA && !(u == 0 && t2 != 0))
      fprintf(fp, " %d A; %d C; %d G; %d U; %d other;\n", a, c, g, u, o);
    else
      fprintf(fp, " %d A; %d C; %d G; %d T; %d other;\n", a, c, g, t2, o);
  }

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    /*
     * Six groups of ten characters per line.  The last line is padded
     * with blanks so that the running count at the end of each line
     * stays in the same column.
     */
    for (i=0,count=60; i < seqlen; count+=60) {
      fputs("    ", fp);
      for (j=0; j < 6; j++) {
        fputc(' ', fp);
        for (k=0; k < 10; k++)
          fputc((i < seqlen ? seq[i++] : ' '), fp);
      }

      fprintf(fp, "%10d\n", (i < seqlen ? count : seqlen));
    }
    fputs("//\n", fp);
  }

  return STATUS_OK;
}
14450 
14451 
/*
 * sprot_putseq
 *
 * Write one entry to the output file using Swiss-Prot-style line codes:
 * ID, AC, DT, DE, OS, CC and SQ, followed by the sequence and "//".
 * When the file's format is FORMAT_GCG, the sequence part is written by
 * putgcgseq() instead.
 *
 * The identifiers are taken from info->idlist, a '|'-separated list
 * whose items may carry prefixes such as "SP:" and "ACC:".
 *
 * Always returns STATUS_OK.
 */
static int sprot_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int i, j, k, alpha, flag, flag2, pos;
  char *s, *t, *id, *idend;
  FILE *fp = isfp->output_fp;

  /*
   * Print a non-accession identifier (an "sp" ID if possible) and
   * whether the sequence is complete or a fragment.
   */
  id = idend = NULL;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'S', "SP:")) {
        id = t+3;
        idend = s;
        break;
      }
      else if (!id && !mystreq(t, 'A', "ACC:")) {
        id = t;
        idend = s;
      }
      if (*s) s++;
    }
    if (!id) {
      id = "Unknown";
      idend = id + 7;
    }
  }
  else {
    id = "Unknown";
    idend = id + 7;
  }

  /*
   * Print the ID using an explicit length.  The ID is either a slice of
   * the idlist or a string literal, so it must not be terminated by
   * writing a '\0' into it (string literals may be read-only).
   */
  fprintf(fp, "ID   %-10.*s", (int) (idend - id), id);

  alpha = (info->alphabet != UNKNOWN ? info->alphabet
                                     : guessalpha(seq, seqlen, NULL, NULL));

  fputs("  CONVERTED;  ", fp);
  if (info->iscircular)
    fputs("circular ", fp);
  else
    fputs("    ", fp);

  fputs((alpha == RNA ? "RNA"
                      : (alpha == DNA ? "DNA"
                                      : (alpha == PROTEIN ? "PRT" : "UNK"))),
        fp);
  fprintf(fp, "; %5d %s.\n", seqlen, (alpha == DNA || alpha == RNA ? "BP"
                                        : (alpha == PROTEIN ? "AA" : "CH")));

  /*
   * Here `flag` is the current output column of the AC line (0 until
   * the first accession is printed).
   */
  flag = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (mystreq(t, 'A', "ACC:")) {
        if (!flag) {
          fputs("AC   ", fp);
          flag = 5;
        }
        else if (flag + (s - t - 4) + 2 > 80) {
          fputs("\nAC   ", fp);
          flag = 5;
        }
        else {
          fputc(' ', fp);
          flag++;
        }

        fwrite(t+4, 1, s - t - 4, fp);
        fputc(';', fp);
        flag += (s - t - 4) + 1;
      }
      if (*s) s++;
    }
  }
  if (flag)
    fputc('\n', fp);

  if (info->date && info->date[0])
    fprintf(fp, "DT   %s\n", info->date);

  if (info->description && info->description[0]) {
    fputs("DE   ", fp);
    pos = putline(fp, 0, 5, info->description, 75, "DE   ", 0);
    if (info->isfragment)
      putline(fp, 1, pos, "(FRAGMENT)", 75, "DE   ", 0);
    fputc('.', fp);
    fputc('\n', fp);
  }

  if (info->organism && info->organism[0]) {
    fputs("OS   ", fp);
    putline(fp, 0, 5, info->organism, 75, "OS   ", 0);
    fputc('.', fp);
    fputc('\n', fp);
  }

  /*
   * Print the comment section:  actual comments, Cross-References, history.
   *
   * `flag` records whether any CC line has been printed yet and `flag2`
   * is the output column of the current "SEQIO Refs:" line (0 if no
   * such line has been started).
   */
  flag = 0;
  if (info->comment && info->comment[0]) {
    fputs("CC   ", fp);
    putline(fp, 0, 5, info->comment, 75, "CC   ", 1);
    fputc('\n', fp);
    flag = 1;
  }

  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;

      /*
       * Skip the identifier used on the ID line (with or without its
       * three-character "SP:" prefix) and the accession items.
       */
      if (!(id && (t == id || t+3 == id)) && !mystreq(t, 'A', "ACC:")) {
        if (!flag2) {
          if (!flag) {
            fputs("CC   SEQIO Refs: ", fp);
            flag = 1;
          }
          else
            fputs("CC   \nCC   SEQIO Refs: ", fp);
          flag2 = 17;
        }
        else if (flag2 + (s - t) + 1 >= 80) {
          fputs("\nCC   SEQIO Refs: ", fp);
          flag2 = 17;
        }
        else {
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;
      }
      if (*s) s++;
    }
  }
  if (flag2)
    fputc('\n', fp);

  if (info->history && info->history[0]) {
    if (!flag) {
      fputs("CC   ", fp);
      putline(fp, 0, 5, info->history, 75, "CC   ", 1);
      fputc('\n', fp);
      flag = 1;
    }
    else {
      if (!flag2)
        fputs("CC   \n", fp);
      fputs("CC   ", fp);
      putline(fp, 0, 5, info->history, 75, "CC   ", 1);
      fputc('\n', fp);
    }
  }

  fprintf(fp, "SQ   SEQUENCE   %d %s;\n", seqlen,
          (alpha == DNA || alpha == RNA ? "BP"
             : (alpha == PROTEIN ? "AA" : "CH")));

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    /*
     * Six groups of ten characters per line, each group preceded by a
     * blank.  No position numbers are printed.
     */
    i = 0;
    while (i < seqlen) {
      fputs("    ", fp);
      for (j=0; i < seqlen && j < 6; j++) {
        fputc(' ', fp);
        for (k=0; i < seqlen && k < 10; k++)
          fputc(seq[i++], fp);
      }
      fputc('\n', fp);
    }
    fputs("//\n", fp);
  }

  return STATUS_OK;
}
14633 
/*
 * nbrf_putseq
 *
 * Write one entry to isfp->output_fp using the layout built below:
 *
 *     >CC;id1|id2...          CC is a two letter code picked from the
 *                             alphabet and the fragment/circular flags
 *     oneline description     (written by put_oneline)
 *     sequence lines and "*"  (skipped when isfp->format is FORMAT_GCG)
 *     C;Date: ...
 *     C;Accession: acc1; acc2 ...
 *     C;Comment: ...          (comment text, then history text)
 *
 * info->idlist is a '|' separated list.  Entries for which
 * mystreq(entry, 'A', "ACC:") succeeds are left out of the header line
 * and are listed on the "C;Accession:" lines instead, with their first
 * four characters dropped.  When isfp->format is FORMAT_GCG the sequence
 * is written last, by putgcgseq.
 *
 * Always returns STATUS_OK.
 */
static int nbrf_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int alpha, align_flag, truelen, wrote_id, column, have_comment;
  char *start, *end;
  FILE *out = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  /*
   * Header line:  '>', the two letter code, ';' and the identifiers.
   */
  fputc('>', out);
  if (alpha == PROTEIN)
    fputs((info->isfragment ? "F1" : "P1"), out);
  else if (alpha == DNA)
    fputs((info->iscircular ? "DC" : "DL"), out);
  else if (alpha == RNA)
    fputs((info->iscircular ? "RC" : "RL"), out);
  else
    fputs("XX", out);
  fputc(';', out);

  wrote_id = 0;
  if (info->idlist && info->idlist[0]) {
    end = info->idlist;
    while (*end) {
      /* [start, end) delimits the next '|' separated entry. */
      start = end;
      while (*end && *end != '|')
        end++;

      if (!mystreq(start, 'A', "ACC:")) {
        if (wrote_id)
          fputc('|', out);

        fwrite(start, 1, end - start, out);
        wrote_id = 1;
      }

      if (*end)
        end++;
    }
  }
  if (!wrote_id)
    fputs("Unknown", out);
  fputc('\n', out);

  /*
   * The oneline description, followed by the sequence itself.
   */
  put_oneline(out, info, truelen, alpha, NULL);
  fputc('\n', out);

  if (isfp->format != FORMAT_GCG) {
    simple_putseq(out, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc('*', out);
    fputc('\n', out);
  }

  /*
   * Trailing "C;" lines:  date, accession numbers, comment and history.
   */
  if (info->date && info->date[0])
    fprintf(out, "C;Date: %s\n", info->date);

  /*
   * `column' is zero until the first accession has been written; after
   * that it tracks how many characters are on the current output line.
   */
  column = 0;
  if (info->idlist && info->idlist[0]) {
    end = info->idlist;
    while (*end) {
      start = end;
      while (*end && *end != '|')
        end++;

      if (mystreq(start, 'A', "ACC:")) {
        if (!column) {
          fputs("C;Accession: ", out);
          column = 13;
        }
        else if (column + (end - start - 4) + 2 >= 80) {
          /* The next accession would not fit: start a new line. */
          fputs("\nC;Accession: ", out);
          column = 13;
        }
        else {
          fputs("; ", out);
          column += 2;
        }

        fwrite(start + 4, 1, end - start - 4, out);
        column += end - start - 4;
      }

      if (*end)
        end++;
    }
  }
  if (column)
    fputc('\n', out);

  have_comment = 0;
  if (info->comment && info->comment[0]) {
    fputs("C;Comment: ", out);
    putline(out, 0, 11, info->comment, 80, "C;Comment: ", 1);
    fputc('\n', out);
    have_comment = 1;
  }

  if (info->history && info->history[0]) {
    /* An empty comment line separates the comment from the history. */
    if (have_comment)
      fputs("C;Comment: \n", out);
    fputs("C;Comment: ", out);
    putline(out, 0, 11, info->history, 80, "C;Comment: ", 1);
    fputc('\n', out);
  }

  if (isfp->format == FORMAT_GCG)
    putgcgseq(out, seq, seqlen, info);

  return STATUS_OK;
}
14750 
/*
 * nbrfold_putseq
 *
 * Write one entry to isfp->output_fp using the layout built below:
 *
 *     >CC;firstid             CC is a two letter code picked from the
 *                             alphabet and the fragment/circular flags
 *     [~accession ]oneline    the "~..." part appears only when the second
 *                             idlist entry satisfies
 *                             mystreq(entry, 'A', "ACC:")
 *     sequence                putgcgseq when isfp->format is FORMAT_GCG,
 *                             otherwise simple_putseq followed by "*"
 *
 * "Unknown" stands in for the identifier when info->idlist is empty.
 *
 * Always returns STATUS_OK.
 */
static int nbrfold_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                          SEQINFO *info)
{
  int alpha, align_flag, truelen;
  char *start, *end;
  FILE *out = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  fputc('>', out);
  if (alpha == PROTEIN)
    fputs((info->isfragment ? "F1" : "P1"), out);
  else if (alpha == DNA)
    fputs((info->iscircular ? "DC" : "DL"), out);
  else if (alpha == RNA)
    fputs((info->iscircular ? "RC" : "RL"), out);
  else
    fputs("XX", out);
  fputc(';', out);

  if (info->idlist && info->idlist[0]) {
    /* Only the first '|' separated entry goes on the header line. */
    start = end = info->idlist;
    while (*end && *end != '|')
      end++;
    fwrite(start, 1, end - start, out);
    fputc('\n', out);

    if (*end && mystreq(end+1, 'A', "ACC:")) {
      /* The second entry, minus its first four characters. */
      start = ++end;
      while (*end && *end != '|')
        end++;
      fputc('~', out);
      fwrite(start + 4, 1, end - start - 4, out);
      fputc(' ', out);
    }
  }
  else
    fputs("Unknown\n", out);

  put_oneline(out, info, truelen, alpha, NULL);
  fputc('\n', out);

  if (isfp->format != FORMAT_GCG) {
    simple_putseq(out, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc('*', out);
    fputc('\n', out);
  }
  else
    putgcgseq(out, seq, seqlen, info);

  return STATUS_OK;
}
14808 
/*
 * fasta_putseq
 *
 * Write one entry to isfp->output_fp using the layout built below:
 *
 *     >primaryid oneline      primaryid is the first idlist entry plus,
 *                             when it satisfies
 *                             mystreq(entry, 'A', "ACC:"), the second one
 *     ;                       (only when there is a comment)
 *     ;comment text
 *     ;SEQIO Refs: id|id...   the remaining idlist entries, wrapped
 *     ;history text
 *     sequence                putgcgseq when isfp->format is FORMAT_GCG,
 *                             otherwise simple_putseq and a newline
 *
 * Always returns STATUS_OK.
 */
static int fasta_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int column, alpha, align_flag, truelen;
  char *start, *end, *rest;
  FILE *out = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  fputc('>', out);

  /*
   * The primary identifier goes on the header line; `rest' remembers
   * where the other identifiers start, if there are any.
   */
  rest = NULL;
  if (info->idlist && info->idlist[0]) {
    start = end = info->idlist;
    while (*end && *end != '|')
      end++;
    if (*end && mystreq(end+1, 'A', "ACC:")) {
      end++;
      while (*end && *end != '|')
        end++;
    }
    fwrite(start, 1, end - start, out);
    fputc(' ', out);

    if (*end)
      rest = end + 1;
  }

  put_oneline(out, info, truelen, alpha, NULL);
  fputc('\n', out);

  if (info->comment && info->comment[0]) {
    fputs(";\n", out);
    fputc(';', out);
    putline(out, 0, 0, info->comment, 0, ";", 0);
    fputc('\n', out);
  }

  /*
   * `column' is zero until the first reference has been written; after
   * that it tracks how many characters are on the current output line.
   */
  column = 0;
  if (rest) {
    end = rest;
    while (*end) {
      start = end;
      while (*end && *end != '|')
        end++;

      if (!column) {
        fputs(";SEQIO Refs: ", out);
        column = 13;
      }
      else if (column + (end - start) + 1 >= 80) {
        /* The next reference would not fit: start a new line. */
        fputs("\n;SEQIO Refs: ", out);
        column = 13;
      }
      else {
        fputc('|', out);
        column++;
      }

      fwrite(start, 1, end - start, out);
      column += end - start;

      if (*end)
        end++;
    }
  }
  if (column)
    fputc('\n', out);

  if (info->history && info->history[0]) {
    /* A bare ";" line precedes the history unless refs were written. */
    if (!column)
      fputs(";\n", out);
    fputc(';', out);
    putline(out, 0, 0, info->history, 0, ";", 0);
    fputc('\n', out);
  }

  if (isfp->format != FORMAT_GCG) {
    simple_putseq(out, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc('\n', out);
  }
  else
    putgcgseq(out, seq, seqlen, info);

  return STATUS_OK;
}
14889 
/*
 * fastaold_putseq
 *
 * Write one entry to isfp->output_fp as a single header line followed by
 * the sequence:
 *
 *     >primaryid oneline
 *     sequence
 *
 * primaryid is the first '|' separated entry of info->idlist plus, when
 * it satisfies mystreq(entry, 'A', "ACC:"), the second one.  The other
 * identifiers, the comment and the history are not written.  The
 * sequence is written by putgcgseq when isfp->format is FORMAT_GCG and
 * by simple_putseq (plus a newline) otherwise.
 *
 * Always returns STATUS_OK.
 */
static int fastaold_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                           SEQINFO *info)
{
  int alpha, align_flag, truelen;
  char *start, *end;
  FILE *out = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  fputc('>', out);

  if (info->idlist && info->idlist[0]) {
    start = end = info->idlist;
    while (*end && *end != '|')
      end++;
    if (*end && mystreq(end+1, 'A', "ACC:")) {
      end++;
      while (*end && *end != '|')
        end++;
    }
    fwrite(start, 1, end - start, out);
    fputc(' ', out);
  }

  put_oneline(out, info, truelen, alpha, NULL);
  fputc('\n', out);

  if (isfp->format != FORMAT_GCG) {
    simple_putseq(out, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc('\n', out);
  }
  else
    putgcgseq(out, seq, seqlen, info);

  return STATUS_OK;
}
14923 
/*
 * stanford_putseq
 *
 * Write one entry to isfp->output_fp using the layout built below:
 *
 *     ;comment text
 *     ;                       (separator, only after a comment)
 *     ;SEQIO Refs: id|id...   the idlist entries after the primary one
 *     ;history text
 *     primaryid oneline
 *     sequence                putgcgseq when isfp->format is FORMAT_GCG,
 *                             otherwise simple_putseq followed by '2'
 *                             (info->iscircular set) or '1'
 *
 * primaryid is the first '|' separated entry of info->idlist plus, when
 * it satisfies mystreq(entry, 'A', "ACC:"), the second one.  If none of
 * the ';' lines is produced, a single bare ";" line is written instead.
 *
 * Always returns STATUS_OK.
 */
static int stanford_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                           SEQINFO *info)
{
  int flag, flag2, alpha, align_flag, truelen;
  char *s, *t;
  FILE *fp = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  /*
   * flag records whether any ';' line has been written so far.
   */
  flag = 0;
  if (info->comment && info->comment[0]) {
    fputc(';', fp);
    putline(fp, 0, 0, info->comment, 0, ";", 0);
    fputc('\n', fp);
    flag = 1;
  }

  /*
   * flag2 is zero until the first reference has been written; after
   * that it tracks how many characters are on the current output line.
   */
  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s && *s != '|'; s++) ;
    if (*s && mystreq(s+1, 'A', "ACC:"))
      for (s++; *s && *s != '|'; s++) ;

    /*
     * s now rests on the '|' that ended the primary identifier (or on
     * the terminating NUL).  Step over that separator so the loop below
     * starts on the first remaining identifier; starting on the '|'
     * itself would yield an empty first token and a stray leading '|'
     * on the "SEQIO Refs" line.
     */
    if (*s)
      s++;

    if (*s) {
      if (flag)
        fputs(";\n", fp);

      while (*s) {
        for (t=s; *s && *s != '|'; s++) ;

        if (!flag2) {
          fputs(";SEQIO Refs: ", fp);
          flag2 = 13;
        }
        else if (flag2 + (s - t) + 1 >= 80) {
          /* The next reference would not fit: start a new line. */
          fputs("\n;SEQIO Refs: ", fp);
          flag2 = 13;
        }
        else {
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;

        if (*s) s++;
      }

      if (flag2)
        fputc('\n', fp);
      flag = 1;
    }
  }

  if (info->history && info->history[0]) {
    /* Separate the history from a comment when no refs came between. */
    if (flag && !flag2)
      fputs(";\n", fp);
    fputc(';', fp);
    putline(fp, 0, 0, info->history, 0, ";", 0);
    fputc('\n', fp);
    flag = 1;
  }
  if (!flag)
    fputs(";\n", fp);

  /*
   * The identifier/description line.
   */
  if (info->idlist && info->idlist[0]) {
    for (t=s=info->idlist; *s && *s != '|'; s++) ;
    if (*s && mystreq(s+1, 'A', "ACC:"))
      for (s++; *s && *s != '|'; s++) ;
    fwrite(t, 1, s - t, fp);
    fputc(' ', fp);
  }

  put_oneline(fp, info, truelen, alpha, NULL);
  fputc('\n', fp);

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    simple_putseq(fp, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc((info->iscircular ? '2' : '1'), fp);
    fputc('\n', fp);
  }

  return STATUS_OK;
}
15013 
/*
 * stanfordold_putseq
 *
 * Write one entry to isfp->output_fp using the layout built below:
 *
 *     ;SEQIO Refs: rest       the idlist text after the primary id; or,
 *                             when there is none, the first line of the
 *                             comment; or a bare ";"
 *     primaryid oneline
 *     sequence                putgcgseq when isfp->format is FORMAT_GCG,
 *                             otherwise simple_putseq followed by '2'
 *                             (info->iscircular set) or '1'
 *
 * primaryid is the first '|' separated entry of info->idlist plus, when
 * it satisfies mystreq(entry, 'A', "ACC:"), the second one.
 *
 * Always returns STATUS_OK.
 */
static int stanfordold_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                              SEQINFO *info)
{
  int alpha, wrote_refs, align_flag, truelen;
  char *s, *t;
  FILE *fp = isfp->output_fp;

  alpha = guessalpha(seq, seqlen, &align_flag, &truelen);
  if (info->alphabet != UNKNOWN)
    alpha = info->alphabet;

  /*
   * Exactly one ';' line is written:  the references if there are any,
   * else the first comment line, else an empty one.
   */
  wrote_refs = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s && *s != '|'; s++) ;
    if (*s && mystreq(s+1, 'A', "ACC:"))
      for (s++; *s && *s != '|'; s++) ;

    if (*s) {
      fprintf(fp, ";SEQIO Refs: %s\n", s + 1);
      wrote_refs = 1;
    }
  }

  if (!wrote_refs) {
    if (info->comment && info->comment[0]) {
      for (t=s=info->comment; *s && *s != '\n'; s++) ;
      fputc(';', fp);
      fwrite(t, 1, s - t, fp);
      fputc('\n', fp);
    }
    else
      fputs(";\n", fp);
  }

  if (info->idlist && info->idlist[0]) {
    for (t=s=info->idlist; *s && *s != '|'; s++) ;
    if (*s && mystreq(s+1, 'A', "ACC:"))
      for (s++; *s && *s != '|'; s++) ;
    fwrite(t, 1, s - t, fp);
    fputc(' ', fp);
  }

  /*
   * Pass the length reported by guessalpha (truelen), as the other
   * put_oneline callers among the putseq functions do, rather than the
   * raw seqlen.
   */
  put_oneline(fp, info, truelen, alpha, NULL);
  fputc('\n', fp);

  if (isfp->format == FORMAT_GCG)
    putgcgseq(fp, seq, seqlen, info);
  else {
    simple_putseq(fp, seq, seqlen, alpha, align_flag, isfp->prettyflag);
    fputc((info->iscircular ? '2' : '1'), fp);
    fputc('\n', fp);
  }

  return STATUS_OK;
}
15068 
/*
 * gcg_putseq
 *
 * Write a single entry to isfp->output_fp.  Only the first call on a
 * file succeeds:  isfp->entry_count is incremented on every call, and
 * any call after the first sets E_EOF and returns STATUS_EOF.
 *
 * When isfp->gcg_subformat is not FORMAT_UNKNOWN the entry is handed to
 * that format's putseq_fn from file_table and its status is returned.
 * Otherwise the sections below are written, each followed by a blank
 * line separator when another section follows, and then the sequence is
 * written by putgcgseq:
 *
 *     Identifiers:            all idlist entries, '|' separated, wrapped
 *     Description:
 *     Organism:
 *     Comments:               comment text, then history text
 *
 * Returns STATUS_OK in that case.
 */
static int gcg_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int started, have_comment;
  char *start, *end;
  FILE *out = isfp->output_fp;

  isfp->entry_count++;
  if (isfp->entry_count != 1) {
    set_error(E_EOF);
    return STATUS_EOF;
  }

  if (isfp->gcg_subformat != FORMAT_UNKNOWN)
    return
      (*file_table[isfp->gcg_subformat].putseq_fn)(isfp, seq, seqlen, info);

  /*
   * The identifiers.  While they are being written `started' holds the
   * number of characters on the current line; from then on it is only
   * tested for being non-zero (something has already been written).
   */
  started = 0;
  if (info->idlist && info->idlist[0]) {
    end = info->idlist;
    while (*end) {
      start = end;
      while (*end && *end != '|')
        end++;

      if (!started) {
        fputs("Identifiers:\n    ", out);
        started = 4;
      }
      else if (started + (end - start) + 1 >= 80) {
        /* The next identifier would not fit: start a new line. */
        fputs("\n    ", out);
        started = 4;
      }
      else {
        fputc('|', out);
        started++;
      }

      fwrite(start, 1, end - start, out);
      started += end - start;

      if (*end)
        end++;
    }
  }
  if (started)
    fputc('\n', out);

  if (info->description && info->description[0]) {
    if (started)
      fputc('\n', out);
    fputs("Description:\n    ", out);
    putline(out, 0, 4, info->description, 80, "    ", 0);
    fputc('\n', out);
    started = 1;
  }

  if (info->organism && info->organism[0]) {
    if (started)
      fputc('\n', out);
    fputs("Organism:\n    ", out);
    putline(out, 0, 4, info->organism, 80, "    ", 0);
    fputc('\n', out);
    started = 1;
  }

  /*
   * Comment and history.  The "Comments:" heading is tied to the comment
   * text; history on its own is written without a heading.
   */
  have_comment = 0;
  if (info->comment && info->comment[0]) {
    if (started)
      fputc('\n', out);
    fputs("Comments:\n    ", out);
    putline(out, 0, 0, info->comment, 0, "    ", 0);
    fputc('\n', out);
    have_comment = 1;
  }

  if (info->history && info->history[0]) {
    if (have_comment)
      fputc('\n', out);
    fputs("    ", out);
    putline(out, 0, 0, info->history, 0, "    ", 0);
    fputc('\n', out);
  }

  putgcgseq(out, seq, seqlen, info);

  return STATUS_OK;
}
15159 
/*
 * msf_putseq
 *
 * Store a copy of one sequence and its identifier in the isfp->malign_*
 * arrays.  Nothing is written to the output file here.
 *
 * The three parallel arrays (malign_seqs, malign_ids, malign_seqlens)
 * start with room for 32 entries and double in size whenever they are
 * full.  The identifier comes from seqfoneline(info, buffer, 10, 1), or
 * is "Unknown" when that returns 0.
 *
 * Returns STATUS_OK on success.  On an allocation failure everything
 * held in the malign_* arrays is released, malign_count and malign_size
 * are reset to 0, and memory_error's action returns STATUS_FATAL.
 */
static int msf_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int i, count, len, size, failed;
  char buffer[32];
  char **seqs, **ids;
  int *seqlens;

  if (isfp->malign_count == isfp->malign_size) {
    if (isfp->malign_size == 0) {
      size = 32;
      seqs = (char **) malloc(size * sizeof(char *));
      ids = (char **) malloc(size * sizeof(char *));
      seqlens = (int *) malloc(size * sizeof(int));
      if (seqs == NULL || ids == NULL || seqlens == NULL) {
        /* Release whichever of the three did get allocated. */
        free(seqs);
        free(ids);
        free(seqlens);
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_seqs = seqs;
      isfp->malign_ids = ids;
      isfp->malign_seqlens = seqlens;
      isfp->malign_size = size;
    }
    else {
      /*
       * Grow each array through a temporary.  A failed realloc leaves
       * the old block allocated, so the structure field must keep
       * pointing at it for the cleanup below to be able to free it.
       */
      size = isfp->malign_size + isfp->malign_size;
      failed = 0;

      seqs = (char **) realloc(isfp->malign_seqs, size * sizeof(char *));
      if (seqs != NULL)
        isfp->malign_seqs = seqs;
      else
        failed = 1;

      ids = (char **) realloc(isfp->malign_ids, size * sizeof(char *));
      if (ids != NULL)
        isfp->malign_ids = ids;
      else
        failed = 1;

      seqlens = (int *) realloc(isfp->malign_seqlens, size * sizeof(int));
      if (seqlens != NULL)
        isfp->malign_seqlens = seqlens;
      else
        failed = 1;

      if (failed) {
        for (i=0; i < isfp->malign_count; i++) {
          free(isfp->malign_seqs[i]);
          free(isfp->malign_ids[i]);
        }
        free(isfp->malign_seqs);
        free(isfp->malign_ids);
        free(isfp->malign_seqlens);
        isfp->malign_seqs = NULL;
        isfp->malign_ids = NULL;
        isfp->malign_seqlens = NULL;
        isfp->malign_count = isfp->malign_size = 0;
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_size = size;
    }
  }

  count = isfp->malign_count;
  isfp->malign_seqs[count] = mystrdup2(seq, seq+seqlen);
  isfp->malign_seqlens[count] = seqlen;
  len = seqfoneline(info, buffer, 10, 1);
  isfp->malign_ids[count] = (len ? mystrdup(buffer) : mystrdup("Unknown"));

  if (isfp->malign_seqs[count] == NULL || isfp->malign_ids[count] == NULL) {
    /*
     * The loop runs through index `count' inclusive so that whichever
     * of the two new strings was successfully duplicated is freed too.
     */
    for (i=0; i <= count; i++) {
      free(isfp->malign_seqs[i]);
      free(isfp->malign_ids[i]);
    }
    free(isfp->malign_seqs);
    free(isfp->malign_ids);
    free(isfp->malign_seqlens);
    isfp->malign_seqs = NULL;
    isfp->malign_ids = NULL;
    isfp->malign_seqlens = NULL;
    isfp->malign_count = isfp->malign_size = 0;
    memory_error(1, return STATUS_FATAL);
  }

  isfp->malign_count++;
  return STATUS_OK;
}
15238 
/*
 * msf_putseqend
 *
 * Write the sequences collected in the isfp->malign_* arrays to
 * isfp->output_fp as one multiple alignment, using the layout built
 * below:
 *
 *     PileUp
 *      <file name>  MSF: <maxlen>  Type: N|P  <date>  Check: <n>  ..
 *      Name: <id>  Len: <maxlen>  Check: <n>  Weight:  1.00   (one per seq)
 *     //
 *     blocks of up to 50 columns, each with a ruler line giving the
 *     first and last column number and one line per sequence
 *
 * maxlen is the longest stored sequence length.  Shorter sequences are
 * padded with '.', and '-' characters in the sequences are written as
 * '.'.  When some but not all identifiers satisfy is_idprefix, the text
 * up to and including the first ':' is dropped from those that do.
 *
 * Returns STATUS_OK; does nothing when no sequences have been stored.
 */
static int msf_putseqend(INTSEQFILE *isfp)
{
  int i, j, k, len, len2, count, maxlen, numseqs, check, alpha;
  int align_flag, all_idprefix, any_idprefix, stripflag;
  char *seq, *s, *t, *date, buffer[32];
  FILE *fp;

  if (isfp->malign_count == 0)
    return STATUS_OK;

  fp = isfp->output_fp;
  numseqs = isfp->malign_count;
  all_idprefix = 1;
  any_idprefix = 0;
  for (i=0,maxlen=0; i < numseqs; i++) {
    if (maxlen < isfp->malign_seqlens[i])
      maxlen = isfp->malign_seqlens[i];

    if (is_idprefix(isfp->malign_ids[i]))
      any_idprefix = 1;
    else
      all_idprefix = 0;
  }
  stripflag = (any_idprefix && !all_idprefix);

  fputs("PileUp\n\n\n", fp);

  /*
   * Print the infoline.
   *
   * t is left pointing just past the last directory separator in the
   * file name (or at its start when there is none), so a name whose
   * only separator is its first character is also reduced to its
   * final component.
   */
  for (t=s=isfp->filename; *s; s++)
    if (*s == dirch)
      t = s + 1;
  fputc(' ', fp);
  fputs(t, fp);
  fputs("  ", fp);
  fprintf(fp, "MSF: %d  ", maxlen);

  alpha = guessalpha(isfp->malign_seqs[0], isfp->malign_seqlens[0],
                     &align_flag, NULL);

  if (alpha == RNA || alpha == DNA)
    fputs("Type: N  ", fp);
  else if (alpha == PROTEIN)
    fputs("Type: P  ", fp);

  /*
   * The date is printed only if the three characters at date+3 match
   * one of months[1..12].
   */
  date = get_today();
  for (i=1; i <= 12; i++)
    if (myncasecmp(date+3, months[i], 3) == 0)
      break;
  if (i <= 12)
    fprintf(fp, "%s %c%c, %s %s  ", gcg_full_months[i], date[0], date[1],
            date+7, date+12);

  /* The overall check value chains through every sequence. */
  for (i=0,check=0; i < numseqs; i++)
    check = gcg_checksum2(isfp->malign_seqs[i], isfp->malign_seqlens[i],
                          maxlen, check);
  fprintf(fp, "Check: %d  ..\n", check);

  /*
   * Print the header lines.
   */
  fputc('\n', fp);
  for (i=0; i < numseqs; i++) {
    s = isfp->malign_ids[i];
    if (stripflag && is_idprefix(s))
      while (*s++ != ':') ;

    fprintf(fp, " Name: %-15s  Len: %5d  Check: %4d  Weight:  1.00\n",
            s, maxlen, gcg_checksum2(isfp->malign_seqs[i],
                                     isfp->malign_seqlens[i], maxlen, 0));
  }
  fputs("\n//\n\n", fp);

  /*
   * Print the sequence lines.
   */
  for (j=0; j < maxlen; j+=50) {
    /* Ruler line:  the first column number of this block ... */
    fputs("            ", fp);
    sprintf(buffer, "%d", j+1);
    len = strlen(buffer);
    fputs(buffer, fp);

    count = (j + 50 <= maxlen ? 50 : maxlen - j);
    sprintf(buffer, "%d", (j + count));
    len2 = strlen(buffer);

    /*
     * ... and the last one, right justified over the block.  count is
     * turned into the printed width of the block:  its characters plus
     * one blank between each group of ten.  The second number is left
     * out when both numbers do not fit in that width.
     */
    count += count / 10 + (count % 10 ? 0 : -1);
    if (count >= len + len2) {
      for (i=0; i < count - len - len2; i++)
        fputc(' ', fp);
      fputs(buffer, fp);
    }
    fputc('\n', fp);

    for (i=0; i < numseqs; i++) {
      s = isfp->malign_ids[i];
      if (is_idprefix(s) && stripflag)
        while (*s++ != ':') ;
      fprintf(fp, "%-10s  ", s);

      /* len is how much of this sequence remains at column j. */
      len = isfp->malign_seqlens[i] - j;
      seq = isfp->malign_seqs[i] + (len > 0 ? j : 0);
      for (count=0,k=0; count < 50 && j + count < maxlen; count++) {
        if (k++ == 10) {
          fputc(' ', fp);
          k = 1;
        }
        if (count < len) {
          fputc((*seq == '-' ? '.' : *seq), fp);
          seq++;
        }
        else
          fputc('.', fp);
      }
      fputc('\n', fp);
    }
    fputc('\n', fp);
  }

  return STATUS_OK;
}
15361 
15362 
/*
 * phylip_putseq
 *
 * Store a copy of one sequence and its identifier in the isfp->malign_*
 * arrays.  Nothing is written to the output file here.
 *
 * The three parallel arrays (malign_seqs, malign_ids, malign_seqlens)
 * start with room for 32 entries and double in size whenever they are
 * full.  The identifier comes from seqfoneline(info, buffer, 10, 1), or
 * is "Unknown" when that returns 0.
 *
 * Returns STATUS_OK on success.  On an allocation failure everything
 * held in the malign_* arrays is released, malign_count and malign_size
 * are reset to 0, and memory_error's action returns STATUS_FATAL.
 */
static int phylip_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                         SEQINFO *info)
{
  int i, count, len, size, failed;
  char buffer[32];
  char **seqs, **ids;
  int *seqlens;

  if (isfp->malign_count == isfp->malign_size) {
    if (isfp->malign_size == 0) {
      size = 32;
      seqs = (char **) malloc(size * sizeof(char *));
      ids = (char **) malloc(size * sizeof(char *));
      seqlens = (int *) malloc(size * sizeof(int));
      if (seqs == NULL || ids == NULL || seqlens == NULL) {
        /* Release whichever of the three did get allocated. */
        free(seqs);
        free(ids);
        free(seqlens);
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_seqs = seqs;
      isfp->malign_ids = ids;
      isfp->malign_seqlens = seqlens;
      isfp->malign_size = size;
    }
    else {
      /*
       * Grow each array through a temporary.  A failed realloc leaves
       * the old block allocated, so the structure field must keep
       * pointing at it for the cleanup below to be able to free it.
       */
      size = isfp->malign_size + isfp->malign_size;
      failed = 0;

      seqs = (char **) realloc(isfp->malign_seqs, size * sizeof(char *));
      if (seqs != NULL)
        isfp->malign_seqs = seqs;
      else
        failed = 1;

      ids = (char **) realloc(isfp->malign_ids, size * sizeof(char *));
      if (ids != NULL)
        isfp->malign_ids = ids;
      else
        failed = 1;

      seqlens = (int *) realloc(isfp->malign_seqlens, size * sizeof(int));
      if (seqlens != NULL)
        isfp->malign_seqlens = seqlens;
      else
        failed = 1;

      if (failed) {
        for (i=0; i < isfp->malign_count; i++) {
          free(isfp->malign_seqs[i]);
          free(isfp->malign_ids[i]);
        }
        free(isfp->malign_seqs);
        free(isfp->malign_ids);
        free(isfp->malign_seqlens);
        isfp->malign_seqs = NULL;
        isfp->malign_ids = NULL;
        isfp->malign_seqlens = NULL;
        isfp->malign_count = isfp->malign_size = 0;
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_size = size;
    }
  }

  count = isfp->malign_count;
  isfp->malign_seqs[count] = mystrdup2(seq, seq+seqlen);
  isfp->malign_seqlens[count] = seqlen;
  len = seqfoneline(info, buffer, 10, 1);
  isfp->malign_ids[count] = (len ? mystrdup(buffer) : mystrdup("Unknown"));

  if (isfp->malign_seqs[count] == NULL || isfp->malign_ids[count] == NULL) {
    /*
     * The loop runs through index `count' inclusive so that whichever
     * of the two new strings was successfully duplicated is freed too.
     */
    for (i=0; i <= count; i++) {
      free(isfp->malign_seqs[i]);
      free(isfp->malign_ids[i]);
    }
    free(isfp->malign_seqs);
    free(isfp->malign_ids);
    free(isfp->malign_seqlens);
    isfp->malign_seqs = NULL;
    isfp->malign_ids = NULL;
    isfp->malign_seqlens = NULL;
    isfp->malign_count = isfp->malign_size = 0;
    memory_error(1, return STATUS_FATAL);
  }

  isfp->malign_count++;
  return STATUS_OK;
}
15442 
/*
 * phyint_putseqend
 *
 * Write the sequences collected in the isfp->malign_* arrays to
 * isfp->output_fp, block by block:  a first line giving the number of
 * sequences, the longest sequence length and the letter "I", then
 * blocks of up to 50 columns, each holding one line per sequence.
 * Blocks are separated by an empty line.
 *
 * Identifiers are printed (padded to 10 characters) in the first block
 * only; later blocks get 10 blanks instead.  Every group of ten
 * characters is preceded by a blank, and sequences shorter than the
 * longest one are padded with '-'.  When some but not all identifiers
 * satisfy is_idprefix, the text up to and including the first ':' is
 * dropped from those that do.
 *
 * Returns STATUS_OK; does nothing when no sequences have been stored.
 */
static int phyint_putseqend(INTSEQFILE *isfp)
{
  int which, offset, col, remaining, longest, total;
  int every_prefixed, some_prefixed, strip;
  char *chunk, *name;
  FILE *out;

  if (isfp->malign_count == 0)
    return STATUS_OK;

  out = isfp->output_fp;
  total = isfp->malign_count;

  longest = 0;
  every_prefixed = 1;
  some_prefixed = 0;
  for (which=0; which < total; which++) {
    if (isfp->malign_seqlens[which] > longest)
      longest = isfp->malign_seqlens[which];

    if (is_idprefix(isfp->malign_ids[which]))
      some_prefixed = 1;
    else
      every_prefixed = 0;
  }
  strip = (some_prefixed && !every_prefixed);

  fprintf(out, "     %d    %d  I\n", total, longest);

  for (offset=0; offset < longest; offset+=50) {
    for (which=0; which < total; which++) {
      if (offset == 0) {
        /* First block:  the line starts with the identifier. */
        name = isfp->malign_ids[which];
        if (strip && is_idprefix(name)) {
          while (*name != ':')
            name++;
          name++;
        }

        fprintf(out, "%-10s", name);
      }
      else
        fputs("          ", out);

      /* remaining is how much of this sequence is left at offset. */
      remaining = isfp->malign_seqlens[which] - offset;
      chunk = isfp->malign_seqs[which] + (remaining > 0 ? offset : 0);
      for (col=0; col < 50 && offset + col < longest; col++) {
        if (col % 10 == 0)
          fputc(' ', out);

        if (col < remaining)
          fputc(chunk[col], out);
        else
          fputc('-', out);
      }
      fputc(' ', out);
      fputc('\n', out);
    }

    /* An empty line after every block except the last. */
    if (offset + 50 < longest)
      fputc('\n', out);
  }

  return STATUS_OK;
}
15503 
15504 
/*
 * physeq_putseqend
 *
 * Write the sequences collected in the isfp->malign_* arrays to
 * isfp->output_fp, one whole sequence after another:  a first line
 * giving the number of sequences and the longest sequence length, then
 * for each sequence as many lines of up to 50 columns as are needed.
 *
 * The first line of a sequence starts with its identifier (padded to 10
 * characters); continuation lines start with 10 blanks.  Every group of
 * ten characters is preceded by a blank, and sequences shorter than the
 * longest one are padded with '-'.  When some but not all identifiers
 * satisfy is_idprefix, the text up to and including the first ':' is
 * dropped from those that do.
 *
 * Returns STATUS_OK; does nothing when no sequences have been stored.
 */
static int physeq_putseqend(INTSEQFILE *isfp)
{
  int which, offset, col, remaining, longest, total;
  int every_prefixed, some_prefixed, strip;
  char *chunk, *name;
  FILE *out;

  if (isfp->malign_count == 0)
    return STATUS_OK;

  out = isfp->output_fp;
  total = isfp->malign_count;

  longest = 0;
  every_prefixed = 1;
  some_prefixed = 0;
  for (which=0; which < total; which++) {
    if (isfp->malign_seqlens[which] > longest)
      longest = isfp->malign_seqlens[which];

    if (is_idprefix(isfp->malign_ids[which]))
      some_prefixed = 1;
    else
      every_prefixed = 0;
  }
  strip = (some_prefixed && !every_prefixed);

  fprintf(out, "     %d    %d\n", total, longest);

  for (which=0; which < total; which++) {
    for (offset=0; offset < longest; offset+=50) {
      if (offset == 0) {
        /* First line of this sequence:  start with the identifier. */
        name = isfp->malign_ids[which];
        if (strip && is_idprefix(name)) {
          while (*name != ':')
            name++;
          name++;
        }

        fprintf(out, "%-10s", name);
      }
      else
        fputs("          ", out);

      /* remaining is how much of this sequence is left at offset. */
      remaining = isfp->malign_seqlens[which] - offset;
      chunk = isfp->malign_seqs[which] + (remaining > 0 ? offset : 0);
      for (col=0; col < 50 && offset + col < longest; col++) {
        if (col % 10 == 0)
          fputc(' ', out);

        if (col < remaining)
          fputc(chunk[col], out);
        else
          fputc('-', out);
      }
      fputc(' ', out);
      fputc('\n', out);
    }
  }

  return STATUS_OK;
}
15563 
15564 
/*
 * clustal_putseq_discard
 *
 * Throw away the alignment accumulated so far:  free the strings held in
 * the first `nslots' entries of malign_seqs and malign_ids (entries may
 * be NULL), free the three parallel arrays, and reset the bookkeeping so
 * that malign_count == malign_size == 0.
 */
static void clustal_putseq_discard(INTSEQFILE *isfp, int nslots)
{
  int i;

  if (isfp->malign_seqs != NULL) {
    for (i=0; i < nslots; i++)
      free(isfp->malign_seqs[i]);
    free(isfp->malign_seqs);
    isfp->malign_seqs = NULL;
  }
  if (isfp->malign_ids != NULL) {
    for (i=0; i < nslots; i++)
      free(isfp->malign_ids[i]);
    free(isfp->malign_ids);
    isfp->malign_ids = NULL;
  }
  free(isfp->malign_seqlens);
  isfp->malign_seqlens = NULL;

  isfp->malign_count = isfp->malign_size = 0;
}


/*
 * clustal_putseq
 *
 * Store a copy of one sequence and of its identifier in the malign_*
 * arrays of `isfp'.  Nothing is written to the output file here; the
 * stored entries are printed by clustal_putseqend.
 *
 * Parameters:  isfp    -  the output file structure
 *              seq     -  the sequence characters (seqlen of them are copied)
 *              seqlen  -  the number of characters in `seq'
 *              info    -  passed to seqfoneline to build the identifier
 *
 * Returns:  STATUS_OK on success.  On any allocation failure, everything
 *           stored so far is released, the counters are reset to zero and
 *           the `memory_error' macro performs `return STATUS_FATAL'.
 */
static int clustal_putseq(INTSEQFILE *isfp, char *seq, int seqlen,
                          SEQINFO *info)
{
  int count, len, size;
  char buffer[32];
  char **newseqs, **newids;
  int *newlens;

  if (isfp->malign_count == isfp->malign_size) {
    if (isfp->malign_size == 0) {
      size = 32;
      isfp->malign_seqs = (char **) malloc(size * sizeof(char *));
      isfp->malign_ids = (char **) malloc(size * sizeof(char *));
      isfp->malign_seqlens = (int *) malloc(size * sizeof(int));
      if (isfp->malign_seqs == NULL || isfp->malign_ids == NULL ||
          isfp->malign_seqlens == NULL) {
        /* Release whichever of the three allocations did succeed. */
        clustal_putseq_discard(isfp, 0);
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_size = size;
    }
    else {
      /*
       * Double the capacity.  Each realloc result goes through a
       * temporary, because a failed realloc leaves the old block
       * allocated and the old pointer is the only way to free it (and,
       * for the string arrays, the strings it refers to).
       */
      size = isfp->malign_size + isfp->malign_size;

      newseqs = (char **) realloc(isfp->malign_seqs, size * sizeof(char *));
      if (newseqs != NULL)
        isfp->malign_seqs = newseqs;

      newids = (char **) realloc(isfp->malign_ids, size * sizeof(char *));
      if (newids != NULL)
        isfp->malign_ids = newids;

      newlens = (int *) realloc(isfp->malign_seqlens, size * sizeof(int));
      if (newlens != NULL)
        isfp->malign_seqlens = newlens;

      if (newseqs == NULL || newids == NULL || newlens == NULL) {
        clustal_putseq_discard(isfp, isfp->malign_count);
        memory_error(1, return STATUS_FATAL);
      }
      isfp->malign_size = size;
    }
  }

  count = isfp->malign_count;
  isfp->malign_seqs[count] = mystrdup2(seq, seq+seqlen);
  isfp->malign_seqlens[count] = seqlen;

  /* A zero result from seqfoneline means no identifier text was made. */
  len = seqfoneline(info, buffer, 15, 1);
  isfp->malign_ids[count] = (len ? mystrdup(buffer) : mystrdup("Unknown"));

  if (isfp->malign_seqs[count] == NULL || isfp->malign_ids[count] == NULL) {
    /* Slot `count' is included so a lone successful copy is freed too. */
    clustal_putseq_discard(isfp, count + 1);
    memory_error(1, return STATUS_FATAL);
  }

  isfp->malign_count++;
  return STATUS_OK;
}
15644 
/*
 * clustal_putseqend
 *
 * Print the sequences collected by clustal_putseq as a Clustal W style
 * alignment:  a banner line, then blocks of at most 60 columns.  Each
 * block has one line per sequence (identifier left-justified in a field
 * of 15, then the residues, padded with '-' up to the longest sequence)
 * followed by a line of blanks of the same width.
 *
 * If some, but not all, identifiers satisfy is_idprefix, the text up to
 * and including the first ':' is dropped from those that do.
 *
 * Returns:  STATUS_OK (also when no sequences were stored, in which
 *           case nothing is printed).
 */
static int clustal_putseqend(INTSEQFILE *isfp)
{
  int row, col, start, width, avail, longest, total;
  int every_prefixed, some_prefixed, strip;
  char *name, *residues;
  FILE *out;

  total = isfp->malign_count;
  if (total == 0)
    return STATUS_OK;

  out = isfp->output_fp;

  /*
   * One pass to find the alignment width and to see how the
   * identifiers are prefixed.
   */
  longest = 0;
  every_prefixed = 1;
  some_prefixed = 0;
  for (row=0; row < total; row++) {
    if (isfp->malign_seqlens[row] > longest)
      longest = isfp->malign_seqlens[row];

    if (is_idprefix(isfp->malign_ids[row]))
      some_prefixed = 1;
    else
      every_prefixed = 0;
  }
  strip = (some_prefixed && !every_prefixed);

  fputs("CLUSTAL W(*.**) multiple sequence alignment\n\n\n", out);

  for (start=0; start < longest; start+=60) {
    /* Number of alignment columns in this block. */
    width = longest - start;
    if (width > 60)
      width = 60;

    fputc('\n', out);
    for (row=0; row < total; row++) {
      name = isfp->malign_ids[row];
      if (strip && is_idprefix(name)) {
        while (*name != ':')
          name++;
        name++;
      }
      fprintf(out, "%-15s", name);

      /* Residues this sequence still has at or after column `start'. */
      avail = isfp->malign_seqlens[row] - start;
      residues = isfp->malign_seqs[row];
      for (col=0; col < width; col++)
        fputc((col < avail ? residues[start + col] : '-'), out);
      fputc('\n', out);
    }

    /* The blank line under the block:  15 + width spaces. */
    fprintf(out, "%*s\n", 15 + width, "");
  }

  return STATUS_OK;
}
15700 
15701 
15702 
/*
 * asn_descr_next
 *
 * Print what must come before one element of the `descr' record:  the
 * opening "descr {" line if the record has not been started yet, or the
 * " ," separator otherwise.  `*started' is set once the record is open.
 */
static void asn_descr_next(FILE *fp, int *started)
{
  if (!*started) {
    fputs("      descr {\n", fp);
    *started = 1;
  }
  else
    fputs(" ,\n", fp);
}


/*
 * asn_putseq
 *
 * Write one sequence as a `seq' record of an ASN.1 text Bioseq-set.
 * The first call on a file (entry_count == 0) also writes the opening
 * of the Bioseq-set; later calls write the separator between records.
 * The closing braces of the set are written by asn_putseqend.
 *
 * The record has three parts:
 *    id     -  built from the '|' separated identifiers in info->idlist
 *    descr  -  fragment flag, description, organism, date, comment, the
 *              identifiers not used in `id' ("SEQIO Refs") and history
 *    inst   -  molecule type, length, topology and the sequence itself
 *
 * Parameters:  isfp    -  the output file structure
 *              seq     -  the sequence characters
 *              seqlen  -  the number of characters in `seq'
 *              info    -  the information about the sequence
 *
 * Returns:  STATUS_OK.
 */
static int asn_putseq(INTSEQFILE *isfp, char *seq, int seqlen, SEQINFO *info)
{
  int i, col, alpha, flag, flag2, idlen, pos;
  char ch, *s, *t, *accstr, *accend, *gistr, *gisend, *giimstr, *giimend;
  char *bbsstr, *bbsend, *bbmstr, *bbmend, *idstr, *idend, *idpref;
  FILE *fp = isfp->output_fp;

  /*
   * Print the header (either of the whole file or the link between `seq'
   * records).
   */
  if (isfp->entry_count == 0)
    fputs("Bioseq-set ::= {\n  seq-set {\n", fp);
  else
    fputs(" ,\n", fp);
  isfp->entry_count++;

  fputs("    seq {\n", fp);

  /*
   * Do the `id' record.
   *
   * First scan the identifier list, remembering the first identifier of
   * each recognized kind as a [start,end) pair pointing into the list.
   */
  accstr = gistr = giimstr = bbsstr = bbmstr = idstr = NULL;
  accend = gisend = giimend = bbsend = bbmend = idend = NULL;
  idpref = NULL;
  idlen = 0;

  fputs("      id {\n", fp);
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;

      /* The ctype functions require an unsigned char value. */
      ch = (char) toupper((unsigned char) *t);
      if (!accstr && mystreq(t, 'A', "ACC:")) {
        accstr = t;
        accend = s;
      }
      else if (ch == 'B') {
        if (!bbsstr && mystreq(t, 'B', "BBS:")) {
          bbsstr = t;
          bbsend = s;
        }
        else if (!bbmstr && mystreq(t, 'B', "BBM:")) {
          bbmstr = t;
          bbmend = s;
        }
      }
      else if (ch == 'G' && toupper((unsigned char) t[1]) == 'I') {
        if (!gistr && t[2] == ':') {    /* GI: */
          gistr = t;
          gisend = s;
        }
        else if (!giimstr && mystreq(t+2, 'I', "IM:")) {   /* GIIM: */
          giimstr = t;
          giimend = s;
        }
      }
      else if (!idstr) {
        /*
         * The database identifiers.  `idlen' is the length of the
         * prefix (including the colon) to skip when printing the name.
         * `idpref' is still NULL here, since it is only set along
         * with `idstr'.
         */
        if (mystreq(t, 'G', "GB:")) {
          idpref = "genbank";
          idlen = 3;
        }
        else if (mystreq(t, 'P', "PIR:")) {
          idpref = "pir";
          idlen = 4;
        }
        else if (mystreq(t, 'E', "EMBL:")) {
          idpref = "embl";
          idlen = 5;
        }
        else if (mystreq(t, 'S', "SP:")) {
          idpref = "swissprot";
          idlen = 3;
        }
        else if (mystreq(t, 'P', "PDB:")) {
          idpref = "pdb";
          idlen = 4;
        }
        else if (mystreq(t, 'D', "DBJ:")) {
          idpref = "ddbj";
          idlen = 4;
        }
        else if (mystreq(t, 'P', "PRF:")) {
          idpref = "prf";
          idlen = 4;
        }
        else if (mystreq(t, 'O', "OTH:")) {
          idpref = "other";
          idlen = 4;
        }

        if (idpref != NULL) {
          idstr = t;
          idend = s;
        }
      }

      if (*s) s++;
    }

    /*
     * Now print the identifiers that were found.  `flag' becomes set
     * once anything has been printed, so that later elements know to
     * print a separator first.
     *
     * An accession is only printed inside a database identifier other
     * than pdb.  In every other case `accstr' is cleared, which makes
     * the accession show up among the "SEQIO Refs" below instead.
     */
    flag = 0;
    if (idstr) {
      if (!strcmp(idpref, "pdb")) {
        fputs("        pdb {\n          mol \"", fp);
        fwrite(idstr+4, 1, idend - idstr - 4, fp);
        fputs("\" }", fp);
        accstr = accend = NULL;
      }
      else {
        fprintf(fp, "        %s {\n          name \"", idpref);
        fwrite(idstr+idlen, 1, idend - idstr - idlen, fp);
        fputc('"', fp);
        if (accstr) {
          fputs(" ,\n          accession \"", fp);
          fwrite(accstr+4, 1, accend - accstr - 4, fp);
          fputc('"', fp);
        }
        fputs(" }", fp);
      }
      flag = 1;
    }
    else
      accstr = NULL;

    if (gistr) {
      if (flag)
        fputs(" ,\n", fp);
      fputs("        gi ", fp);
      fwrite(gistr+3, 1, gisend - gistr - 3, fp);
      flag = 1;
    }
    if (bbsstr) {
      if (flag)
        fputs(" ,\n", fp);
      fputs("        gibbsq ", fp);
      fwrite(bbsstr+4, 1, bbsend - bbsstr - 4, fp);
      flag = 1;
    }
    if (bbmstr) {
      if (flag)
        fputs(" ,\n", fp);
      fputs("        gibbmt ", fp);
      fwrite(bbmstr+4, 1, bbmend - bbmstr - 4, fp);
      flag = 1;
    }
    if (giimstr) {
      if (flag)
        fputs(" ,\n", fp);
      fputs("        giim {\n          id ", fp);
      fwrite(giimstr+5, 1, giimend - giimstr - 5, fp);
      fputs(" }", fp);
      flag = 1;
    }

    if (!flag)
      fputs("        other {\n          name \"(below)\" }", fp);
  }
  else {
    fputs("        other {\n          name \"Unknown\" }", fp);
  }
  fputs(" } ,\n", fp);


  /*
   * Do the `descr' record.  It is only opened (by asn_descr_next) when
   * there is at least one element to put in it; `flag' records whether
   * that has happened.
   */
  flag = 0;
  if (info->isfragment) {
    asn_descr_next(fp, &flag);
    fputs("        modif {\n          partial }", fp);
  }

  if (info->description && info->description[0]) {
    asn_descr_next(fp, &flag);
    fputs("        title \"", fp);
    putline(fp, 0, 15, info->description, 78, " ", 0);
    fputc('"', fp);
  }

  if (info->organism && info->organism[0]) {
    asn_descr_next(fp, &flag);
    fputs("        org {\n          taxname \"", fp);
    putline(fp, 0, 19, info->organism, 78, " ", 0);
    fputs("\" }", fp);
  }

  if (info->date && info->date[0]) {
    asn_descr_next(fp, &flag);
    fputs("        update-date\n", fp);
    fprintf(fp, "          str \"%s\"", info->date);
  }

  if (info->comment && info->comment[0]) {
    asn_descr_next(fp, &flag);
    fputs("        comment \"", fp);
    putline(fp, 0, 17, info->comment, 78, " ", 1);
    fputc('"', fp);
  }

  /*
   * Every identifier that was not printed in the `id' record goes into
   * one or more "SEQIO Refs" comments.  `flag2' is zero until such a
   * comment is open; after that it holds the current output column (29
   * is the width of the text that opens the comment), so that a new
   * comment is started when the next identifier would pass column 78.
   */
  flag2 = 0;
  if (info->idlist && info->idlist[0]) {
    for (s=info->idlist; *s; ) {
      for (t=s; *s && *s != '|'; s++) ;
      if (t != idstr && t != accstr && t != gistr &&
          t != giimstr && t != bbsstr && t != bbmstr) {
        if (!flag2) {
          asn_descr_next(fp, &flag);
          fputs("        comment \"SEQIO Refs: ", fp);
          flag2 = 29;
        }
        else if (flag2 + (s - t) + 3 >= 78) {
          fputs("\" ,\n        comment \"SEQIO Refs: ", fp);
          flag2 = 29;
        }
        else {
          fputc('|', fp);
          flag2++;
        }

        fwrite(t, 1, s - t, fp);
        flag2 += s - t;
      }

      if (*s) s++;
    }
  }

  /*
   * The history is appended to the open "SEQIO Refs" comment if there
   * is one, and otherwise gets a comment of its own.  Either way the
   * comment is left open here and closed just below.
   */
  if (info->history && info->history[0]) {
    if (!flag2) {
      asn_descr_next(fp, &flag);
      fputs("        comment \"", fp);
      pos = 17;
      flag2 = 1;
    }
    else {
      fputs("\n ", fp);
      pos = 1;
    }

    putline(fp, 0, pos, info->history, 78, " ", 1);
  }

  if (flag2)
    fputc('"', fp);

  if (flag)
    fputs(" } ,\n", fp);

  /*
   * Do the `inst' record.
   */
  fputs("      inst {\n", fp);
  fputs("        repr raw ,\n", fp);

  alpha = (info->alphabet != UNKNOWN ? info->alphabet
                                     : guessalpha(seq, seqlen, NULL, NULL));
  if (alpha == DNA || alpha == RNA || alpha == PROTEIN)
    fprintf(fp, "        mol %s ,\n",
            (alpha == DNA ? "dna" : (alpha == RNA ? "rna" : "aa")));

  fprintf(fp, "        length %d ,\n", seqlen);
  if (info->iscircular)
    fputs("        topology circular ,\n", fp);

  fputs("        seq-data\n", fp);
  fprintf(fp, "          %s \"",
          (alpha == DNA || alpha == RNA ? "iupacna"
             : (alpha == PROTEIN ? "iupacaa" : "ascii")));

  /*
   * Print the sequence, breaking the line before column 79.  The
   * starting column of 19 is the width of the text just printed when
   * the keyword is "iupacna" or "iupacaa".
   */
  for (i=0,col=19; i < seqlen; i++) {
    if (++col == 79) {
      fputc('\n', fp);
      col = 1;
    }
    fputc(seq[i], fp);
  }
  fputs("\" } }", fp);

  return STATUS_OK;
}
16008 
16009 
/*
 * asn_putseqend
 *
 * Finish an ASN.1 output file by printing the two closing braces that
 * match the "Bioseq-set ::= {" and "seq-set {" lines which asn_putseq
 * prints before the first record.  Nothing is printed unless the file's
 * operation type is OP_WRITE.
 *
 * Returns:  STATUS_OK.
 */
static int asn_putseqend(INTSEQFILE *isfp)
{
  if (isfp->optype != OP_WRITE)
    return STATUS_OK;

  fputs(" } }\n", isfp->output_fp);
  return STATUS_OK;
}
16016 
16017 
16018 
16019 
16020 
16021 /*
16022  *
16023  *
16024  * Section containing the annotate functions.
16025  *
16026  *
16027  *
16028  */
16029 
16030 
genbank_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16031 static int genbank_annotate(FILE *fp, char *entry, int entrylen,
16032                             char *newcomment, int flag)
16033 {
16034   int status, count, hcount, ncount, pos;
16035   char *s, *line, *end, *history, *comment, *comend, *lastline;
16036 
16037   error_test(!mystreq(entry, 'L', "LOCUS       "),
16038              E_PARSEERROR, return STATUS_ERROR,
16039              print_error("seqfannotate:  Entry not in GenBank format.\n"));
16040 
16041   for (ncount=0,s=newcomment; *s; s++)
16042     if (*s == '\n')
16043       ncount++;
16044   if (*(s-1) != '\n')
16045     ncount++;
16046 
16047   /*
16048    * Skip past and output the text before the comment.
16049    */
16050   gi_startline(entry, entrylen);
16051 
16052   lastline = NULL;
16053   while (1) {
16054     status = gi_getline(&line, &end, 0);
16055     error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16056                print_error("seqfannotate:  Premature end of entry.\n"));
16057 
16058     if (mystreq(line, 'C', "COMMENT") || mystreq(line, 'F', "FEATURES") ||
16059         mystreq(line, 'B', "BASE COUNT") || mystreq(line, 'O', "ORIGIN"))
16060       break;
16061   }
16062   fwrite(entry, 1, line - entry, fp);
16063 
16064   /*
16065    * Output the old and new comments and the history, depending on what
16066    * appears in the entry and the value of `flag'.
16067    */
16068   fputs("COMMENT     ", fp);
16069 
16070   history = NULL;
16071   count = hcount = 0;
16072   if (toupper(*line) == 'C') {
16073     for (s=line+7; s < line + 12 && s < end && isspace(*s); s++) ;
16074     comment = s;
16075     comend = NULL;
16076     while (isspace(line[0]) || count == 0) {
16077       for (s=line; s < end && isspace(*s); s++) ;
16078       if (s < end &&
16079           (mystreq(s, 'S', "SEQIO") || (history != NULL && s == line + 14))) {
16080         if (history == NULL) {
16081           history = s;
16082           if (comend == NULL)
16083             comend = line;
16084         }
16085         hcount++;
16086       }
16087       else {
16088         history = NULL;
16089         hcount = 0;
16090         comend = (line + 12 == end ? line : NULL);
16091       }
16092 
16093       status = gi_getline(&line, &end, 0);
16094       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16095                  print_error("seqfannotate:  Premature end of entry.\n"));
16096       count++;
16097     }
16098     if (comend == NULL)
16099       comend = line;
16100 
16101     if (flag && comment < comend) {
16102       fwrite(comment, 1, comend - comment, fp);
16103       fputs("            \n            ", fp);
16104       count++;
16105     }
16106   }
16107 
16108   /*
16109    * Output the new comment, the history lines if they exist, and
16110    * a history line noting the annotation.
16111    */
16112   putline(fp, 0, 12, newcomment, 80, "            ", 1);
16113   fputs("\n            \n            ", fp);
16114 
16115   if (history != NULL) {
16116     fwrite(history, 1, line - history, fp);
16117     fputs("            ", fp);
16118   }
16119 
16120   pos = (flag ? count - hcount + 1 : 1);
16121   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16122           get_today());
16123 
16124   /*
16125    * Output the rest of the entry.
16126    */
16127   end = entry + entrylen;
16128   fwrite(line, 1, end - line, fp);
16129 
16130   return STATUS_OK;
16131 }
16132 
16133 
pir_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16134 static int pir_annotate(FILE *fp, char *entry, int entrylen, char *newcomment,
16135                         int flag)
16136 {
16137   int status, count, hcount, ncount, pos;
16138   char *s, *line, *end, *history, *comment, *comend, *lastline;
16139 
16140   error_test(!mystreq(entry, 'E', "ENTRY"), E_PARSEERROR, return STATUS_ERROR,
16141              print_error("seqfannotate:  Entry not in PIR format.\n"));
16142 
16143   for (ncount=0,s=newcomment; *s; s++)
16144     if (*s == '\n')
16145       ncount++;
16146   if (*(s-1) != '\n')
16147     ncount++;
16148 
16149   /*
16150    * Skip past and output the text before the comment.
16151    */
16152   gi_startline(entry, entrylen);
16153 
16154   lastline = NULL;
16155   while (1) {
16156     status = gi_getline(&line, &end, 0);
16157     error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16158                print_error("seqfannotate:  Premature end of entry.\n"));
16159 
16160     if (!isspace(line[0]) &&
16161         (mystreq(line, 'C', "COMMENT") || mystreq(line, 'G', "GENETIC") ||
16162          mystreq(line, 'C', "CLASSIFICATION") ||
16163          mystreq(line, 'K', "KEYWORDS") || mystreq(line, 'F', "FEATURE") ||
16164          mystreq(line, 'S', "SUMMARY") || mystreq(line, 'S', "SEQUENCE")))
16165       break;
16166   }
16167   fwrite(entry, 1, line - entry, fp);
16168 
16169   /*
16170    * Output the old and new comments and the history, depending on what
16171    * appears in the entry and the value of `flag'.
16172    */
16173   fputs("COMMENT    ", fp);
16174 
16175   history = NULL;
16176   count = hcount = 0;
16177   if (toupper(*line) == 'C' && toupper(line[1]) == 'O') {
16178     for (s=line+7; s < line + 11 && s < end && isspace(*s); s++) ;
16179     comment = s;
16180     comend = NULL;
16181     while (isspace(line[0]) || count == 0) {
16182       for (s=line; s < end && isspace(*s); s++) ;
16183       if (s < end &&
16184           (mystreq(s, 'S', "SEQIO") || (history != NULL && s == line + 13))) {
16185         if (history == NULL) {
16186           history = s;
16187           if (comend == NULL)
16188             comend = line;
16189         }
16190         hcount++;
16191       }
16192       else {
16193         history = NULL;
16194         hcount = 0;
16195         comend = (line + 11 == end ? line : NULL);
16196       }
16197 
16198       status = gi_getline(&line, &end, 0);
16199       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16200                  print_error("seqfannotate:  Premature end of entry.\n"));
16201       count++;
16202     }
16203     if (comend == NULL)
16204       comend = line;
16205 
16206     if (flag && comment < comend) {
16207       fwrite(comment, 1, comend - comment, fp);
16208       fputs("           \n           ", fp);
16209       count++;
16210     }
16211   }
16212 
16213   /*
16214    * Output the new comment, the history lines if they exist, and
16215    * a history line noting the annotation.
16216    */
16217   putline(fp, 0, 11, newcomment, 80, "           ", 1);
16218   fputs("\n           \n           ", fp);
16219 
16220   if (history != NULL) {
16221     fwrite(history, 1, line - history, fp);
16222     fputs("           ", fp);
16223   }
16224 
16225   pos = (flag ? count - hcount + 1 : 1);
16226   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16227           get_today());
16228 
16229   /*
16230    * Output the rest of the entry.
16231    */
16232   end = entry + entrylen;
16233   fwrite(line, 1, end - line, fp);
16234 
16235   return STATUS_OK;
16236 }
16237 
embl_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16238 static int embl_annotate(FILE *fp, char *entry, int entrylen, char *newcomment,
16239                          int flag)
16240 {
16241   int status, count, hcount, ncount, pos, ccflag, xxflag;
16242   char *s, *line, *end, *history, *comment, *comend, *lastline;
16243   char *trailing_line;
16244 
16245   error_test(!mystreq(entry, 'I', "ID   "), E_PARSEERROR, return STATUS_ERROR,
16246              print_error("seqfannotate:  Entry not in EMBL format.\n"));
16247 
16248   for (ncount=0,s=newcomment; *s; s++)
16249     if (*s == '\n')
16250       ncount++;
16251   if (*(s-1) != '\n')
16252     ncount++;
16253 
16254   /*
16255    * Skip past and output the text before the comment.
16256    */
16257   gi_startline(entry, entrylen);
16258 
16259   lastline = NULL;
16260   while (1) {
16261     status = gi_getline(&line, &end, 0);
16262     error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16263                print_error("seqfannotate:  Premature end of entry.\n"));
16264 
16265     if (mystreq(line, 'C', "CC   ") || mystreq(line, 'X', "XX   ") ||
16266         mystreq(line, 'S', "SQ   ") || !strncmp(line, "     ", 5))
16267       break;
16268 
16269     if (!lastline &&
16270         (mystreq(line, 'D', "DR   ") || mystreq(line, 'P', "PR   ") ||
16271          mystreq(line, 'F', "FT   ") || mystreq(line, 'F', "FH   ")))
16272       lastline = line;
16273   }
16274   ccflag = (toupper(*line) == 'C');
16275   xxflag = (toupper(*line) == 'X');
16276 
16277   if (!ccflag && !xxflag && lastline != NULL) {
16278     gi_ungetline(lastline);
16279     gi_getline(&line, &end, 0);
16280   }
16281 
16282   fwrite(entry, 1, line - entry, fp);
16283 
16284   /*
16285    * Output the old and new comments and the history, depending on what
16286    * appears in the entry and the value of `flag'.
16287    */
16288   fputs((xxflag ? "XX   " : "CC   "), fp);
16289 
16290   history = NULL;
16291   count = hcount = 0;
16292   if (ccflag || xxflag) {
16293     comment = line+5;
16294     comend = NULL;
16295     trailing_line = NULL;
16296     while ((xxflag && mystreq(line, 'X', "XX   ")) ||
16297            (ccflag &&
16298             (mystreq(line, 'C', "CC   ") || mystreq(line, 'X', "XX")))) {
16299       if (trailing_line != NULL) {
16300         history = NULL;
16301         hcount = 0;
16302         comend = trailing_line;
16303         trailing_line = NULL;
16304       }
16305 
16306       if (end - line > 5 && (mystreq(line+5, 'S', "SEQIO") ||
16307                              (history != NULL && line[5] == ' ' &&
16308                               line[6] == ' ' && line[7] != ' '))) {
16309         if (history == NULL) {
16310           history = line+5;
16311           if (comend == NULL)
16312             comend = line;
16313         }
16314         hcount++;
16315       }
16316       else if (end - line < 5)
16317         trailing_line = line;
16318       else {
16319         history = NULL;
16320         hcount = 0;
16321         comend = (line+5 == end ? line : NULL);
16322       }
16323 
16324       status = gi_getline(&line, &end, 0);
16325       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16326                  print_error("seqfannotate:  Premature end of entry.\n"));
16327       count++;
16328     }
16329     if (trailing_line != NULL)
16330       line = trailing_line;
16331 
16332     if (comend == NULL)
16333       comend = line;
16334 
16335     if (flag && comment < comend) {
16336       fwrite(comment, 1, comend - comment, fp);
16337       fputs((xxflag ? "XX   \nXX   " : "CC   \nCC   "), fp);
16338       count++;
16339     }
16340   }
16341 
16342   /*
16343    * Output the new comment, the history lines if they exist, and
16344    * a history line noting the annotation.
16345    */
16346   putline(fp, 0, 5, newcomment, 80, (xxflag ? "XX   " : "CC   "), 1);
16347   fputs((xxflag ? "\nXX   \nXX   " : "\nCC   \nCC   "), fp);
16348 
16349   if (history != NULL) {
16350     fwrite(history, 1, line - history, fp);
16351     fputs((xxflag ? "XX   " : "CC   "), fp);
16352   }
16353 
16354   pos = (flag ? count - hcount + 1 : 1);
16355   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16356           get_today());
16357 
16358   if (!ccflag && !xxflag)
16359     fputs("XX\n", fp);
16360 
16361   /*
16362    * Output the rest of the entry.
16363    */
16364   end = entry + entrylen;
16365   fwrite(line, 1, end - line, fp);
16366 
16367   return STATUS_OK;
16368 }
16369 
16370 
sprot_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16371 static int sprot_annotate(FILE *fp, char *entry, int entrylen,
16372                           char *newcomment, int flag)
16373 {
16374   int status, count, hcount, ncount, pos, ccflag;
16375   char *s, *line, *end, *history, *comment, *comend, *lastline;
16376 
16377   error_test(!mystreq(entry, 'I', "ID   "), E_PARSEERROR, return STATUS_ERROR,
16378              print_error("seqfannotate:  Entry not in Swiss-Prot format.\n"));
16379 
16380   for (ncount=0,s=newcomment; *s; s++)
16381     if (*s == '\n')
16382       ncount++;
16383   if (*(s-1) != '\n')
16384     ncount++;
16385 
16386   /*
16387    * Skip past and output the text before the comment.
16388    */
16389   gi_startline(entry, entrylen);
16390 
16391   lastline = NULL;
16392   while (1) {
16393     status = gi_getline(&line, &end, 0);
16394     error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16395                print_error("seqfannotate:  Premature end of entry.\n"));
16396 
16397     if (mystreq(line, 'C', "CC   ") || mystreq(line, 'S', "SQ   ") ||
16398         !strncmp(line, "     ", 5))
16399       break;
16400 
16401     if (!lastline &&
16402         (mystreq(line, 'D', "DR   ") || mystreq(line, 'K', "KW   ") ||
16403          mystreq(line, 'F', "FT   ")))
16404       lastline = line;
16405   }
16406   ccflag = (toupper(*line) == 'C');
16407 
16408   if (!ccflag && lastline != NULL) {
16409     gi_ungetline(lastline);
16410     gi_getline(&line, &end, 0);
16411   }
16412 
16413   fwrite(entry, 1, line - entry, fp);
16414 
16415   /*
16416    * Output the old and new comments and the history, depending on what
16417    * appears in the entry and the value of `flag'.
16418    */
16419   fputs("CC   ", fp);
16420 
16421   history = NULL;
16422   count = hcount = 0;
16423   if (ccflag) {
16424     comment = line+5;
16425     comend = NULL;
16426     while (mystreq(line, 'C', "CC   ")) {
16427       if (mystreq(line+5, 'S', "SEQIO") ||
16428           (history != NULL && line[5] == ' ' &&
16429            line[6] == ' ' && line[7] != ' ')) {
16430         if (history == NULL) {
16431           history = line+5;
16432           if (comend == NULL)
16433             comend = line;
16434         }
16435         hcount++;
16436       }
16437       else {
16438         history = NULL;
16439         hcount = 0;
16440         comend = (line+5 == end ? line : NULL);
16441       }
16442 
16443       status = gi_getline(&line, &end, 0);
16444       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16445                  print_error("seqfannotate:  Premature end of entry.\n"));
16446       count++;
16447     }
16448     if (comend == NULL)
16449       comend = line;
16450 
16451     if (flag && comment < comend) {
16452       fwrite(comment, 1, comend - comment, fp);
16453       fputs("CC   \nCC   ", fp);
16454       count++;
16455     }
16456   }
16457 
16458   /*
16459    * Output the new comment, the history lines if they exist, and
16460    * a history line noting the annotation.
16461    */
16462   putline(fp, 0, 5, newcomment, 80, "CC   ", 1);
16463   fputs("\nCC   \nCC   ", fp);
16464 
16465   if (history != NULL) {
16466     fwrite(history, 1, line - history, fp);
16467     fputs("CC   ", fp);
16468   }
16469 
16470   pos = (flag ? count - hcount + 1 : 1);
16471   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16472           get_today());
16473 
16474   /*
16475    * Output the rest of the entry.
16476    */
16477   end = entry + entrylen;
16478   fwrite(line, 1, end - line, fp);
16479 
16480   return STATUS_OK;
16481 }
16482 
16483 
fasta_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16484 static int fasta_annotate(FILE *fp, char *entry, int entrylen,
16485                           char *newcomment, int flag)
16486 {
16487   int status, count, hcount, ncount, pos;
16488   char *s, *line, *end, *history, *comment, *comend;
16489 
16490   error_test(*entry != '>', E_PARSEERROR, return STATUS_ERROR,
16491              print_error("seqfannotate:  Entry not in FASTA format.\n"));
16492 
16493   for (ncount=0,s=newcomment; *s; s++)
16494     if (*s == '\n')
16495       ncount++;
16496   if (*(s-1) != '\n')
16497     ncount++;
16498 
16499   /*
16500    * Skip past and output the oneline description before the comment.
16501    */
16502   gi_startline(entry, entrylen);
16503   if (gi_getline(&line, &end, 0) == 0 || gi_getline(&line, &end, 0) == 0)
16504     raise_error(E_PARSEERROR, return STATUS_ERROR,
16505                 print_error("seqfannotate:  Premature end of entry.\n"));
16506 
16507   fwrite(entry, 1, line - entry, fp);
16508 
16509   /*
16510    * Output the old and new comments and the history, depending on what
16511    * appears in the entry and the value of `flag'.
16512    */
16513   fputc('>', fp);
16514 
16515   history = NULL;
16516   count = hcount = 0;
16517   if (*line == '>') {
16518     comment = line+1;
16519     comend = NULL;
16520     while (*line == '>') {
16521       if (mystreq(line+1, 'S', "SEQIO") ||
16522           (history != NULL && line[1] == ' ' &&
16523            line[2] == ' ' && line[3] != ' ')) {
16524         if (history == NULL) {
16525           history = line+1;
16526           if (comend == NULL)
16527             comend = line;
16528         }
16529         hcount++;
16530       }
16531       else {
16532         history = NULL;
16533         hcount = 0;
16534         comend = (line+1 == end ? line : NULL);
16535       }
16536 
16537       status = gi_getline(&line, &end, 0);
16538       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16539                  print_error("seqfannotate:  Premature end of entry.\n"));
16540       count++;
16541     }
16542     if (comend == NULL)
16543       comend = line;
16544 
16545     if (flag && comment < comend) {
16546       fwrite(comment, 1, comend - comment, fp);
16547       fputc('\n', fp);
16548     }
16549   }
16550 
16551   /*
16552    * Output the new comment, the history lines if they exist, and
16553    * a history line noting the annotation.
16554    */
16555   fputs("\n>", fp);
16556   count++;
16557   putline(fp, 0, 1, newcomment, 80, ">", 1);
16558   fputs("\n>\n>", fp);
16559 
16560   if (history != NULL) {
16561     fwrite(history, 1, line - history, fp);
16562     fputc('>', fp);
16563   }
16564 
16565   pos = (flag ? count - hcount + 1 : 1);
16566   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16567           get_today());
16568 
16569   /*
16570    * Output the rest of the entry.
16571    */
16572   end = entry + entrylen;
16573   fwrite(line, 1, end - line, fp);
16574 
16575   return STATUS_OK;
16576 }
16577 
16578 
nbrf_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16579 static int nbrf_annotate(FILE *fp, char *entry, int entrylen,
16580                          char *newcomment, int flag)
16581 {
16582   int status, count, hcount, ncount, pos, ccflag;
16583   char *s, *line, *end, *history, *comment, *comend, *lastline;
16584 
16585   error_test(*entry != '>', E_PARSEERROR, return STATUS_ERROR,
16586              print_error("seqfannotate:  Entry not in NBRF format.\n"));
16587 
16588   for (ncount=0,s=newcomment; *s; s++)
16589     if (*s == '\n')
16590       ncount++;
16591   if (*(s-1) != '\n')
16592     ncount++;
16593 
16594   /*
16595    * Skip past and output the oneline description before the comment.
16596    */
16597   gi_startline(entry, entrylen);
16598 
16599   lastline = NULL;
16600   ccflag = 0;
16601   while ((status = gi_getline(&line, &end, 0))) {
16602     if (mystreq(line, 'C', "C;COMMENT:")) {
16603       ccflag = 1;
16604       break;
16605     }
16606 
16607     if (!lastline &&
16608         (mystreq(line, 'F', "F;") ||
16609          (toupper(line[0]) == 'C' &&
16610           (mystreq(line, 'C', "C;GENETICS:") ||
16611            mystreq(line, 'C', "C;COMPLEX:") ||
16612            mystreq(line, 'C', "C;FUNCTION:") ||
16613            mystreq(line, 'C', "C;SUPERFAMILY:") ||
16614            mystreq(line, 'C', "C;KEYWORDS:")))))
16615       lastline = line;
16616   }
16617   if (status == 0)
16618     fwrite(entry, 1, entrylen, fp);
16619   else {
16620     if (!ccflag && lastline != NULL) {
16621       gi_ungetline(lastline);
16622       gi_getline(&line, &end, 0);
16623     }
16624 
16625     fwrite(entry, 1, line - entry, fp);
16626   }
16627 
16628   /*
16629    * Output the old and new comments and the history, depending on what
16630    * appears in the entry and the value of `flag'.
16631    */
16632   fputs("C;Comment: ", fp);
16633 
16634   history = NULL;
16635   count = hcount = 0;
16636   if (ccflag) {
16637     comment = line+11;
16638     comend = NULL;
16639     while (status && mystreq(line, 'C', "C;COMMENT: ")) {
16640       if (mystreq(line+11, 'S', "SEQIO") ||
16641           (history != NULL && line[11] == ' ' &&
16642            line[12] == ' ' && line[13] != ' ')) {
16643         if (history == NULL) {
16644           history = line+11;
16645           if (comend == NULL)
16646             comend = line;
16647         }
16648         hcount++;
16649       }
16650       else {
16651         history = NULL;
16652         hcount = 0;
16653         comend = (line+11 == end ? line : NULL);
16654       }
16655 
16656       status = gi_getline(&line, &end, 0);
16657       count++;
16658     }
16659     if (status == 0)
16660       comend = entry + entrylen;
16661     else if (comend == NULL)
16662       comend = line;
16663 
16664     if (flag && comment < comend) {
16665       fwrite(comment, 1, comend - comment, fp);
16666       fputs("C;Comment: \nC;Comment: ", fp);
16667       count++;
16668     }
16669   }
16670 
16671   /*
16672    * Output the new comment, the history lines if they exist, and
16673    * a history line noting the annotation.
16674    */
16675   putline(fp, 0, 11, newcomment, 80, "C;Comment: ", 1);
16676   fputs("\nC;Comment: \nC;Comment: ", fp);
16677 
16678   if (history != NULL) {
16679     fwrite(history, 1, line - history, fp);
16680     fputs("C;Comment: ", fp);
16681   }
16682 
16683   pos = (flag ? count - hcount + 1 : 1);
16684   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16685           get_today());
16686 
16687   /*
16688    * Output the rest of the entry.
16689    */
16690   if (status != 0) {
16691     end = entry + entrylen;
16692     fwrite(line, 1, end - line, fp);
16693   }
16694 
16695   return STATUS_OK;
16696 }
16697 
16698 
stanford_annotate(FILE * fp,char * entry,int entrylen,char * newcomment,int flag)16699 static int stanford_annotate(FILE *fp, char *entry, int entrylen,
16700                              char *newcomment, int flag)
16701 {
16702   int status, count, hcount, ncount, pos;
16703   char *s, *line, *end, *history, *comment, *comend;
16704 
16705   error_test(*entry != ';', E_PARSEERROR, return STATUS_ERROR,
16706              print_error("seqfannotate:  Entry not in IG/Stanford format.\n"));
16707 
16708   for (ncount=0,s=newcomment; *s; s++)
16709     if (*s == '\n')
16710       ncount++;
16711   if (*(s-1) != '\n')
16712     ncount++;
16713 
16714   /*
16715    * Setup the gi_getline info to start at the first comment line (which
16716    * is also the first entry line).
16717    */
16718   gi_startline(entry, entrylen);
16719   status = gi_getline(&line, &end, 0);
16720   error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16721              print_error("seqfannotate:  Premature end of entry.\n"));
16722 
16723   /*
16724    * Output the old and new comments and the history, depending on what
16725    * appears in the entry and the value of `flag'.
16726    */
16727   fputc(';', fp);
16728 
16729   history = NULL;
16730   count = hcount = 0;
16731   if (*line == ';') {
16732     comment = line+1;
16733     comend = NULL;
16734     while (*line == ';') {
16735       if (mystreq(line+1, 'S', "SEQIO") ||
16736           (history != NULL && line[1] == ' ' &&
16737            line[2] == ' ' && line[3] != ' ')) {
16738         if (history == NULL) {
16739           history = line+1;
16740           if (comend == NULL)
16741             comend = line;
16742         }
16743         hcount++;
16744       }
16745       else {
16746         history = NULL;
16747         hcount = 0;
16748         comend = (line+1 == end ? line : NULL);
16749       }
16750 
16751       status = gi_getline(&line, &end, 0);
16752       error_test(status == 0, E_PARSEERROR, return STATUS_ERROR,
16753                  print_error("seqfannotate:  Premature end of entry.\n"));
16754       count++;
16755     }
16756     if (comend == NULL)
16757       comend = line;
16758 
16759     if (flag && comment < comend) {
16760       fwrite(comment, 1, comend - comment, fp);
16761       fputs(";\n;", fp);
16762       count++;
16763     }
16764   }
16765 
16766   /*
16767    * Output the new comment, the history lines if they exist, and
16768    * a history line noting the annotation.
16769    */
16770   putline(fp, 0, 1, newcomment, 80, ";", 1);
16771   fputs("\n;\n;", fp);
16772 
16773   if (history != NULL) {
16774     fwrite(history, 1, line - history, fp);
16775     fputc(';', fp);
16776   }
16777 
16778   pos = (flag ? count - hcount + 1 : 1);
16779   fprintf(fp, "SEQIO annotation, lines %d-%d.   %s\n", pos, pos + ncount - 1,
16780           get_today());
16781 
16782   /*
16783    * Output the rest of the entry.
16784    */
16785   end = entry + entrylen;
16786   fwrite(line, 1, end - line, fp);
16787 
16788   return STATUS_OK;
16789 }
16790 
/*
 * asn_annotate
 *
 * Writes to `fp' a copy of the ASN.1 entry in which `newcomment' has been
 * added as a `comment "..."' string, followed by a second comment string
 * of the form "SEQIO annotation, lines M-N.   <date>".
 *
 * The insertion point is chosen from what asn_parse finds in the entry
 * (see the comment in the body).  When the entry already has comment
 * strings, they are scanned to count comment and history lines, and they
 * are copied to the output only when `flag' is non-zero.
 *
 * Parameters:  fp          -  the output stream
 *              entry       -  the text of the entry
 *              entrylen    -  the length of the entry text
 *              newcomment  -  the comment text to add
 *              flag        -  non-zero to keep the existing comment strings
 *
 * Returns:   STATUS_OK on success, STATUS_ERROR on a format error.
 */
static int asn_annotate(FILE *fp, char *entry, int entrylen, char *newcomment,
                        int flag)
{
  int ccflag, status, count, hcount, ncount, pos, oldpe;
  int tempc, temph, inhistory, blank;
  char qch, *s, *start, *end, *entryend, *head, *tail, *comment, *comend;
  char *destr, *deend, *str, *strend, *lastcomment, *lastend;

  entryend = entry + entrylen;

  /* The first non-blank text of the entry must be "seq ". */
  for (s=entry; s < entryend && isspace(*s); s++) ;
  error_test(s + 4 >= entryend || !mystreq(s, 'S', "SEQ "),
             E_PARSEERROR, return STATUS_ERROR,
             print_error("seqfannotate:  Entry not an ASN.1 "
                         "`Bioseq-set.seq-set.seq' record.\n"));

  /*
   * NOTE(review): only newline characters are counted here; the other
   * annotate routines in this file also count a final line that has no
   * trailing newline.  Confirm whether the difference is intended.
   */
  for (ncount=0,s=newcomment; *s; s++)
    if (*s == '\n')
      ncount++;

  /*
   * Look for the "seq.descr.comment" sub-record.  If no such record is found,
   * look for, in order, a "seq.descr" (comment goes at the end of that
   * record), "seq.id" (descr.comment goes after that record), or
   * "seq.inst" (descr.comment goes before that record).  If none of those
   * are found, put a descr.comment record at the end of the entry.
   */
  ccflag = 0;
  head = tail = start = end = NULL;
  comment = comend = NULL;

  /* pe_flag is set to PE_NONE around each asn_parse call, then restored. */
  destr = NULL;
  oldpe = pe_flag;
  pe_flag = PE_NONE;
  status = asn_parse(entry, entryend, "seq.descr", &destr, &deend, NULL);
  pe_flag = oldpe;
  error_test(status == -1, E_PARSEERROR, return STATUS_ERROR,
             print_error("seqfannotate:  Invalid format for ASN.1 entry.\n"));

  if (destr != NULL) {
    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(destr, deend, "descr.comment", &comment, &comend, NULL);
    pe_flag = oldpe;
    error_test(status == -1, E_PARSEERROR, return STATUS_ERROR,
               print_error("seqfannotate:  Invalid format for ASN.1 "
                           "entry.\n"));

    if (comment != NULL) {
      ccflag = 1;
      start = comment;
      end = deend;
    }
    else {
      /* No comment in descr:  insert just before descr's closing brace. */
      for (start=deend; start > destr && isspace(*(start-1)); start--) ;
      error_test(*(start-1) != '}', E_PARSEERROR, return STATUS_ERROR,
               print_error("seqfannotate:  Invalid format for ASN.1 "
                           "entry.\n"));
      end = --start;
      head = ",\n        comment \"";
      tail = "\" ";
    }
  }
  else {
    str = NULL;
    oldpe = pe_flag;
    pe_flag = PE_NONE;
    status = asn_parse(entry, entryend, "seq.id", &str, &strend, NULL);
    pe_flag = oldpe;
    error_test(status == -1, E_PARSEERROR, return STATUS_ERROR,
               print_error("seqfannotate:  Invalid format for ASN.1 "
                           "entry.\n"));

    if (str != NULL) {
      start = end = strend;
      head = ",\n      descr {\n        comment \"";
      tail = "\" } ";
    }
    else {
      oldpe = pe_flag;
      pe_flag = PE_NONE;
      status = asn_parse(entry, entryend, "seq.inst", &str, &strend, NULL);
      pe_flag = oldpe;
      error_test(status == -1, E_PARSEERROR, return STATUS_ERROR,
                 print_error("seqfannotate:  Invalid format for ASN.1 "
                             "entry.\n"));

      if (str != NULL) {
        start = end = str;
        head = "descr {\n        comment \"";
        tail = "\" } ,\n      ";
      }
      else {
        /* Nothing found:  insert just before the entry's closing brace. */
        for (start=entryend; start > entry && isspace(*(start-1)); start--) ;
        error_test(*(start-1) != '}', E_PARSEERROR, return STATUS_ERROR,
                   print_error("seqfannotate:  Invalid format for ASN.1 "
                               "entry.\n"));
        end = --start;
        head = ",\n      descr {\n        comment \"";
        tail = "\" } ";
      }
    }
  }

  /* Copy everything that precedes the insertion point. */
  fwrite(entry, 1, start - entry, fp);

  count = hcount = 0;
  if (!ccflag) {
    fputs(head, fp);
    /* NOTE(review): this value of pos is overwritten before it is used. */
    pos = 17;
  }
  else {
    /*
     * Walk the existing comment strings.  Each pass of the loop handles
     * one `comment "..."' item, with `s' at the start of the item.
     */
    lastcomment = s = start;
    while (1) {
      /* Step over the 7 characters of the item's keyword. */
      for (s+=7; s < end && isspace(*s); s++) ;
      error_test(s == end || (*s != '\'' && *s != '"'),
                 E_PARSEERROR, return STATUS_ERROR,
                 print_error("seqfannotate:  Invalid format for ASN.1 "
                             "entry.\n"));

      /*
       * tempc and temph count the lines and the history lines of this
       * string; `blank' is set when the last line seen was " " only.
       */
      tempc = temph = inhistory = blank = 0;
      inhistory = mystreq(s+1, 'S', "SEQIO");
      if (inhistory)
        temph = 1;
      else
        tempc = 1;

      /*
       * Scan to the closing quote character.
       * NOTE(review): the loop also stops when the previous character is
       * a backslash; confirm that this is the intended escape handling.
       */
      qch = *s;
      for (s++; s < end && *s != qch && *(s-1) != '\\'; s++) {
        if (*s == '\n') {
          if (mystreq(s+1, ' ', " SEQIO") ||
              (inhistory && s[1] == ' ' && s[2] == ' ' &&
               s[3] == ' ' && s[4] != ' ')) {
            inhistory = 1;
            temph++;
          }
          else if (s[1] == ' ' && s[2] == '\n') {
            inhistory = temph = 0;
            blank = 1;
          }
          else
            blank = inhistory = temph = 0;

          tempc++;
        }
      }
      error_test(s == end, E_PARSEERROR, return STATUS_ERROR,
                 print_error("seqfannotate:  Invalid format for ASN.1 "
                             "entry.\n"));

      hcount += temph;
      count += (tempc - temph - blank);
      lastend = s++;

      while (s < end && isspace(*s)) s++;
      error_test(s == end || (*s != ',' && *s != '}'),
                 E_PARSEERROR, return STATUS_ERROR,
                 print_error("seqfannotate:  Invalid format for ASN.1 "
                             "entry.\n"));

      if (*s != ',') {
        /* This was the last item of the record (a '}' follows). */
        if (flag) {
          fwrite(lastcomment, 1, s - lastcomment, fp);
          fputs(",\n        ", fp);
        }
        end = s;
        fputs("comment \"", fp);
        tail = "\" ";
        break;
      }

      for (s++; s < end && isspace(*s); s++) ;
      error_test(s == end, E_PARSEERROR, return STATUS_ERROR,
                 print_error("seqfannotate:  Invalid format for ASN.1 "
                             "entry.\n"));

      if (!mystreq(s, 'C', "COMMENT ")) {
        /*
         * The next item is not a comment.  Look for a later comment in
         * the rest of the record; if there is none, insert here.
         */
        comment = NULL;
        oldpe = pe_flag;
        pe_flag = PE_NONE;
        status = asn_parse(s, end, "comment", &comment, &deend, NULL);
        pe_flag = oldpe;
        error_test(status == -1, E_PARSEERROR, return STATUS_ERROR,
                   print_error("seqfannotate:  Invalid format for ASN.1 "
                               "entry.\n"));

        if (comment != NULL) {
          if (flag)
            fwrite(lastcomment, 1, s - lastcomment, fp);
          fwrite(s, 1, comment - s, fp);
          lastcomment = s;
        }
        else {
          if (flag)
            fwrite(lastcomment, 1, s - lastcomment, fp);
          end = s;
          fputs("comment \"", fp);
          tail = "\" ,\n        ";
          break;
        }
      }
    }
  }

  /*
   * Output the new comment string, then a second comment string holding
   * the annotation history line, closed by `tail'.
   */
  putline(fp, 0, 17, newcomment, 78, " ", 1);
  fputs("\" ,\n        comment \"", fp);

  pos = (flag ? count - hcount + 1 : 1);
  fprintf(fp, "SEQIO annotation, lines %d-%d.   %s", pos, pos + ncount - 1,
          get_today());
  fputs(tail, fp);

  /*
   * Output the rest of the entry.
   */
  fwrite(end, 1, entryend - end, fp);

  return STATUS_OK;
}
17010 
17011 
17012 
17013 
17014 
17015 
17016 /*
17017  *
17018  *
17019  * Section dealing with Database specification and database file locations.
17020  *
17021  *
17022  */
17023 
17024 
/*
 * STRING
 *
 * A growable text buffer built up by addstring.  `string' holds `count'
 * bytes of newline-terminated lines in a buffer of `size' bytes.
 * `lastflag' records whether the most recently added string contained
 * an '@', and `error' is set once an allocation has failed.
 */
typedef struct {
  char *string;
  int count, size, lastflag, error;
} STRING;


/*
 * addstring
 *
 * This procedure adds the string "s", followed by a newline, onto the
 * end of the dynamically allocated string in the STRING structure,
 * automatically growing that string if necessary.  This just simplifies
 * the database initialization code below.
 *
 * The stored text is not NUL-terminated by this procedure; its length
 * is given by the `count' field.
 *
 * If "s" contains an '@', the previously added string did too, and the
 * text before the first '@' is the same in both, then no new line is
 * started.  Instead the text after the '@' in "s" is appended to the
 * previous line, separated by a comma (so adding "f@x" and then "f@y"
 * gives the single line "f@x,y").  A previous line whose first character
 * is the '@' is never merged this way.
 *
 * If memory cannot be allocated, the `error' field is set and nothing is
 * added.  When growing an existing buffer fails, the existing buffer and
 * its contents are left intact so that the caller can still free them.
 * Once `error' is set, further calls do nothing.
 *
 * Parameters:  str  -  a STRING structure (where the string in it can be NULL)
 *              s    -  a character string
 *
 * Returns:   nothing
 */

static void addstring(STRING *str, char *s)
{
  int len, newlen, newsize, prefixlen, lastflag;
  char *t, *at, *prevat, *newbuf;

  if (str->error)
    return;

  /*
   * Make sure there is room for the string and its newline.
   */
  len = strlen(s);
  if (str->count + len + 2 >= str->size) {
    if (str->size == 0) {
      newsize = 256 + len + 2;
      if ((newbuf = (char *) malloc(newsize)) == NULL) {
        str->string = NULL;
        str->error = 1;
        return;
      }
    }
    else {
      /*
       * The result of realloc goes into a temporary so that the old
       * buffer is not lost (and leaked) when the call fails.
       */
      newsize = 2 * str->size + len + 2;
      if ((newbuf = (char *) realloc(str->string, newsize)) == NULL) {
        str->error = 1;
        return;
      }
    }
    str->string = newbuf;
    str->size = newsize;
  }

  /*
   * Note whether this string contains an '@', remembering whether
   * the previous one did.
   */
  lastflag = str->lastflag;
  at = strchr(s, '@');
  str->lastflag = (at != NULL);

  if (lastflag && at != NULL) {
    /*
     * Scan backwards over the previous line (starting at its newline),
     * leaving `t' at the start of the line and `prevat' at the first
     * '@' found after the line's first character.
     */
    prefixlen = at - s;
    prevat = NULL;
    for (t=str->string+str->count-1; t > str->string && *(t-1) != '\n'; t--)
      if (*t == '@')
        prevat = t;

    if (prevat != NULL && prevat - t == prefixlen &&
        strncmp(s, t, prefixlen) == 0) {
      /*
       * Same prefix:  turn the previous newline into a comma and
       * append only the text after the '@'.
       */
      str->string[str->count-1] = ',';
      newlen = len - prefixlen - 1;
      memcpy(str->string + str->count, at + 1, newlen);
      str->string[str->count+newlen] = '\n';
      str->count += newlen + 1;
      return;
    }
  }

  memcpy(str->string + str->count, s, len);
  str->string[str->count+len] = '\n';
  str->count += len + 1;
}
17100 
17101 
/*
 * One database entry read from a BIOSEQ file.  The character pointers
 * point into the text of the file as read (and edited in place) by
 * int_bioseq_read:
 *
 *    names      -  the text just after the '>' of the entry's header line
 *    dir        -  the text after the ':' on the header line, or NULL
 *    fields     -  the first '>' information-field line, or NULL
 *    fieldsend  -  the position just past the information-field lines
 *                  (set only when `fields' is non-NULL)
 *    files      -  the start of the entry's file list, or NULL when the
 *                  list contains no file names
 *    linenum    -  the line number of the entry's header line
 *    next       -  the next node in the list
 */
typedef struct bseq_node {
  char *names, *dir, *fields, *fieldsend, *files;
  int linenum;
  struct bseq_node *next;
} BSEQ_NODE, *BIOSEQ_LIST;

/* The list of database entries; int_bioseq_read adds entries at its head. */
BIOSEQ_LIST bseq_dblist = NULL;
/* Set to 1 once the BIOSEQ environment variable has been consulted. */
int got_bioseq = 0;
17110 
/*
 * Prototypes for the file-local procedures of this section.
 */
static int int_bioseq_read(char *filename);
static BIOSEQ_LIST dbname_match(BIOSEQ_LIST, char *, int *);
static int sufalias_match(BIOSEQ_LIST, char *, STRING *);
static int file_match(BIOSEQ_LIST, char *, char *, STRING *, int);
static int match_string(char *, char *, char *, char *);
static int complex_match(char *, char *, char *, char *);
static int match_and_expand(BIOSEQ_LIST, char *, char *, char *, char *,
                            STRING *);
static int expand(char *, int, char *, char *, char *, char *, int,
                  STRING *, BIOSEQ_LIST);
static int ident_lookup(char *indexfile, int newfile, char *id,
                        char *idend, STRING *string);
static void ident_close(void);
static int findline(FILE *fp, int low, int high, char *pattern, int patsize);
static int findline2(char *text, int low, int high, char *pattern,
                     int patsize);
17127 
17128 
bioseq_read(char * filelist)17129 int bioseq_read(char *filelist)
17130 {
17131   int status;
17132   char *s, *end, buffer[FILENAME_MAX+1];
17133 
17134   if (!ctype_initflag)
17135     init_ctype();
17136 
17137   param_error(filelist == NULL, return -1, "bioseq_read", "arg 1 is NULL");
17138   param_error(filelist[0] == '\0', return -1, "bioseq_read",
17139               "arg 1 is an empty string");
17140 
17141   if (!got_bioseq) {
17142     got_bioseq = 1;
17143     if ((s = getenv("BIOSEQ")) != NULL && (status = bioseq_read(s)) < 0)
17144       return status;
17145   }
17146 
17147   for (end=filelist; *end; end++) ;
17148   while (end > filelist) {
17149     for (s=end-1; s >= filelist && *s != ','; s--) ;
17150     error_test(s == end - 1, E_PARSEERROR, return -1,
17151                print_error("%s:  Invalid format of BIOSEQ file list.\n",
17152                            filelist));
17153 
17154     memcpy(buffer, s + 1, end - s - 1);
17155     buffer[end-s-1] = '\0';
17156     if ((status = int_bioseq_read(buffer)) < 0)
17157       return status;
17158 
17159     end = s;
17160   }
17161 
17162   return 0;
17163 }
17164 
17165 
int_bioseq_read(char * filename)17166 static int int_bioseq_read(char *filename)
17167 {
17168   int flag, linenum, count, inalias, errflag;
17169   char *s, *t, *text, *dir, *file, *lastfield;
17170   BIOSEQ_LIST newlist, newlnode, node, next;
17171 
17172   /*
17173    * Read in the file.
17174    */
17175   text = read_small_file(get_truename(filename, NULL));
17176   error_test(text == NULL, E_READFAILED, return -1,
17177              print_error("%s:  Cannot read BIOSEQ file.\n", filename));
17178 
17179   linenum = 1;
17180   newlist = NULL;
17181   s = text;
17182   errflag = 0;
17183 
17184   /*
17185    * Look for the first header line.
17186    */
17187   while (*s != '>') {
17188     flag = 0;
17189     for ( ; *s && *s != '\n'; s++) {
17190       if (*s == '#')
17191         flag = 1;
17192       else if (!flag && !isspace(*s) && !errflag) {
17193         print_error("%s, line %d:  Other text appears before first BIOSEQ "
17194                     "entry.\n", filename, linenum);
17195         errflag++;
17196       }
17197     }
17198     if (*s) s++;
17199     linenum++;
17200   }
17201 
17202   /*
17203    * The main loop.
17204    */
17205   while (*s) {
17206     /*
17207      * Allocate a new node.
17208      */
17209     newlnode = (BSEQ_NODE *) malloc(sizeof(BSEQ_NODE));
17210     if (newlnode == NULL) {
17211       seqferrno = E_NOMEMORY;
17212       print_error("Memory Error:  Ran out of memory.\n", filename);
17213       errflag++;
17214       goto BSEQ_LOOP_END;
17215     }
17216 
17217     newlnode->names = s+1;
17218     newlnode->dir = newlnode->fields = NULL;
17219     newlnode->fieldsend = newlnode->files = NULL;
17220     newlnode->linenum = linenum;
17221     newlnode->next = newlist;
17222     newlist = newlnode;
17223 
17224     /*
17225      * Skip to the end of the header line, checking for a directory.
17226      * NULL-terminate the name list and the directory (if it's
17227      * there).
17228      */
17229     dir = NULL;
17230     flag = 0;
17231     for ( ; *s && *s != '\n'; s++) {
17232       if (*s == ':' && dir == NULL) {
17233         *s = '\0';
17234         dir = s+1;
17235       }
17236 
17237       if (!flag && dir == NULL && !isspace(*s) && *s != ',')
17238         flag = 1;
17239     }
17240     if (!*s) {
17241       print_error("%s, line %d:  Premature end of BIOSEQ file.\n",
17242                   filename, linenum);
17243       errflag++;
17244       goto BSEQ_LOOP_END;
17245     }
17246     if (!flag) {
17247       print_error("%s, line %d:  No entry name on first line of "
17248                   "BIOSEQ entry.\n", filename, linenum);
17249       errflag++;
17250       if (errflag == 20) {
17251         print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17252         goto BSEQ_LOOP_END;
17253       }
17254     }
17255 
17256     *s = '\0';
17257 
17258     /*
17259      * NULL-terminate the directory and check that nothing else appears
17260      * on the line.
17261      */
17262     if (dir != NULL) {
17263       for ( ; dir < s && isspace(*dir); dir++) ;
17264       if (dir == s) {
17265         print_error("%s, line %d:  Invalid first line format of "
17266                     "BIOSEQ entry.\n", filename, linenum);
17267         errflag++;
17268         if (errflag == 20) {
17269           print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17270           goto BSEQ_LOOP_END;
17271         }
17272       }
17273 
17274       newlnode->dir = dir;
17275 
17276       for ( ; dir < s && !isspace(*dir); dir++) ;
17277       if (*(dir-1) == dirch)
17278         *(dir-1) = '\0';
17279 
17280       if (dir < s) {
17281         *dir = '\0';
17282         for (dir++; dir < s; dir++) {
17283           if (!isspace(*dir)) {
17284             print_error("%s, line %d:  Extra text appears after root "
17285                         "directory in BIOSEQ entry.\n", filename, linenum);
17286             errflag++;
17287             if (errflag == 20) {
17288               print_error("%s, line %d:  Too many errors.\n", filename,
17289                           linenum);
17290               goto BSEQ_LOOP_END;
17291             }
17292             break;
17293           }
17294         }
17295       }
17296     }
17297 
17298     /*
17299      * Skip through the field lines, NULL-terminating each line and
17300      * looking for the beginning of the file list.
17301      */
17302     s++;
17303     linenum++;
17304 
17305     newlnode->fields = (*s == '>' ? s : NULL);
17306     lastfield = NULL;
17307     for ( ; *s == '>'; s++,linenum++) {
17308       if (!isspace(s[1])) {
17309         for (s++; *s && !isspace(*s) && *s != ':'; s++) ;
17310         if (*s && *s != ':') {
17311           print_error("%s, line %d:  Invalid format for information field.\n",
17312                       filename, linenum);
17313           errflag++;
17314           if (errflag == 20) {
17315             print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17316             goto BSEQ_LOOP_END;
17317           }
17318         }
17319 
17320         while (*s && *s != '\n') s++;
17321 
17322         if (!*s) {
17323           print_error("%s, line %d:  Premature end of BIOSEQ entry.\n",
17324                       filename, linenum);
17325           errflag++;
17326           goto BSEQ_LOOP_END;
17327         }
17328         *s = '\0';
17329 
17330         for (t=s; isspace(*(t-1)); t--)
17331           *(t-1) = '\0';
17332         lastfield = t;
17333       }
17334       else {
17335         if (lastfield == NULL) {
17336           print_error("%s, line %d:  No information fieldname given.\n",
17337                       filename, linenum);
17338           errflag++;
17339           if (errflag == 20) {
17340             print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17341             goto BSEQ_LOOP_END;
17342           }
17343 
17344           while (*s && *s != '\n') s++;
17345         }
17346         else {
17347           for (s++; *s && isspace(*s) && *s != '\n'; s++)
17348             *(s-1) = '\0';
17349 
17350           *lastfield++ = ' ';
17351           while (*s && *s != '\n') {
17352             *lastfield++ = *s;
17353             *s++ = '\0';
17354           }
17355           *lastfield = '\0';
17356 
17357           if (!*s) {
17358             print_error("%s, line %d:  Premature end of BIOSEQ entry.\n",
17359                         filename, linenum);
17360             errflag++;
17361             goto BSEQ_LOOP_END;
17362           }
17363           *s = '\0';
17364 
17365           for (t=lastfield; isspace(*(t-1)); t--)
17366             *(t-1) = '\0';
17367           lastfield = t;
17368         }
17369       }
17370     }
17371 
17372     if (newlnode->fields)
17373       newlnode->fieldsend = s;
17374 
17375     /*
17376      * Scan the files lines, performing minimal syntax checking and
17377      * looking either for the end of the file or a line beginning
17378      * with '>'.  Don't NULL-terminate lines here until the very end.
17379      */
17380     newlnode->files = s;
17381     count = 0;
17382     inalias = 0;
17383     flag = 0;
17384     while (*s) {
17385       while (*s && (*s == ' ' || *s == '\t' || *s == ',')) s++;
17386       if (!*s)
17387         break;
17388 
17389       if (*s == '\n') {
17390         if (!s[1] || s[1] == '>')
17391           break;
17392         else {
17393           s++;
17394           linenum++;
17395           continue;
17396         }
17397       }
17398 
17399       if (*s == '#') {
17400         while (*s && *s != '\n') s++;
17401         continue;
17402       }
17403 
17404       file = s;
17405       while (*s && !isspace(*s) && *s != ',' && *s != '#' && *s != '(' &&
17406              *s != ')')
17407         s++;
17408 
17409       if (file != s)
17410         flag = 1;
17411 
17412       if (file != s && *s != '(' && *(s-1) == dirch) {
17413         print_error("%s, line %d:  Filenames in BIOSEQ entries cannot end "
17414                     "with `%c'.\n", filename, linenum, dirch);
17415         errflag++;
17416         if (errflag == 20) {
17417           print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17418           goto BSEQ_LOOP_END;
17419         }
17420       }
17421 
17422       if (!*s || isspace(*s) || *s == ',')
17423         continue;
17424 
17425       switch (*s) {
17426       case '#':
17427         while (*s && *s != '\n') s++;
17428         break;
17429 
17430       case '(':
17431         if (inalias) {
17432           print_error("%s, line %d:  Parentheses not allowed "
17433                       "inside alias definition.\n", filename, linenum);
17434           errflag++;
17435           if (errflag == 20) {
17436             print_error("%s, line %d:  Too many errors.\n", filename,
17437                         linenum);
17438             goto BSEQ_LOOP_END;
17439           }
17440         }
17441         else if (*(s-1) == ':') {
17442           if (count != 0) {
17443             print_error("%s, line %d:  `:(' may not appear inside "
17444                         "other parentheses.\n", filename, linenum);
17445             errflag++;
17446             if (errflag == 20) {
17447               print_error("%s, line %d:  Too many errors.\n", filename,
17448                           linenum);
17449               goto BSEQ_LOOP_END;
17450             }
17451           }
17452           else {
17453             inalias = 1;
17454             for (t=file; t < s-1; t++) {
17455               if (*t == '*' || *t == '?' || *t == dirch ||
17456                   (*t == '~' && t != file)) {
17457                 print_error("%s, line %d:  `%c' not permitted in alias "
17458                             "name.\n", filename, linenum, *t);
17459                 errflag++;
17460                 if (errflag == 20) {
17461                   print_error("%s, line %d:  Too many errors.\n",
17462                               filename, linenum);
17463                   goto BSEQ_LOOP_END;
17464                 }
17465                 break;
17466               }
17467             }
17468           }
17469         }
17470         else if (*(s-1) != dirch) {
17471           print_error("%s, line %d:  Parse error at `%c('.\n",
17472                       filename, linenum, *(s-1));
17473           errflag++;
17474           if (errflag == 20) {
17475             print_error("%s, line %d:  Too many errors.\n", filename,
17476                         linenum);
17477             goto BSEQ_LOOP_END;
17478           }
17479         }
17480         count++;
17481         s++;
17482         break;
17483 
17484       case ')':
17485         if (count == 0) {
17486           print_error("%s, line %d:  Unmatched `)'.\n", filename, linenum);
17487           errflag++;
17488           if (errflag == 20) {
17489             print_error("%s, line %d:  Too many errors.\n", filename,
17490                         linenum);
17491             goto BSEQ_LOOP_END;
17492           }
17493         }
17494         else
17495           count--;
17496         inalias = 0;
17497         s++;
17498         break;
17499       }
17500     }
17501 
17502     if (inalias) {
17503       print_error("%s, line %d:  End of BIOSEQ entry reached "
17504                   "inside alias definition.\n", filename, linenum);
17505       errflag++;
17506       if (errflag == 20) {
17507         print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17508         goto BSEQ_LOOP_END;
17509       }
17510     }
17511     else if (count != 0) {
17512       print_error("%s, line %d:  End of BIOSEQ entry reached "
17513                   "with unmatched '('.\n", filename, linenum);
17514       errflag++;
17515       if (errflag == 20) {
17516         print_error("%s, line %d:  Too many errors.\n", filename, linenum);
17517         goto BSEQ_LOOP_END;
17518       }
17519     }
17520 
17521     if (!flag)
17522       newlnode->files = NULL;
17523 
17524     if (*s) {
17525       *s = '\0';
17526       s++;
17527     }
17528   }
17529 BSEQ_LOOP_END:
17530 
17531   /*
17532    * If a parse error occurred while checking the file, free all memory
17533    * allocated and return an error value.
17534    */
17535   if (errflag > 0) {
17536     while (newlist != NULL) {
17537       newlnode = newlist->next;
17538       free(newlist);
17539       newlist = newlnode;
17540     }
17541 
17542     if (seqferrno == E_NOERROR)
17543       seqferrno = E_PARSEERROR;
17544     return -1;
17545   }
17546 
17547   /*
17548    * Now, add the entries to the list of databases.
17549    *
17550    * A separate pass is used here (instead of combining this with the
17551    * above loop) so that if errors occur in the file, the original
17552    * database list is not affected.
17553    */
17554   for (node=newlist; node != NULL; node=next) {
17555     next = node->next;
17556     node->next = bseq_dblist;
17557     bseq_dblist = node;
17558   }
17559 
17560   return 0;
17561 }
17562 
17563 
bioseq_check(char * dbspec)17564 int bioseq_check(char *dbspec)
17565 {
17566   int i, pos, len, namelen;
17567   char *s, *dbname, buffer[8];
17568   BIOSEQ_LIST node;
17569 
17570   if (!ctype_initflag)
17571     init_ctype();
17572 
17573   if (dbspec == NULL || dbspec[0] == '\0')
17574     return 0;
17575 
17576   if (!got_bioseq) {
17577     got_bioseq = 1;
17578     if ((s = getenv("BIOSEQ")) != NULL && bioseq_read(s) < 0)
17579       return 0;
17580   }
17581 
17582   if (dbname_match(bseq_dblist, dbspec, NULL) != NULL)
17583     return 1;
17584 
17585   pos = -1;
17586   for (s=dbspec,len=0; len < 6 && *s; s++,len++)
17587     if (pos == -1 && *s == ':')
17588       pos = len;
17589 
17590   if (pos >= 2 && pos <= 4) {
17591     memcpy(buffer, dbspec, pos);
17592     buffer[pos] = '\0';
17593 
17594     for (i=0; i < idpref_table_size; i++)
17595       if (mycasecmp(buffer, idpref_table[i].idprefix) == 0)
17596         break;
17597 
17598     if (i < idpref_table_size) {
17599       node = bseq_dblist;
17600       namelen = strlen(idpref_table[i].dbname);
17601       while ((node = dbname_match(node,
17602                                   idpref_table[i].dbname, &len)) != NULL) {
17603         if (len == namelen)
17604           return 1;
17605         node = node->next;
17606       }
17607       return 0;
17608     }
17609 
17610     if ((dbname = bioseq_matchinfo("IdPrefix", buffer)) != NULL) {
17611       free(dbname);
17612       return 1;
17613     }
17614   }
17615 
17616   return 0;
17617 }
17618 
17619 
bioseq_info(char * dbspec,char * fieldname)17620 char *bioseq_info(char *dbspec, char *fieldname)
17621 {
17622   int i, flag, isroot, pos, len, copyflag, namelen;
17623   char *s, *t, *temp, *dbname, buffer[8];
17624   BIOSEQ_LIST node;
17625 
17626   if (!ctype_initflag)
17627     init_ctype();
17628 
17629   param_error(dbspec == NULL, return NULL, "bioseq_info", "arg 1 is NULL");
17630   param_error(dbspec[0] == '\0', return NULL, "bioseq_info",
17631               "arg 1 is an empty string");
17632   param_error(fieldname == NULL, return NULL, "bioseq_info", "arg 2 is NULL");
17633   param_error(fieldname[0] == '\0', return NULL, "bioseq_info",
17634               "arg 2 is an empty string");
17635 
17636   if (!got_bioseq) {
17637     got_bioseq = 1;
17638     if ((s = getenv("BIOSEQ")) != NULL && bioseq_read(s) < 0)
17639       return NULL;
17640   }
17641 
17642   error_test(bseq_dblist == NULL, E_DBFILEERROR, return NULL,
17643              print_error("%s:  No databases are known.  (Env. variable BIOSEQ "
17644                          "may not be set.)\n", dbspec));
17645 
17646   isroot = (mycasecmp(fieldname, "Root") == 0);
17647 
17648   /*
17649    * Find the BIOSEQ entry by trying to match to a database name.
17650    */
17651   flag = 0;
17652   node = bseq_dblist;
17653   while ((node = dbname_match(node, dbspec, NULL)) != NULL) {
17654     if (isroot) {
17655       if (node->dir != NULL) {
17656         temp = mystrdup(node->dir);
17657         memory_error(temp == NULL, return NULL);
17658         return temp;
17659       }
17660     }
17661     else {
17662       /*
17663        * Check the fields of the matching entry for a match to the parameter.
17664        */
17665       if (node->fields != NULL) {
17666         for (s=node->fields; s < node->fieldsend; ) {
17667           for (t=fieldname,s++; *t && *s != ':'; t++,s++)
17668             if (toupper(*t) != toupper(*s))
17669               break;
17670 
17671           if (!*t && *s == ':') {
17672             for (s++; *s && isspace(*s); s++) ;
17673             temp = mystrdup(s);
17674             memory_error(temp == NULL, return NULL);
17675             return temp;
17676           }
17677 
17678           while (*s) s++;
17679           while (!*s) s++;
17680         }
17681       }
17682     }
17683 
17684     flag = 1;
17685     node = node->next;
17686   }
17687   if (flag)
17688     return NULL;
17689 
17690   /*
17691    * Next, try to match to an identifier prefix.
17692    */
17693   pos = -1;
17694   for (s=dbspec,len=0; len < 6 && *s; s++,len++)
17695     if (pos == -1 && *s == ':')
17696       pos = len;
17697 
17698   if (pos >= 2 && pos <= 4) {
17699     memcpy(buffer, dbspec, pos);
17700     buffer[pos] = '\0';
17701 
17702     for (i=0; i < idpref_table_size; i++)
17703       if (mycasecmp(buffer, idpref_table[i].idprefix) == 0)
17704         break;
17705 
17706     if (i < idpref_table_size) {
17707       dbname = idpref_table[i].dbname;
17708       copyflag = 0;
17709     }
17710     else {
17711       dbname = bioseq_matchinfo("IdPrefix", buffer);
17712       if (dbname == NULL)
17713         return NULL;
17714       copyflag = 1;
17715     }
17716 
17717     /*
17718      * Now, try to search for the information field using this database name.
17719      */
17720     node = bseq_dblist;
17721     namelen = strlen(dbname);
17722     while ((node = dbname_match(node, dbname, &len)) != NULL) {
17723       if (len == namelen) {
17724         if (isroot) {
17725           if (copyflag)
17726             free(dbname);
17727 
17728           if (node->dir == NULL)
17729             return NULL;
17730           else {
17731             temp = mystrdup(node->dir);
17732             memory_error(temp == NULL, return NULL);
17733             return temp;
17734           }
17735         }
17736 
17737         /*
17738          * Check the fields of the matching entry for a match to the parameter.
17739          */
17740         if (node->fields != NULL) {
17741           for (s=node->fields; s < node->fieldsend; ) {
17742             for (t=fieldname,s++; *t && *s != ':'; t++,s++)
17743               if (toupper(*t) != toupper(*s))
17744                 break;
17745 
17746             if (!*t && *s == ':') {
17747               if (copyflag)
17748                 free(dbname);
17749               for (s++; *s && isspace(*s); s++) ;
17750               temp = mystrdup(s);
17751               memory_error(temp == NULL, return NULL);
17752               return temp;
17753             }
17754 
17755             while (*s) s++;
17756             while (!*s) s++;
17757           }
17758         }
17759       }
17760 
17761       node = node->next;
17762       flag = 1;
17763     }
17764 
17765     if (copyflag)
17766       free(dbname);
17767   }
17768 
17769   error_test(!flag, E_DBPARSEERROR, return NULL,
17770              print_error("%s:  No BIOSEQ entry matches database "
17771                          "specification.\n", dbspec));
17772 
17773   return NULL;
17774 }
17775 
17776 
bioseq_matchinfo(char * fieldname,char * fieldvalue)17777 char *bioseq_matchinfo(char *fieldname, char *fieldvalue)
17778 {
17779   int isroot;
17780   char *s, *t, *temp;
17781   BIOSEQ_LIST node;
17782 
17783   if (!ctype_initflag)
17784     init_ctype();
17785 
17786   param_error(fieldname == NULL, return NULL, "bioseq_matchinfo",
17787               "arg 1 is NULL");
17788   param_error(fieldname[0] == '\0', return NULL, "bioseq_matchinfo",
17789               "arg 1 is an empty string");
17790   param_error(fieldvalue == NULL, return NULL, "bioseq_matchinfo",
17791               "arg 2 is NULL");
17792   param_error(fieldvalue[0] == '\0', return NULL, "bioseq_matchinfo",
17793               "arg 2 is an empty string");
17794 
17795   if (!got_bioseq) {
17796     got_bioseq = 1;
17797     if ((s = getenv("BIOSEQ")) != NULL && bioseq_read(s) < 0)
17798       return NULL;
17799   }
17800 
17801   error_test(bseq_dblist == NULL, E_DBFILEERROR, return NULL,
17802              print_error("Error:  No databases are known.  (Env. variable "
17803                          "BIOSEQ may not be set.)\n"));
17804 
17805   isroot = (mycasecmp(fieldname, "Root") == 0);
17806 
17807   for (node=bseq_dblist; node != NULL; node=node->next) {
17808     if (isroot && node->dir && mycasecmp(node->dir, fieldvalue) == 0) {
17809       for (s=node->names; *s && (isspace(*s) || *s == ','); s++) ;
17810       for (t=s; *s && !isspace(*s) && *s != ','; s++) ;
17811       temp = mystrdup2(t, s);
17812       memory_error(temp == NULL, return NULL);
17813       return temp;
17814     }
17815 
17816     if (isroot || node->fields == NULL)
17817       continue;
17818 
17819     for (s=node->fields; s < node->fieldsend; ) {
17820       for (t=fieldname,s++; *t && *s != ':'; t++,s++)
17821         if (toupper(*t) != toupper(*s))
17822           break;
17823 
17824       if (!*t && *s == ':') {
17825         for (s++; *s && isspace(*s); s++) ;
17826         if (mycasecmp(s, fieldvalue) == 0) {
17827           for (s=node->names; *s && (isspace(*s) || *s == ','); s++) ;
17828           for (t=s; *s && !isspace(*s) && *s != ','; s++) ;
17829           temp = mystrdup2(t, s);
17830           memory_error(temp == NULL, return NULL);
17831           return temp;
17832         }
17833       }
17834 
17835       while (*s) s++;
17836       while (!*s) s++;
17837     }
17838   }
17839   return NULL;
17840 }
17841 
17842 
bioseq_parse(char * dbspec)17843 char *bioseq_parse(char *dbspec)
17844 {
17845   int i, len, namelen, matchlen, idpreflen, dbspeclen;
17846   int status, flag, lookupflag, copyflag;
17847   char *s, *t, *t2, *fieldname, *dbname, indexpath[FILENAME_MAX];
17848   char buffer[FILENAME_MAX];
17849   BIOSEQ_LIST node, filenode;
17850   STRING string = { NULL, 0, 0, 0, 0 };
17851 
17852   if (!ctype_initflag)
17853     init_ctype();
17854 
17855   param_error(dbspec == NULL, return NULL, "bioseq_parse",
17856               "arg 1 is NULL");
17857   param_error(dbspec[0] == '\0', return NULL, "bioseq_parse",
17858               "arg 1 is an empty string");
17859 
17860   if (!got_bioseq) {
17861     got_bioseq = 1;
17862     if ((s = getenv("BIOSEQ")) != NULL && bioseq_read(s) < 0)
17863       return NULL;
17864   }
17865 
17866   error_test(bseq_dblist == NULL, E_DBFILEERROR, return NULL,
17867              print_error("%s:  No databases are known.  (Env. variable BIOSEQ "
17868                          "may not be set.)\n", dbspec));
17869 
17870   /*
17871    * Find the node which contains the list of files, and find the node
17872    * which contains an index.
17873    */
17874   dbspeclen = strlen(dbspec);
17875   fieldname = "Index";
17876   indexpath[0] = '\0';
17877   filenode = NULL;
17878   matchlen = 0;
17879 
17880   flag = 0;
17881   node = bseq_dblist;
17882   while ((filenode == NULL || indexpath[0] == '\0') &&
17883          (node = dbname_match(node, dbspec, &len)) != NULL) {
17884     if (!filenode && node->files) {
17885       filenode = node;
17886       matchlen = len;
17887     }
17888 
17889     if (len < dbspeclen && dbspec[len] == ':' &&
17890         !indexpath[0] && node->fields != NULL) {
17891       for (s=node->fields; s < node->fieldsend; ) {
17892         for (t=fieldname,s++; *t && *s != ':'; t++,s++)
17893           if (toupper(*t) != toupper(*s))
17894             break;
17895 
17896         if (!*t && *s == ':') {
17897           for (s++; *s && isspace(*s); s++) ;
17898           if (is_absolute(s) || node->dir == NULL)
17899             strcpy(indexpath, s);
17900           else {
17901             strcpy(indexpath, node->dir);
17902             for (t2=indexpath; *t2; t2++) ;
17903             *t2++ = dirch;
17904             strcpy(t2, s);
17905           }
17906           break;
17907         }
17908 
17909         while (*s) s++;
17910         while (!*s) s++;
17911       }
17912     }
17913 
17914     flag = 1;
17915     node = node->next;
17916   }
17917 
17918   /*
17919    * If that failed, try to match to an identifier prefix.
17920    */
17921   if (!flag) {
17922     idpreflen = -1;
17923     for (s=dbspec,len=0; len < 6 && *s; s++,len++)
17924       if (idpreflen == -1 && *s == ':')
17925         idpreflen = len;
17926 
17927     if (idpreflen >= 2 && idpreflen <= 4) {
17928       memcpy(buffer, dbspec, idpreflen);
17929       buffer[idpreflen] = '\0';
17930 
17931       for (i=0; i < idpref_table_size; i++)
17932         if (mycasecmp(buffer, idpref_table[i].idprefix) == 0)
17933           break;
17934 
17935       if (i < idpref_table_size) {
17936         dbname = idpref_table[i].dbname;
17937         copyflag = 0;
17938       }
17939       else {
17940         dbname = bioseq_matchinfo("IdPrefix", buffer);
17941         if (dbname == NULL)
17942           return NULL;
17943         copyflag = 1;
17944       }
17945 
17946       /*
17947        * Now, try to search for the information field using this database name.
17948        */
17949       node = bseq_dblist;
17950       namelen = strlen(dbname);
17951       while ((filenode == NULL || indexpath[0] == '\0') &&
17952              (node = dbname_match(node, dbname, &len)) != NULL) {
17953         if (len == namelen) {
17954           if (!filenode && node->files) {
17955             filenode = node;
17956             matchlen = idpreflen;
17957           }
17958 
17959           if (!indexpath[0] && node->fields != NULL) {
17960             for (s=node->fields; s < node->fieldsend; ) {
17961               for (t=fieldname,s++; *t && *s != ':'; t++,s++)
17962                 if (toupper(*t) != toupper(*s))
17963                   break;
17964 
17965               if (!*t && *s == ':') {
17966                 for (s++; *s && isspace(*s); s++) ;
17967                 if (is_absolute(s) || node->dir == NULL)
17968                   strcpy(indexpath, s);
17969                 else {
17970                   strcpy(indexpath, node->dir);
17971                   for (t2=indexpath; *t2; t2++) ;
17972                   *t2++ = dirch;
17973                   strcpy(t2, s);
17974                 }
17975                 break;
17976               }
17977 
17978               while (*s) s++;
17979               while (!*s) s++;
17980             }
17981           }
17982         }
17983 
17984         node = node->next;
17985         flag = 1;
17986       }
17987 
17988       if (copyflag)
17989         free(dbname);
17990     }
17991   }
17992 
17993   error_test(!flag && !filenode && !indexpath[0], E_DBPARSEERROR, return NULL,
17994              print_error("%s:  No BIOSEQ entry matches database "
17995                          "specification.\n", dbspec));
17996 
17997   /*
17998    * Check for the three forms of dbspec:
17999    *   1) just the database name.
18000    *        - Check to see if "~" is an alias.  If so, then expand that
18001    *          alias.  Otherwise, take all of the files in the entry.
18002    *   2) the database name with a suffix alias.
18003    *        - Look for that suffix alias, and expand it.
18004    *   3) the database name, a ':' and list of files and aliases.
18005    *        - Loop through the list of files and aliases, and search
18006    *          the entry for each of them.
18007    */
18008   status = 0;
18009   if (filenode && dbspec[matchlen] == '\0') {
18010     status = sufalias_match(filenode, "", &string);
18011     if (status == 0)
18012       status = file_match(filenode, NULL, NULL, &string, 0);
18013   }
18014   else if (filenode && dbspec[matchlen] != ':') {
18015     status = sufalias_match(filenode, dbspec+len, &string);
18016   }
18017   else {
18018     for (s=dbspec; *s && *s != ':'; s++) ;
18019     if (*s)
18020       for (s++; *s && (isspace(*s) || *s == ','); s++) ;
18021     error_test(!*s, E_PARSEERROR, return NULL,
18022                print_error("%s:  Invalid database specification.\n",
18023                            dbspec));
18024 
18025     lookupflag = 0;
18026     while (*s) {
18027       for (t=s; *s && !isspace(*s) && *s != ','; s++) ;
18028       error_test(t == s, E_PARSEERROR, return NULL,
18029                  print_error("%s:  Invalid database specification.\n",
18030                              dbspec));
18031 
18032       status = 0;
18033       if (filenode)
18034         status = file_match(filenode, t, s, &string, 0);
18035 
18036       if (status == 0 && indexpath[0]) {
18037         status = ident_lookup(indexpath, !lookupflag, t, s, &string);
18038         lookupflag = 1;
18039       }
18040 
18041       if (status == -1)
18042         break;
18043       else if (status == 0) {
18044         memcpy(buffer, t, s - t);
18045         buffer[s-t] = '\0';
18046 
18047         set_error(E_DBFILEERROR);
18048         print_warning("%s:  Unable to find `%s' in database.\n", dbspec,
18049                       buffer);
18050       }
18051 
18052       while (*s && (isspace(*s) || *s == ',')) s++;
18053     }
18054 
18055     if (lookupflag)
18056       ident_close();
18057   }
18058 
18059   if (status == -1) {
18060     if (string.string)
18061       free(string.string);
18062     return NULL;
18063   }
18064 
18065   memory_error(string.error, return NULL);
18066   if (string.string == NULL)
18067     return NULL;
18068 
18069   string.string[string.count] = '\0';
18070   return string.string;
18071 }
18072 
18073 
/*
 * dbname_match
 *
 * Starting at `node', find the next BIOSEQ entry one of whose names
 * matches the beginning of `dbname' (compared with toupper).
 *
 * A name matches when all of it equals the leading characters of dbname
 * and either
 *   - dbname ends there or continues with ':', or
 *   - the entry has no file list, or
 *   - the remaining text is a suffix alias of the entry (sufalias_match
 *     returns 1).
 *
 * Parameters:  node     -  first list entry to examine (may be NULL)
 *              dbname   -  the database name or specification to match
 *              len_out  -  if not NULL, receives the number of characters
 *                          of dbname covered by the matching name
 * Returns:     the matching entry, or NULL if there is none.  NULL is
 *              also returned at once if sufalias_match reports a value
 *              other than 0 or 1.
 */
static BIOSEQ_LIST dbname_match(BIOSEQ_LIST node, char *dbname, int *len_out)
{
  int rc, accept;
  char *name, *spec;

  while (node != NULL) {
    /* Skip separators in front of the first name. */
    name = node->names;
    while (*name && (isspace(*name) || *name == ','))
      name++;

    while (*name) {
      /*
       * Advance through the name and dbname together for as long as
       * the characters agree.
       */
      spec = dbname;
      while (*name && !isspace(*name) && *name != ',' &&
             *spec && *spec != ':' && toupper(*name) == toupper(*spec)) {
        name++;
        spec++;
      }

      /* The whole name was consumed:  decide if the remainder is fine. */
      if (!*name || isspace(*name) || *name == ',') {
        accept = (!*spec || *spec == ':' || node->files == NULL);
        if (!accept) {
          rc = sufalias_match(node, spec, NULL);
          if (rc != 0 && rc != 1)
            return NULL;
          accept = (rc == 1);
        }

        if (accept) {
          if (len_out)
            *len_out = spec - dbname;
          return node;
        }
      }

      /* Move on to the next name of this entry. */
      while (*name && !isspace(*name) && *name != ',')
        name++;
      while (*name && (isspace(*name) || *name == ','))
        name++;
    }

    node = node->next;
  }

  return NULL;
}
18110 
18111 
/*
 * sufalias_match
 *
 * Search the file list of a BIOSEQ entry for a suffix alias, i.e. an
 * item of the form "~name:( ... )", whose name equals `suffix'
 * (compared with toupper).  An empty suffix therefore matches "~:(".
 *
 * Parameters:  node    -  the BIOSEQ entry to search
 *              suffix  -  the alias name to look for (text after the '~')
 *              str     -  if not NULL, each item listed inside the alias
 *                         is handed to file_match with this STRING; if
 *                         NULL, the function only tests for existence
 * Returns:     1 if the alias was found, 0 if not (or if the entry has
 *              no file list), -1 if file_match failed or the file list
 *              is malformed.
 */
static int sufalias_match(BIOSEQ_LIST node, char *suffix, STRING *str)
{
  int count;
  char *s, *t, *s2, *t2, *file;

  if (node->files == NULL)
    return 0;

  s = node->files;
  while (*s) {
    /* Skip separators between items. */
    while (*s && (isspace(*s) || *s == ',')) s++;
    if (!*s)
      break;

    /* A '#' starts a comment running to the end of the line. */
    if (*s == '#') {
      while (*s && *s != '\n') s++;
      continue;
    }

    /* Collect one item; it ends at a separator, a comment or a '('. */
    file = s;
    while (*s && !isspace(*s) && *s != ',' && *s != '#' && *s != '(')
      s++;

    /* Items not followed by '(' are plain entries and are of no interest. */
    if (!*s)
      break;
    else if (isspace(*s) || *s == ',')
      continue;
    else if (*s == '#') {
      while (*s && *s != '\n') s++;
      continue;
    }

    /*
     * From this point on, *s must be '('.
     */
    if (*(s-1) == ':' && *file == '~') {
      /* Compare the text between '~' and ':' with the requested suffix. */
      for (s2=file+1,t2=suffix; s2 < s - 1 && *t2; s2++,t2++)
        if (toupper(*s2) != toupper(*t2))
          break;

      /* Both strings must have been consumed completely. */
      if (s2 == s - 1 && !*t2) {
        if (str != NULL) {
          /* Expand every item listed up to the closing ')'. */
          for (s++; *s && (isspace(*s) || *s == ','); s++) ;
          while (*s && *s != ')') {
            if (*s == '#')
              while (*s && *s != '\n') s++;
            else {
              t = s;
              while (*s && !isspace(*s) && *s != ',' && *s != ')' && *s != '#')
                s++;

              if (file_match(node, t, s, str, 0) < 0)
                return -1;
            }

            while (*s && (isspace(*s) || *s == ',')) s++;
          }
        }

        return 1;
      }
    }

    /*
     * Not the alias being looked for:  skip the parenthesized group.
     * `count' tracks the nesting depth; the loop stops on the ')' that
     * brings it back to zero.  Comments are skipped so that parentheses
     * inside them are not counted.
     */
    count = 0;
    while (*s && (*s != ')' || --count != 0)) {
      if (*s == '(')
        count++;
      else if (*s == '#') {
        while (*s && *s != '\n') s++;
        if (!*s)
          break;
      }

      s++;
    }
    /* Running off the end means the list was not validated properly. */
    program_error(!*s, return -1,
                  print_error("    sufalias_match:  Previous syntax checking"
                              " done incorrectly.\n"));
    s++;
  }

  return 0;
}
18195 
18196 
/*
 * file_match
 *
 * Walk the file list of a BIOSEQ entry and process every file item that
 * matches the pattern [start, end).  With start == NULL the plain file
 * items are processed without a pattern and alias groups are skipped.
 *
 * The list may contain
 *   - plain items, which are passed to match_and_expand,
 *   - directory groups "dir/( ... )", whose items are prefixed with the
 *     directory (the prefix is kept in `path', using '(' as the marker
 *     between nested directory levels), and
 *   - alias groups "name:( ... )".  If the name equals the pattern, the
 *     items of the group are looked up by recursive calls; otherwise
 *     the group is skipped.  Groups whose name begins with '~' are
 *     never matched here.
 *
 * Parameters:  node   -  the BIOSEQ entry whose file list is searched
 *              start  -  first character of the pattern, or NULL
 *              end    -  one past the last character of the pattern
 *              str    -  STRING passed on to match_and_expand; if NULL,
 *                        a matching alias name makes the function
 *                        return 1 at once
 *              depth  -  recursion depth (aliases expanding to aliases)
 * Returns:     1 if something matched, 0 if nothing did, -1 on error.
 */
static int file_match(BIOSEQ_LIST node, char *start, char *end, STRING *str,
                      int depth)
{
  int pos, inalias, found, status;
  char *s, *s2, *t2, *file;
  char path[FILENAME_MAX+1];

  /* Refuse to follow aliases more than ten levels deep. */
  error_test(depth >= 10, E_PARSEERROR, return -1,
             print_error("%s:  Runaway alias recursion in BIOSEQ entry.\n",
                         node->names));

  found = 0;
  inalias = 0;       /* set while inside the alias group that matched */
  pos = 0;           /* length of the directory prefix held in `path' */
  s = node->files;
  while (*s) {
    /* Skip separators between items. */
    while (*s && (isspace(*s) || *s == ',')) s++;
    if (!*s)
      break;

    /* A '#' starts a comment running to the end of the line. */
    if (*s == '#') {
      while (*s && *s != '\n') s++;
      continue;
    }

    /* Collect one item; it ends at a separator, comment or parenthesis. */
    file = s;
    while (*s && !isspace(*s) && *s != ',' && *s != '#' &&
           *s != '(' && *s != ')')
      s++;

    /*
     * A lone ')' closes either the matched alias group or the innermost
     * directory group.  For a directory group, drop the '(' marker at
     * the end of `path' and then the directory name in front of it,
     * back to the previous marker (or the start of `path').
     */
    if (*s == ')' && file == s) {
      if (inalias)
        inalias = 0;
      else {
        program_error(pos == 0, return -1,
                      print_error("    file_match:  Previous syntax checking"
                                  " done incorrectly.\n"));
        for (pos--,t2=path+pos; pos > 0 && *(t2-1) != '('; t2--,pos--) ;
      }
      s++;
      continue;
    }

    /*
     * A plain item.  Inside the matched alias it is itself a pattern to
     * look up in the whole list; otherwise it is a file name, with the
     * current directory prefix put in front when there is one.
     */
    if (*s != '(') {
      if (inalias)
        status = file_match(node, file, s, str, depth+1);
      else if (pos == 0)
        status = match_and_expand(node, file, s, start, end, str);
      else {
        for (s2=file,t2=path+pos; s2 < s; s2++,t2++)
          *t2 = *s2;
        *t2 = '\0';

        status = match_and_expand(node, path, t2, start, end, str);
      }

      if (status == -1)
        return -1;
      else if (status == 1)
        found = 1;

      /* s is left on the terminator so the next pass can act on it. */
      if (!*s)
        break;
      else if (isspace(*s) || *s == ',' || *s == ')')
        continue;
      else if (*s == '#') {
        while (*s && *s != '\n') s++;
        continue;
      }
    }

    /* A group may not be opened inside the matched alias group. */
    program_error(inalias, return -1,
                  print_error("    file_match:  Previous syntax checking"
                              " done incorrectly.\n"));

    /*
     * From this point on, *s must be '('.
     */
    /*
     * Directory group:  append the directory name (without its trailing
     * separator) and a '(' marker to `path'.
     */
    if (*(s-1) == dirch) {
      for (s2=file,t2=path+pos; s2 < s - 1; s2++,t2++,pos++)
        *t2 = *s2;
      *t2++ = '(';
      pos++;
      s++;
      continue;
    }

    /* Anything else in front of '(' has to be an alias name ending in ':'. */
    program_error(*(s-1) != ':', return -1,
                  print_error("    file_match:  Previous syntax checking"
                              " done incorrectly.\n"));

    /* Compare the alias name with the pattern (suffix aliases excluded). */
    if (*file != '~' && start != NULL) {
      for (s2=file,t2=start; s2 < s - 1 && t2 < end; s2++,t2++)
        if (toupper(*s2) != toupper(*t2))
          break;

      if (s2 == s - 1 && t2 == end) {
        if (str == NULL)
          return 1;
        else {
          inalias = 1;
          found = 1;
          s++;
          continue;
        }
      }
    }

    /*
     * The alias does not match:  skip to its closing ')'.  Comments are
     * stepped over, and a nested '(' is treated as a syntax error.
     */
    for (s++; *s && *s != ')'; s++) {
      if (*s == '#') {
        while (*s && *s != '\n') s++;
        if (!*s)
          break;
      }
      else if (*s == '(') {
        program_error(1, return -1,
                      print_error("    file_match:  Previous syntax checking"
                                  " done incorrectly.\n"));
      }
    }
    program_error(!*s, return -1,
                  print_error("    file_match:  Previous syntax checking"
                              " done incorrectly.\n"));
    s++;
  }
  /* The list must not end inside the matched alias group. */
  program_error(inalias, return -1,
                print_error("    file_match:  Previous syntax checking"
                            " done incorrectly.\n"));

  return found;
}
18328 
18329 
/*
 * match_and_expand
 *
 * Test one file item of a BIOSEQ entry against an optional user pattern
 * and, if it matches, add the resulting path(s) to `str'.
 *
 * Parameters:  node     -  the BIOSEQ entry the file item belongs to
 *              file     -  first character of the file item; a '(' in it
 *                          marks a directory boundary (see file_match)
 *              fileend  -  one past the last character of the file item
 *              pattern  -  first character of the user pattern, or NULL
 *                          to accept the item unconditionally
 *              patend   -  one past the last character of the pattern
 *              str      -  STRING receiving the path(s)
 * Returns:     0 if the item does not match the pattern, -1 on error,
 *              1 if a non-wildcard item was handled (a missing file only
 *              produces a warning), or the value returned by expand for
 *              an item containing wildcards.
 */
static int match_and_expand(BIOSEQ_LIST node, char *file, char *fileend,
                            char *pattern, char *patend, STRING *str)
{
  int pathflag, count, flag;
  char *s, *t, *s2, *t2, *send, *tend, *last, *filestart;
  char path[FILENAME_MAX];
  size_t dirlen, filelen;

  /*
   * If a user-specified pattern has been given to bioseq_parse (i.e.,
   * specifying a particular file or wildcarded file pattern in the
   * database), perform an initial match of the filename against that
   * user-specified pattern.
   *
   * This matching takes into account that both the user-specified pattern
   * and the filename (gotten from the BIOSEQ entry) can contain wildcards.
   */
  pathflag = 0;
  if (pattern != NULL) {
    /* A pattern containing a directory separator is matched as a path. */
    for (s=pattern; s < patend; s++) {
      if (*s == dirch) {
        pathflag = 1;
        break;
      }
    }

    if (pathflag) {
      s = file;
      t = pattern;
    }
    else {
      /* Otherwise only the last component of the file item is matched. */
      last = file;
      for (s=file; s < fileend; s++)
        if (*s == dirch || *s == '(')
          last = s+1;

      program_error(last == fileend, return -1,
                    print_error("    match_and_expand:  Incorrect syntax "
                                "checking of BIOSEQ entry.\n"));

      s = last;
      t = pattern;
    }

    for ( ; s < fileend && t < patend; s++,t++) {
      /* Equal characters, a directory boundary, or a '?' on either side. */
      if ((toupper(*s) == toupper(*t) && *s != '*') ||
          (*s == '(' && *t == dirch) || (*s == '?' && *t != dirch) ||
          (*t == '?' && *s != dirch && *s != '('))
        continue;

      if (*s != '*' && *t != '*')
        return 0;

      /*
       * A '*' was hit.  Find the end of the current path component on
       * both sides and count the further '*' characters in it.
       */
      count = 0;
      for (s2=s+1; s2 < fileend && *s2 != dirch && *s2 != '('; s2++)
        if (*s2 == '*')
          count++;
      for (t2=t+1; t2 < patend && *t2 != dirch; t2++)
        if (*t2 == '*')
          count++;

      if (count == 0) {
        /*
         * Only this one '*':  match the component tails backwards; the
         * side holding the '*' must be the one that is used up.
         */
        send = s2;
        tend = t2;
        for (s2--,t2--; s2 > s && t2 > t; s2--,t2--)
          if (!(toupper(*s2) == toupper(*t2) || *s2 == '?' || *t2 == '?'))
            return 0;

        if (!((*s == '*' && s2 == s) || (*t == '*' && t2 == t)))
          return 0;

        s = send;
        t = tend;
      }
      else {
        if (!complex_match(s, s2, t, t2))
          return 0;

        s = s2;
        t = t2;
      }
    }

    /* Anything left over on either side must be a single trailing '*'. */
    if ((s < fileend && (*s != '*' || s + 1 < fileend)) ||
        (t < patend && (*t != '*' || t + 1 < patend)))
      return 0;
  }

  /*
   * Make sure the root directory, the separator, the file item and the
   * terminating NUL all fit into `path' before anything is copied.
   */
  dirlen = (node->dir ? strlen(node->dir) + 1 : 0);
  filelen = (size_t) (fileend - file);
  error_test(dirlen + filelen >= FILENAME_MAX, E_DBFILEERROR, return -1,
             print_error("%s:  File path too long in BIOSEQ entry.\n",
                         node->names));

  /*
   * Build the pathname for the new file, converting all of the '('
   * used in the "file" string as stack markers (in file_match)
   * to '/' (or '\\' in Windows).
   *
   * Also, check the copied string to see if any wildcards appear in
   * it.
   */
  s = path;
  if (node->dir) {
    for (t=node->dir; *t; s++,t++) *s = *t;
    *s++ = dirch;
  }
  filestart = s;

  flag = 0;
  for (t=file; t < fileend; t++,s++) {
    if (*t == '(')
      *s = dirch;
    else {
      *s = *t;
      if (*s == '?' || *s == '*')
        flag = 1;
    }
  }
  *s = '\0';

  /*
   * An absolute file name is not relative to the root directory, so
   * shift it down to the start of `path'.  The regions overlap, which
   * strcpy does not permit, so memmove is used.  The end pointer `s'
   * is moved along with the text, because it is passed to expand as
   * the end of the pattern.
   */
  if (is_absolute(filestart) && filestart != path) {
    filelen = (size_t) (s - filestart);
    memmove(path, filestart, filelen + 1);
    s = path + filelen;
    filestart = path;
  }

  /*
   * Either add the string (if no wildcards), or perform a directory list
   * expansion of the wildcards in the path (and, of course, match each
   * of those against the user-specified pattern).
   */
  if (flag)
    return expand(path, 0, filestart, s, pattern, patend, pathflag, str, node);
  else {
    if (seqfisafile(path))
      addstring(str, path);
    else {
      set_error(E_DBFILEERROR);
      print_warning("Warning:  %s:  File listed in BIOSEQ entry `%s' does not "
                    "exist.\n", path, node->names);
    }

    return 1;
  }
}
18468 
18469 
/*
 * expand
 *
 * Expand the (possibly wildcarded) path pat..patend against the file
 * system, adding every matching path name to `str` with addstring.  The
 * function recurses once per wildcarded directory component, building the
 * candidate name in a static buffer shared by all recursion levels.
 *
 * Parameters:
 *   initpath, initpos  -  On the top-level call (initpos == 0) the
 *                         characters from initpath up to pat are copied
 *                         into the static buffer as the leading directory.
 *                         On recursive calls initpath is ignored and
 *                         initpos is the number of characters of the
 *                         buffer that are already filled in.
 *   pat, patend        -  The remaining components of the path to expand.
 *   fpat, fpatend      -  A second pattern that candidate names must also
 *                         match.
 *   pathflag           -  If non-zero, fpat is consumed one directory
 *                         component at a time in parallel with pat; if
 *                         zero, the whole of fpat is matched against the
 *                         final file name only.
 *   str                -  Receives the expanded path names.
 *   node               -  The BIOSEQ entry, used only in messages.
 *
 * Returns:  1 if at least one name was added (or if the path contained no
 *           wildcard, whether or not the file exists), 0 if nothing
 *           matched, and -1 if a directory could not be opened.
 *
 * NOTE(review): writes into the FILENAME_MAX-sized static buffer are not
 * bounds checked.
 */
static int expand(char *initpath, int initpos, char *pat, char *patend,
                  char *fpat, char *fpatend, int pathflag, STRING *str,
                  BIOSEQ_LIST node)
{
  static char path[FILENAME_MAX];
  int flag, status, found;
  char *s, *t, *ptr, *s2, *t2, *ptr2, *dname, *end, *d2;
  DIRPTR dp;

  if (initpos == 0) {
    for (ptr=path,t=initpath; t < pat; ptr++,t++)
      *ptr = *t;
  }
  else
    ptr = path + initpos;

  /*
   * Copy the leading components that contain no wildcards straight into
   * the path buffer, stopping at the first component that has one.  On
   * exit, s2..s is that component of pat and t2..t the corresponding
   * component of fpat.
   */
  s = s2 = pat;
  t = t2 = fpat;
  flag = 0;
  while (s < patend) {
    for (s2=s; s < patend && *s != dirch; s++)
      if (*s == '?' || *s == '*')
        flag = 1;

    /*
     * t walks fpat, so it must be bounded by fpatend (the end of the
     * buffer it points into), not by patend.
     */
    if (pathflag)
      while (t < fpatend && *t != dirch) t++;

    if (flag)
      break;

    for (s=s2; s < patend && *s != dirch; s++,ptr++)
      *ptr = *s;
    if (s < patend)
      *ptr++ = *s++;

    /*
     * Step over the directory separator in fpat, but never move past
     * the end of that pattern.
     */
    if (pathflag) {
      if (t < fpatend)
        t++;
      t2 = t;
    }
  }

  if (!flag && s == patend) {
    *ptr = '\0';
    if (seqfisafile(path))
      addstring(str, path);
    else {
      set_error(E_DBFILEERROR);
      print_warning("Warning:  %s:  File listed in BIOSEQ entry `%s' does not "
                    "exist.\n", path, node->names);
    }

    return 1;
  }

  /*
   * If we hit a wildcard, read the files/sub-dirs in the path so far
   * and recurse on any matches to the patterns for that directory name.
   */
  if (!pathflag)
    t = fpatend;

  if (ptr == path)
    status = open_directory(".", &dp);
  else {
    /* Temporarily drop the trailing separator to name the directory. */
    *(ptr-1) = '\0';
    status = open_directory(get_truename(path, ptr), &dp);
    *(ptr-1) = dirch;
  }

  /*
   * Terminate the buffer after the directory part, so that the message
   * below does not print stale characters left by an earlier expansion.
   * (Everything from ptr onwards is overwritten before it is used again.)
   */
  *ptr = '\0';
  error_test(status != STATUS_OK, E_OPENFAILED, return -1,
             print_error("%s:  Cannot open directory listed in "
                         " BIOSEQ entry `%s'.\n", (ptr == path ? "." : path),
                         node->names));

  found = 0;
  while ((dname = read_dirname(dp)) != NULL) {
    /* Skip the "." and ".." entries. */
    if (dname[0] == '.' &&
        (dname[1] == '\0' || (dname[1] == '.' && dname[2] == '\0')))
      continue;

    for (end=dname; *end; end++) ;

    status = 0;
    if (match_string(dname, end, s2, s) &&
        (!pathflag || match_string(dname, end, t2, t))) {
      for (ptr2=ptr,d2=dname; d2 < end; ptr2++,d2++)
        *ptr2 = *d2;
      *ptr2 = '\0';

      if (s == patend) {
        /* Last component:  the entry itself is the result. */
        if (pathflag || match_string(dname, end, t2, t)) {
          addstring(str, path);
          status = 1;
        }
      }
      else {
        /* More components follow:  descend into this entry. */
        *ptr2++ = dirch;
        status = expand(NULL, ptr2 - path, s+1, patend,
                        (pathflag ? (t < fpatend ? t+1 : t) : fpat),
                        fpatend, pathflag, str, node);
      }
    }

    if (status == -1) {
      found = -1;
      break;
    }
    else if (status == 1)
      found = 1;
  }
  close_directory(dp);

  return found;
}
18580 
18581 
match_string(char * s,char * send,char * t,char * tend)18582 static int match_string(char *s, char *send, char *t, char *tend)
18583 {
18584   int count;
18585   char *s2, *t2;
18586 
18587   for ( ; s < send && t < tend; s++,t++) {
18588     if ((toupper(*s) == toupper(*t) && *s != '*') ||
18589         (*s == '?' && *t != dirch) || (*t == '?' && *s != dirch))
18590       continue;
18591 
18592     if (*s != '*' && *t != '*')
18593       return 0;
18594 
18595     count = 0;
18596     for (s2=s+1; s2 < send && *s2 != dirch; s2++)
18597       if (*s2 == '*')
18598         count++;
18599     for (t2=t+1; t2 < tend && *t2 != dirch; t2++)
18600       if (*t2 == '*')
18601         count++;
18602 
18603     if (count == 0) {
18604       send = s2;
18605       tend = t2;
18606       for (s2--,t2--; s2 > s && t2 > t; s2--,t2--)
18607         if (!(toupper(*s2) == toupper(*t2) || *s2 == '?' || *t2 == '?'))
18608           return 0;
18609 
18610       if (!((*s == '*' && s2 == s) || (*t == '*' && t2 == t)))
18611         return 0;
18612 
18613       s = send;
18614       t = tend;
18615     }
18616     else {
18617       if (!complex_match(s, s2, t, t2))
18618         return 0;
18619 
18620       s = s2;
18621       t = t2;
18622     }
18623   }
18624 
18625   return 1;
18626 }
18627 
18628 
/*
 * complex_match
 *
 * Match the strings s..send and t..tend when at least one of them is
 * sitting on an asterisk and further asterisks may follow.  The caller
 * must guarantee s < send and t < tend.
 *
 * Returns 1 if the two ranges can be matched and 0 otherwise.
 */
static int complex_match(char *s, char *send, char *t, char *tend)
{
  char *s2, *t2, *temp;

  /*
   * If s is sitting on an asterisk, loop through the substrings on t (using
   * loop variable "temp"), trying to match the strings from s+1 and temp
   * (thus, matching the asterisk at s to the substring from t to temp-1).
   * If, in that matching, another asterisk is reached, recurse to match
   * any possible substring of the other string against that asterisk.
   */
  if (*s == '*') {
    /*
     * An asterisk that is the last character of its range matches
     * whatever is left of the other range.  The range may end at send
     * rather than at a NUL (e.g. at a directory separator), so test the
     * bound first instead of relying on s[1] alone.
     */
    if (s + 1 >= send || !s[1])
      return 1;

    for (temp=t; temp < tend; temp++) {
      for (s2=s+1,t2=temp; s2 < send && t2 < tend; s2++,t2++) {
        /* Cast to unsigned char:  toupper is undefined for negative values. */
        if ((toupper((unsigned char) *s2) == toupper((unsigned char) *t2) &&
             *s2 != '*') ||
            *s2 == '?' || *t2 == '?')
          continue;

        if ((*s2 == '*' || *t2 == '*') &&
            complex_match(s2, send, t2, tend))
          return 1;
        else
          break;
      }
      if (s2 == send && t2 == tend)
        return 1;
    }
  }

  /*
   * Do the vice versa thing with the asterisk sitting on t.
   */
  if (*t == '*') {
    if (t + 1 >= tend || !t[1])
      return 1;

    temp = (*s == '*' ? s+1 : s);
    for ( ; temp < send; temp++) {
      for (s2=temp,t2=t+1; s2 < send && t2 < tend; s2++,t2++) {
        if ((toupper((unsigned char) *s2) == toupper((unsigned char) *t2) &&
             *s2 != '*') ||
            *s2 == '?' || *t2 == '?')
          continue;

        if ((*s2 == '*' || *t2 == '*') &&
            complex_match(s2, send, t2, tend))
          return 1;
        else
          break;
      }
      if (s2 == send && t2 == tend)
        return 1;
    }
  }

  return 0;
}
18688 
18689 
/*
 * State of the index file most recently loaded by ident_lookup (and
 * released by ident_close).
 */
static FILE *idxfp = NULL;      /* stream opened on the index file */
static char *idxbuffer = NULL;  /* malloc'd text of the index file */
/*
 * idxtype     -  0 if idxbuffer holds the complete file, 1 if it holds
 *                only the first idxstart bytes and the index lines are
 *                read through idxfp.
 * idxsize     -  the size of the index file, as given by get_filesize.
 * idxstart    -  the number parsed from the first line of the file; used
 *                as the byte offset at which the index lines begin.
 * idxbufsize  -  the number of bytes stored in idxbuffer.
 */
int idxtype, idxsize, idxstart, idxbufsize;
18693 
ident_lookup(char * indexfile,int newfile,char * id,char * idend,STRING * string)18694 static int ident_lookup(char *indexfile, int newfile, char *id,
18695                         char *idend, STRING *string)
18696 {
18697   int size, count, offset, status, patsize, filenum, fileoffset, fileidlen;
18698   char *s, *s2, *t, *t2, *end, *end2, *temp, line[128], pattern[128];
18699   char buffer[FILENAME_MAX+32];
18700 
18701   if (idend == NULL)
18702     for (idend=id; *idend; idend++) ;
18703 
18704   for (s=pattern,t=id; t < idend && *t != '*' && *t != '?'; s++,t++)
18705     *s = *t;
18706   *s = '\0';
18707   patsize = s - pattern;
18708 
18709   /*
18710    * If we're starting queries on a new index file, open up that
18711    * file and store either the file pointer or the complete text
18712    * of the file (depending on the index file size) into the
18713    * global data structures.
18714    */
18715   if (newfile || (idxfp == NULL && idxbuffer == NULL)) {
18716     if (idxfp != NULL || idxbuffer != NULL)
18717       ident_close();
18718 
18719     idxsize = get_filesize(indexfile);
18720 
18721     /*
18722      * Open up the index file.
18723      */
18724     idxfp = fopen(get_truename(indexfile, NULL), "r");
18725     error_test(idxfp == NULL, E_OPENFAILED, return -1,
18726                print_error("%s:  Unable to open index file.\n", indexfile));
18727 
18728     /*
18729      * Get the first line of the file, extract the size of the header
18730      * lines (listing the database files) and check the line to make
18731      * sure it's a SEQIO Index File.
18732      */
18733     temp = fgets(line, 128, idxfp);
18734     error_test(temp == NULL, E_READFAILED, return -1,
18735                print_error("%s:  Unable to read index file.\n", indexfile));
18736 
18737     idxstart = myatoi(line, 10, '0');
18738     for (s=line; s < line + 128 && *s && (isdigit(*s) || isspace(*s)); s++) ;
18739     error_test(idxstart <= 0 || !*s || s >= line + 128 ||
18740                strncmp(s, "# SEQIO Index File", 18) != 0,
18741                E_PARSEERROR, return -1,
18742                print_error("%s:  File is not a SEQIO Index File.\n",
18743                            indexfile));
18744 
18745     /*
18746      * Seek back to the beginning of the file, and either read the
18747      * complete file (if it's small enough), or just read the header
18748      * lines containing the list of files.
18749      */
18750     status = fseek(idxfp, 0, SEEK_SET);
18751     error_test(status == -1, E_READFAILED, return -1,
18752                print_error("%s:  Unable to read index file.\n", indexfile));
18753 
18754     if (idxsize <= 50000) {
18755       idxbufsize = idxsize;
18756       idxtype = 0;
18757     }
18758     else {
18759       idxbufsize = idxstart;
18760       idxtype = 1;
18761     }
18762 
18763     idxbuffer = (char *) malloc(idxbufsize + 1);
18764     memory_error(idxbuffer == NULL, return -1);
18765 
18766     size = fread(idxbuffer, 1, idxbufsize, idxfp);
18767     error_test(size != idxbufsize, E_READFAILED, return -1,
18768                print_error("%s:  Unable to read index file.\n", indexfile));
18769 
18770     idxbuffer[idxbufsize] = '\0';
18771   }
18772 
18773   if (patsize == 0) {
18774     offset = idxstart;
18775 
18776     if (idxtype == 1 && fseek(idxfp, offset, SEEK_SET) == -1) {
18777       raise_error(E_READFAILED, return -1,
18778                   print_error("%s:  Read error occurred while accessing "
18779                               "index file.\n", indexfile));
18780     }
18781   }
18782   else {
18783     /*
18784      * If the current file is small enough to be stored in memory, then
18785      * do a binary search through the lines to find the index, then
18786      * scan the lines containing the identifier and construct the
18787      * filename/byte-offset specifications.
18788      */
18789     if (idxtype == 0)
18790       offset = findline2(idxbuffer, idxstart, idxbufsize, pattern, patsize);
18791     else {
18792       offset = findline(idxfp, idxstart, idxsize, pattern, patsize);
18793       error_test(offset == -2, E_PARSEERROR, return -1,
18794                  print_error("%s:  Read/Parse error occurred while accessing "
18795                              "index file.\n", indexfile));
18796       if (offset != -1 && fseek(idxfp, offset, SEEK_SET) == -1) {
18797         raise_error(E_READFAILED, return -1,
18798                     print_error("%s:  Read error occurred while accessing "
18799                                 "index file.\n", indexfile));
18800       }
18801 
18802     }
18803 
18804     if (offset == -1)
18805       return 0;
18806   }
18807 
18808   while (1) {
18809     if (idxtype == 0) {
18810       if (offset >= idxbufsize)
18811         break;
18812 
18813       s = idxbuffer + offset;
18814       fileidlen = 0;
18815       for (end=s; end < idxbuffer + idxbufsize && *end != '\n'; end++,offset++)
18816         if (!fileidlen && *end == '\t')
18817           fileidlen = end - s;
18818       offset++;
18819     }
18820     else {
18821       if (fgets(line, 128, idxfp) == NULL)
18822         break;
18823 
18824       s = line;
18825       for (end=s,fileidlen=0; end < line + 128 && *end && *end != '\n'; end++)
18826         if (!fileidlen && *end == '\t')
18827           fileidlen = end - s;
18828       error_test(end >= line + 128 || !*end || !fileidlen,
18829                  E_PARSEERROR, return -1,
18830                  print_error("%s:  Invalid format of index file lines.\n",
18831                              indexfile));
18832     }
18833 
18834     if (myncasecmp(pattern, s, patsize) < 0)
18835       break;
18836 
18837     if (!match_string(s, s + fileidlen, id, idend))
18838       continue;
18839 
18840     /*
18841      * Found a line matching that identifier.  Construct the
18842      * filename/byte-offset string and add it to the STRING structure.
18843      *
18844      * Recall, the format of the line is "ident\t#1\t#2\n", where "ident"
18845      * is the identifier, "#1" is the file number (i.e., the index into
18846      * the list of files), and "#2" is the file offset.  Also, "#1" and
18847      * "#2" are the encoded as base 64 numbers (the 64 ASCII characters
18848      * beginning with '0').
18849      */
18850     while (s < end && *s != '\t') s++;
18851     s++;
18852     error_test(s >= end || (*s - '0') < 0 || (*s - '0') >= 64,
18853                E_PARSEERROR, return -1,
18854                print_error("%s:  Invalid format of index file lines.\n",
18855                            indexfile));
18856     filenum = myatoi(s, 64, '0');
18857 
18858     while (s < end && *s != '\t') s++;
18859     s++;
18860     error_test(s >= end || (*s - '0') < 0 || (*s - '0') >= 64,
18861                E_PARSEERROR, return -1,
18862                print_error("%s:  Invalid format of index file lines.\n",
18863                            indexfile));
18864     fileoffset = myatoi(s, 64, '0');
18865 
18866     /*
18867      * Scan the list of files to find the filenum'th file.
18868      */
18869     end2 = idxbuffer + idxstart;
18870     for (s2=idxbuffer; s2 < end2 && *s2 != '\n'; s2++) ;
18871     s2++;
18872     for (count=0; count < filenum && s2 < end2; count++,s2++)
18873       for ( ; s2 < end2 && *s2 != '\n'; s2++)
18874         ;
18875 
18876     error_test(s2 >= end2, E_PARSEERROR, return -1,
18877                print_error("%s:  Invalid header format of index file.\n",
18878                            indexfile));
18879 
18880     for (t2=buffer; s2 < end2 && *s2 != '\n'; s2++,t2++)
18881       *t2 = *s2;
18882     *t2++ = '@';
18883     *t2++ = '#';
18884     t2 = myitoa(t2, fileoffset, 10, '0');
18885     *t2 = '\0';
18886 
18887     addstring(string, buffer);
18888   }
18889 
18890   return 1;
18891 }
18892 
ident_close(void)18893 static void ident_close(void)
18894 {
18895   if (idxfp != NULL) {
18896     fclose(idxfp);
18897     idxfp = NULL;
18898   }
18899   if (idxbuffer != NULL) {
18900     free(idxbuffer);
18901     idxbuffer = NULL;
18902     idxsize = idxbufsize = 0;
18903   }
18904 }
18905 
18906 
/*
 * findline
 *
 * Binary search the lines stored between byte offsets low and high of
 * the file fp for the first line that begins with the first patsize
 * characters of pattern (compared with myncasecmp).  The range is
 * narrowed one 8192-byte page at a time until the line is known to lie
 * in the page just read, or until the whole range fits in one page; the
 * lines of that page are then searched with findline2.
 *
 * Returns the byte offset of the line, -1 if no line was found and -2
 * if a seek or read failed or a page held no complete line.
 */
static int findline(FILE *fp, int low, int high, char *pattern, int patsize)
{
  int filepos, nread, cmp, cmplen, hit, span, matched_at_high;
  char *first, *limit, *scan, page[8196];

  matched_at_high = 0;
  for (;;) {
    if (high - low <= 8192)
      break;

    /* Read the page centered on the middle of the current range. */
    filepos = (high + low) / 2 - 4096;
    if (fseek(fp, filepos, SEEK_SET) == -1)
      return -2;
    nread = fread(page, 1, 8192, fp);
    if (nread < 160)
      return -2;

    /*
     * The page probably begins and ends in the middle of lines, so
     * skip the line fragment at the front (keeping filepos in step
     * with first) and trim the fragment at the back.
     */
    first = page;
    while (first < page + nread && *first != '\n') {
      first++;
      filepos++;
    }
    first++;
    filepos++;

    limit = page + nread;
    while (limit > first && *(limit-1) != '\n')
      limit--;

    if (first >= limit)
      return -2;

    /*
     * If the pattern is smaller than (or equal to) the first line, the
     * line being sought cannot lie beyond the start of this page.
     */
    scan = first;
    while (scan < limit && *scan != '\n')
      scan++;
    cmplen = patsize;
    if (scan - first < patsize)
      cmplen = scan - first;

    cmp = myncasecmp(pattern, first, cmplen);
    if (cmp <= 0) {
      high = filepos;
      matched_at_high = (cmp == 0);
      continue;
    }

    /*
     * If the pattern is larger than the last line, the line being
     * sought must lie past the end of this page.
     */
    scan = limit - 1;
    while (scan > first && *(scan-1) != '\n')
      scan--;
    cmplen = patsize;
    if (limit - scan < patsize)
      cmplen = limit - scan;

    if (myncasecmp(pattern, scan, cmplen) > 0) {
      low = filepos + (limit - first);
      continue;
    }

    /*
     * Otherwise the first line containing the pattern must occur in
     * the current page, so do a binary search on its lines.
     */
    hit = findline2(page, first - page, limit - page, pattern, patsize);
    if (hit != -1)
      return filepos + (hit - (first - page));
    return (matched_at_high ? high : -1);
  }

  /*
   * The remaining range is small enough to be read in one piece, so
   * read it and do a binary search on that page.
   */
  span = high - low;
  if (fseek(fp, low, SEEK_SET) == -1)
    return -2;
  nread = fread(page, 1, span, fp);
  if (nread != span)
    return -2;

  hit = findline2(page, 0, span, pattern, patsize);
  if (hit == -1)
    return (matched_at_high ? high : -1);
  return low + hit;
}
18992 
/*
 * findline2
 *
 * Binary search the lines of text[low..high) for the first line that
 * begins with the first patsize characters of pattern (compared with
 * myncasecmp).
 *
 * For the search to work correctly, text[low] must be the beginning of
 * a line and text[high] must be one past the end of a line (i.e., one
 * past the last newline or, when the last line has no trailing newline,
 * one past its last character).
 *
 * Returns the offset into text of the line found, or -1 if there is no
 * such line.
 */
static int findline2(char *text, int low, int high, char *pattern, int patsize)
{
  int mid, limit, cmplen;
  char *bol, *eol;

  limit = high;
  while (low < high) {
    mid = (high + low) / 2;

    /* Find the bounds of the line containing the midpoint. */
    bol = text + mid;
    while (bol > text + low && *(bol-1) != '\n')
      bol--;
    eol = text + mid;
    while (eol < text + high && *eol != '\n')
      eol++;

    /* Compare no more characters than the line holds. */
    cmplen = patsize;
    if (eol - bol < cmplen)
      cmplen = eol - bol;

    if (myncasecmp(pattern, bol, cmplen) > 0)
      low = (eol - text) + 1;
    else
      high = (bol - text);
  }

  /*
   * The search has ended either at the original upper bound (every line
   * is smaller than the pattern), or at the line where the lines
   * containing the pattern would have to begin.
   */
  if (high >= limit)
    return -1;

  return (myncasecmp(pattern, text + high, patsize) == 0 ? high : -1);
}
19039 
19040 
19041 
19042 /*
19043  *
19044  * Interfaces to the file i/o and error reporting procedures:
19045  *
19046  *     open_raw_file, open_raw_stdin, is_raw_stdin,
19047  *     close_raw_file, isa_file, isa_dir, match_file
19048  *
 * These procedures encapsulate all of the file I/O, so for portability only
 * these procedures should need to differ between platforms.
19051  *
19052  *
19053  */
/*
 * open_raw_file
 *
 * Open `filename` read-only with open() and store the resulting
 * descriptor in *ptr_out.  Returns STATUS_OK if the descriptor is valid
 * and STATUS_ERROR otherwise.
 */
static int open_raw_file(char *filename, FILEPTR *ptr_out)
{
  *ptr_out = open(filename, O_RDONLY);

  return (*ptr_out >= 0 ? STATUS_OK : STATUS_ERROR);
}
19061 
/*
 * read_raw_file
 *
 * Read up to `size` bytes from `ptr` into `buffer` and return the result
 * of read() unchanged.
 */
static int read_raw_file(FILEPTR ptr, char *buffer, int size)
{
  int nread;

  nread = read(ptr, buffer, size);
  return nread;
}
19066 
/*
 * seek_raw_file
 *
 * Move the file position of `ptr` to `bytepos` bytes from the start of
 * the file.  Returns STATUS_OK on success and STATUS_ERROR if lseek()
 * reports a negative result.
 */
static int seek_raw_file(FILEPTR ptr, int bytepos)
{
  if (lseek(ptr, bytepos, SEEK_SET) < 0)
    return STATUS_ERROR;

  return STATUS_OK;
}
19074 
/*
 * close_raw_file
 *
 * Close `ptr` with close().  Returns STATUS_OK on success and
 * STATUS_ERROR if close() reports a negative result.
 */
static int close_raw_file(FILEPTR ptr)
{
  if (close(ptr) < 0)
    return STATUS_ERROR;

  return STATUS_OK;
}
19082 
/*
 * open_raw_stdin
 *
 * Store 0 (the descriptor used here for standard input) in *ptr_out.
 * Always returns STATUS_OK.
 */
static int open_raw_stdin(FILEPTR *ptr_out)
{
  *ptr_out = 0;

  return STATUS_OK;
}
19088 
open_stdout(FILE ** ptr_out)19089 static int open_stdout(FILE **ptr_out)
19090 {
19091   *ptr_out = stdout;
19092   return STATUS_OK;
19093 }
19094 
/*
 * puterror
 *
 * Write the string `s`, exactly as given, to the standard error stream.
 */
static void puterror(char *s)
{
  fprintf(stderr, "%s", s);
}
19099 
read_small_file(char * filename)19100 static char *read_small_file(char *filename)
19101 {
19102   int fd;
19103   char *buffer;
19104   struct stat sbuf;
19105 
19106   if (stat(filename, &sbuf) < 0 || (sbuf.st_mode & S_IFMT) != S_IFREG)
19107     return NULL;
19108 
19109   fd = -1;
19110   buffer = NULL;
19111   if ((buffer = (char *) malloc(sbuf.st_size + 1)) == NULL ||
19112       (fd = open(filename, O_RDONLY)) < 0 ||
19113       read(fd, buffer, sbuf.st_size) != sbuf.st_size) {
19114     if (buffer != NULL)  free(buffer);
19115     if (fd >= 0)  close(fd);
19116     return NULL;
19117   }
19118   close(fd);
19119   buffer[sbuf.st_size] = '\0';
19120 
19121   return buffer;
19122 }
19123 
/*
 * open_directory
 *
 * Begin a listing of the directory `dirname`, storing the handle used by
 * read_dirname and close_directory in *dp_out.
 *
 * Under WIN32, the listing is started by temporarily making `dirname`
 * the current directory; the previous current directory is restored
 * before returning, on the failure paths as well.
 *
 * Returns STATUS_OK on success and STATUS_ERROR otherwise.
 */
static int open_directory(char *dirname, DIRPTR *dp_out)
{
#ifdef WIN32

  char curdir[FILENAME_MAX];
  DIRPTR dp;
  DWORD len;

  if ((dp = (DIRPTR) malloc(sizeof(DIRSTRUCT))) == NULL)
    return STATUS_ERROR;

  /*
   * GetCurrentDirectory returns 0 on failure, and the required buffer
   * size (which is larger than the size passed in) when the buffer is
   * too small to hold the path.
   */
  len = GetCurrentDirectory(FILENAME_MAX, curdir);
  if (len == 0 || len >= FILENAME_MAX) {
    free(dp);
    return STATUS_ERROR;
  }
  if (!SetCurrentDirectory(dirname)) {
    free(dp);
    return STATUS_ERROR;
  }

  /*
   * FindFirstFile reports failure with INVALID_HANDLE_VALUE, not NULL.
   * Put the current directory back before giving up.
   */
  dp->init_flag = 1;
  dp->handle = FindFirstFile("*.*", &dp->dirinfo);
  if (dp->handle == INVALID_HANDLE_VALUE) {
    SetCurrentDirectory(curdir);
    free(dp);
    return STATUS_ERROR;
  }

  if (!SetCurrentDirectory(curdir)) {
    /* Search handles are released with FindClose, not CloseHandle. */
    FindClose(dp->handle);
    free(dp);
    return STATUS_ERROR;
  }

  *dp_out = dp;
  return STATUS_OK;

#else

  if ((*dp_out = opendir(dirname)) != NULL)
    return STATUS_OK;
  else
    return STATUS_ERROR;

#endif
}
19168 
/*
 * read_dirname
 *
 * Return the name of the next entry of the directory listing `dp`, or
 * NULL when there are no more entries.
 *
 * Under WIN32, the first entry was already fetched when the listing was
 * opened (init_flag is set), so the first call just hands it out.
 */
static char *read_dirname(DIRPTR dp)
{
#ifdef WIN32

  if (!dp->init_flag && !FindNextFile(dp->handle, &dp->dirinfo))
    return NULL;

  dp->init_flag = 0;
  return dp->dirinfo.cFileName;

#else

  struct dirent *entry;

  entry = readdir(dp);
  return (entry != NULL ? entry->d_name : NULL);

#endif
}
19195 
/*
 * close_directory
 *
 * End the directory listing `dp` started by open_directory and release
 * the resources it holds.
 */
static void close_directory (DIRPTR dp)
{
#ifdef WIN32

  /*
   * The handle comes from FindFirstFile, and such search handles must be
   * released with FindClose rather than CloseHandle.
   */
  FindClose(dp->handle);
  free(dp);

#else

  closedir(dp);

#endif
}
19209 
isa_file(char * filename)19210 static int isa_file(char *filename)
19211 {
19212   struct stat sbuf;
19213 
19214   if (stat(filename, &sbuf) >= 0 && (sbuf.st_mode & S_IFMT) == S_IFREG)
19215     return 1;
19216   else
19217     return 0;
19218 }
19219 
19220 
19221 /*
19222  * Not currently used, but I'm leaving it here just in case.
19223  *
19224 static int isa_dir(char *directory)
19225 {
19226   struct stat sbuf;
19227 
19228   return (stat(directory, &sbuf) >= 0 && (sbuf.st_mode & S_IFMT) == S_IFDIR);
19229 }
19230  *
19231  *
19232  */
19233 
19234 
get_filesize(char * filename)19235 static int get_filesize(char *filename)
19236 {
19237   struct stat sbuf;
19238 
19239   if (stat(filename, &sbuf) >= 0 && (sbuf.st_mode & S_IFMT) == S_IFREG)
19240     return sbuf.st_size;
19241   else
19242     return 0;
19243 }
19244 
19245 
get_truename(char * filename,char * fileend)19246 static char *get_truename(char *filename, char *fileend)
19247 {
19248   static char buf[FILENAME_MAX+1];
19249   int len;
19250   char ch, *s, *t, *s2;
19251 
19252   s = filename;
19253   t = buf;
19254   if (*s == '~' && (s2 = getenv("HOME")) != NULL) {
19255     while ((*t++ = *s2++)) ;
19256     t--;
19257 
19258     s++;
19259     ch = (fileend != NULL && s == fileend ? '\0' : *s);
19260 
19261     if (ch == dirch) {
19262       if (*(t-1) == dirch)
19263         t--;
19264     }
19265     else if (isalpha(ch))
19266       while (t > buf && *(t-1) != dirch) t--;
19267     else {
19268       t = buf;
19269       s = filename;
19270     }
19271   }
19272 
19273   len = (fileend != NULL ? fileend - s : strlen(s));
19274   if (len > FILENAME_MAX - (t - buf))
19275     len = FILENAME_MAX - (t - buf);
19276   memcpy(t, s, len);
19277   t[len] = '\0';
19278 
19279   return buf;
19280 }
19281 
is_absolute(char * path)19282 static int is_absolute(char *path)
19283 {
19284   int abspath;
19285 
19286   if (path[0] != '~') {
19287 #ifdef WIN32
19288     abspath = (path[0] == dirch ||
19289                (isalpha(path[0]) && path[1] == ':' && path[2] == dirch));
19290 #else
19291     abspath = (path[0] == dirch);
19292 #endif
19293 
19294     return abspath;
19295   }
19296 
19297   return 1;
19298 }
19299 
/*
 * get_today
 *
 * Build the current date from the text returned by ctime, in the form
 * "dd-Mmm-yyyy" (a blank in the tens position of the day is replaced by
 * '0').  The hours and minutes, "hh:mm", are stored as a second string
 * that begins at offset 12 of the same buffer.
 *
 * Returns a pointer to a static buffer, which is filled in afresh on
 * every call.
 */
static char *get_today()
{
  static char buffer[32];
  time_t now;
  char *stamp;

  now = time(NULL);
  stamp = ctime(&now);       /* "Www Mmm dd hh:mm:ss yyyy\n" */

  /* The day of the month. */
  buffer[0] = (isspace(stamp[8]) ? '0' : stamp[8]);
  buffer[1] = stamp[9];
  buffer[2] = '-';

  /* The three-letter month name. */
  memcpy(buffer + 3, stamp + 4, 3);
  buffer[6] = '-';

  /* The four-digit year. */
  memcpy(buffer + 7, stamp + 20, 4);
  buffer[11] = '\0';

  /* The hours and minutes, kept after the date's terminator. */
  memcpy(buffer + 12, stamp + 11, 5);
  buffer[17] = '\0';

  return buffer;
}
19332 
19333