1 /* Generated automatically from squid.h.in by configure. */
2 /*****************************************************************
3  * SQUID - a library of functions for biological sequence analysis
4  * Copyright (C) 1992-2002 Washington University School of Medicine
5  *
6  *     This source code is freely distributed under the terms of the
7  *     GNU General Public License. See the files COPYRIGHT and LICENSE
8  *     for details.
9  *****************************************************************/
10 
11 #ifndef SQUIDH_INCLUDED
12 #define SQUIDH_INCLUDED
13 
14 /* squid.h
15  * Header file for my library of sequence functions.
16  *
17  * CVS $Id: squid.h.in,v 1.5 2002/10/09 14:26:09 eddy Exp)
18  */
19 
#include <limits.h>		/* for ULONG_MAX (sqd_uint64 selection) */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>		/* for sysconf() #define's       */
24 
25 
26 #if DEBUGLEVEL > 0
27 #include <assert.h>		/* for SQD_DASSERT1(), etc.      */
28 #endif
29 
30 /* include clustal's config.h */
31 #ifdef CLUSTALO
32 #include "config.h"
33 #define CLUSTALO 1
34 #else
35 #include "clustal-omega-config.h"
36 #define CLUSTALO 1
37 #endif
38 
39 #ifdef CLUSTALO
40 /* we don't want squidconf.h but our own config header. but, there are
41  * some checks, espcially at the end of squidconf.h might be
42  * necessary for squid to work. They follow after the inclusion of
43  * config.h
44  */
45 #undef DEBUG
46 
47 /* squidconf.h checks:
48  */
49 #if defined HAVE_NTOHL && defined HAVE_NTOHS && defined HAVE_HTONS && defined HAVE_HTONL
50 #define USE_HOST_BYTESWAP_FUNCTIONS 1
51 #endif
52 /* On 64-bit machines like Alphas, strtoull doesn't exist, strotul will work
53  */
54 #if SIZEOF_UNSIGNED_LONG == 8 && defined HAVE_STRTOUL && ! defined HAVE_STRTOULL
55 #define strtoull strtoul
56 #endif
57 
58 #if   defined HAVE_FTELLO && defined HAVE_FSEEKO && SIZEOF_OFF_T == 8
59 #define HAS_64BIT_FILE_OFFSETS 1
60 #elif defined HAVE_FTELLO64 && defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8
61 #define HAS_64BIT_FILE_OFFSETS 1
62 #elif defined HAVE_FTELL64 && defined HAVE_FSEEK64
63 #define HAS_64BIT_FILE_OFFSETS 1
64 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8
65 #define HAS_64BIT_FILE_OFFSETS 1
66 #else
67 #undef HAS_64BIT_FILE_OFFSETS
68 #endif
69 
70 /* The following check seems like nonsense to me (AW), therefore */
71 #if 0
72 /* Stuff to work around Tru64 not having strtoull() -
73  * on systems with 64-bit longs, we can use strtoul()
74  */
75 #undef HAVE_STRTOULL
76 #if ! defined HAVE_STRTOULL && SIZEOF_UNSIGNED_LONG == 8
77 #define strtoull strtoul
78 #endif
79 #endif
80 
81 #else /* CLUSTALO */
82 #include "squidconf.h"		/* #define's generated by ./configure script */
83 #endif
84 
/*****************************************************************
 * Integers of guaranteed size. (used for instance in gsi.c, gsi2.c)
 *
 * sqd_uint16 and sqd_uint32 are the types the ./configure script
 * selected. sqd_uint64 is chosen at compile time from ULONG_MAX
 * (<limits.h>): `unsigned long` is kept where it is wider than 32 bits,
 * and `unsigned long long` is used where `unsigned long` is only 32
 * bits, so this pre-generated header still yields a 64-bit type there.
 *
 * You do need 64-bit integers in the current code; if these types do
 * not have the advertised sizes on your system they must be manually
 * edited to appropriate type definitions.
 *****************************************************************/
typedef unsigned short     sqd_uint16;
typedef unsigned int       sqd_uint32;
#if ULONG_MAX > 0xffffffffUL
typedef unsigned long      sqd_uint64;	/* unsigned long is wider than 32 bits */
#else
typedef unsigned long long sqd_uint64;	/* unsigned long is only 32 bits here  */
#endif
96 
97 #ifdef USE_HOST_BYTESWAP_FUNCTIONS
98 #include <sys/types.h>		/* only for ntohl() and friends. */
99 #include <netinet/in.h>		/* only for ntohl() and friends. */
/* Byte-order conversions mapped directly onto the system's ntohs(),
 * ntohl(), htons() and htonl(). The expansions deliberately carry no
 * trailing semicolon, so each macro behaves like an ordinary function
 * call and can be used inside a larger expression, e.g.
 *     if (sre_ntoh32(x) == magic) ...
 */
#define sre_ntoh16(x) ntohs(x)
#define sre_ntoh32(x) ntohl(x)
#define sre_hton16(x) htons(x)
#define sre_hton32(x) htonl(x)
104 #endif /* USE_HOST_BYTESWAP_FUNCTIONS */
105 
/* Library version information and the library's error status, exposed
 * as globals to any interested program. According to the original note
 * they are defined in iupac.c together with the other globals.
 */
extern char squid_version[];    /* version number                     */
extern char squid_date[];       /* date of release                    */
extern int  squid_errno;        /* error code; one of the SQERR_* values */
113 
114 
115 
/****************************************************
 * Error codes returned by squid library functions
 * (stored in squid_errno)
 ****************************************************/

#define SQERR_OK         0   /* no error                         */
#define SQERR_UNKNOWN    1   /* generic, unidentified error      */
#define SQERR_NODATA     2   /* stream was unexpectedly NULL     */
#define SQERR_MEM        3   /* malloc or realloc failed         */
#define SQERR_NOFILE     4   /* file not found                   */
#define SQERR_FORMAT     5   /* file format not recognized       */
#define SQERR_PARAMETER  6   /* bad parameter passed to function */
#define SQERR_DIVZERO    7   /* error in sre_math.c              */
#define SQERR_INCOMPAT   8   /* incompatible parameters          */
#define SQERR_EOD        9   /* end-of-data (often normal)       */
130 
/****************************************************
 * Single sequence information
 ****************************************************/
#define SQINFO_NAMELEN 128	/* size of name/id/acc buffers (raised 64 -> 128, FS, r274) */
#define SQINFO_DESCLEN 128	/* size of the desc buffer */

/* Per-sequence annotation. `flags` records which of the optional
 * fields carry data (see the SQINFO_* bit definitions).
 */
struct seqinfo_s {
  int   flags;                  /* what extra data are available          */

  /* fixed-size text fields */
  char  name[SQINFO_NAMELEN];   /* sequence name                          */
  char  id[SQINFO_NAMELEN];     /* database identifier                    */
  char  acc[SQINFO_NAMELEN];    /* database accession number              */
  char  desc[SQINFO_DESCLEN];   /* description                            */

  /* coordinates and type */
  int   len;                    /* length of this seq                     */
  int   start;                  /* (1..len) start position on source seq  */
  int   stop;                   /* (1..len) end position on source seq    */
  int   olen;                   /* original length of source seq          */
  int   type;                   /* kRNA, kDNA, kAmino, or kOther          */

  /* optional per-residue annotation strings, indexed 0..len-1 */
  char *ss;                     /* secondary structure string             */
  char *sa;                     /* % side chain surface accessibility     */
  char *co;                     /* secondary structure confidence         */
};
typedef struct seqinfo_s SQINFO;
153 
/* Bit flags for seqinfo_s.flags: one bit per optional field. */
#define SQINFO_NAME  0x0001	/* bit 0  */
#define SQINFO_ID    0x0002	/* bit 1  */
#define SQINFO_ACC   0x0004	/* bit 2  */
#define SQINFO_DESC  0x0008	/* bit 3  */
#define SQINFO_START 0x0010	/* bit 4  */
#define SQINFO_STOP  0x0020	/* bit 5  */
#define SQINFO_LEN   0x0040	/* bit 6  */
#define SQINFO_TYPE  0x0080	/* bit 7  */
#define SQINFO_OLEN  0x0100	/* bit 8  */
#define SQINFO_SS    0x0200	/* bit 9  */
#define SQINFO_SA    0x0400	/* bit 10 */
#define SQINFO_CO    0x0800	/* bit 11 */
166 
167 
168 /****************************************************
169  * Sequence alphabet: see also iupac.c
170  ****************************************************/
171 				/* IUPAC symbols defined globally in iupac.c */
/* One entry of the IUPAC nucleotide symbol table. */
struct iupactype {
  char sym;      /* character representation            */
  char symcomp;  /* complement, as a regular character  */
  char code;     /* binary representation               */
  char comp;     /* binary-encoded complement           */
};

extern struct iupactype iupac[];

#define IUPACSYMNUM 17
180 
/* Alphabet-related global tables (declarations only). */
extern char  *stdcode1[];     /* 1-letter amino acid translation code */
extern char  *stdcode3[];     /* 3-letter amino acid translation code */
extern float  dnafq[];        /* nucleotide occurrence frequencies    */
extern float  aafq[];         /* amino acid occurrence frequencies    */
extern char   aa_alphabet[];  /* amino acid alphabet                  */
extern int    aa_index[];     /* convert 0..19 indices to 0..26       */
187 
188 				/* valid symbols in IUPAC code */
/* Valid symbol sets, as string constants. */
#define NUCLEOTIDES     "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
#define AMINO_ALPHABET  "ACDEFGHIKLMNPQRSTVWY"
#define DNA_ALPHABET    "ACGT"
#define RNA_ALPHABET    "ACGU"
#define WHITESPACE      " \t\n"

/* isgap(c): nonzero if c is one of the gap characters ' ', '.', '_',
 * '-' or '~'. The argument may be evaluated up to five times, so do
 * not pass an expression with side effects.
 */
#define isgap(c) \
  ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~')
196 
197 
198 /****************************************************
199  * Sequence i/o: originally from Don Gilbert's readseq
200  ****************************************************/
201 #include "msa.h"		/* for multiple sequence alignment support   */
202 
203 	/* buffer size for reading in lines from sequence files*/
/* Buffer size for reading in lines from sequence files. */
#define LINEBUFLEN  4096

/* Sequence types parsed by Seqtype().
 * Note that these must match hmmAMINO and hmmNUCLEIC in HMMER.
 */
#define kOtherSeq  0   /* hmmNOTSETYET */
#define kDNA       1
#define kRNA       2   /* hmmNUCLEIC   */
#define kAmino     3   /* hmmAMINO     */
212 
/* Codes for the unaligned sequence file formats.
 *
 * They share one number space with the multiple alignment format
 * codes defined in msa.h:
 *   0     unknown
 *   <100  unaligned formats (this list)
 *   >100  alignment formats
 *
 * Some "legacy" formats are supported only when explicitly requested;
 * SeqfileFormat() does not autodetect them.
 *
 * DON'T REASSIGN THESE CODES. They are written into GSI index files.
 * Adding new codes is fine, but changing the meaning of an existing
 * one will break GSI indices. (Alignment format codes were reassigned
 * when msa.c was created, but before Stockholm format there were no
 * indexed alignment databases.)
 *
 * Retired codes, not to be reused for something else:
 *    3  was A2M       (now an alignment format)
 *   10  was SELEX     (now an alignment format)
 *   11  was MSF       (now an alignment format)
 *   15  was kXPearson, extended FASTA (withdrawn)
 *   17  was Clustal   (now an alignment format)
 */
#define SQFILE_UNKNOWN   0   /* unknown format                  */
#define SQFILE_IG        1   /* Intelligenetics (!)             */
#define SQFILE_GENBANK   2   /* GenBank flatfile                */
#define SQFILE_EMBL      4   /* EMBL or Swissprot flatfile      */
#define SQFILE_GCG       5   /* GCG single sequence files       */
#define SQFILE_STRIDER   6   /* MacStrider (!!)                 */
#define SQFILE_FASTA     7   /* FASTA format: default           */
#define SQFILE_ZUKER     8   /* Zuker MFOLD format (legacy)     */
#define SQFILE_IDRAW     9   /* Idraw-style PostScript (legacy) */
#define SQFILE_PIR      12   /* PIR format                      */
#define SQFILE_RAW      13   /* raw sequence                    */
#define SQFILE_SQUID    14   /* my obsolete squid format        */
#define SQFILE_GCGDATA  16   /* GCG data library file           */
#if defined(CLUSTALO)
#define SQFILE_VIENNA   18   /* Vienna format: concatenated fasta */
#define SQFILE_DUBLIN   19   /* unaligned version of Stockholm    */
#endif

/* Nonzero if fmt is nonzero and below 100. */
#define IsUnalignedFormat(fmt)  ((fmt) && (fmt) < 100)
252 
253 #include "ssi.h"
254 
255 struct ReadSeqVars {
256   FILE   *f;                    /* open file pointer                  */
257   char   *fname;                /* name of file; used for diagnostics */
258   int     linenumber;           /* what line are we on in the file    */
259 
260   char   *buf;                  /* dynamically allocated sre_fgets() buffer */
261   int     buflen;               /* allocation length for buf                */
262 
263   int       ssimode;		/* SSI_OFFSET_I32 or SSI_OFFSET_I64        */
264   SSIOFFSET ssioffset;		/* disk offset to last line read into buf  */
265   SSIOFFSET r_off;		/* offset to start of record               */
266   SSIOFFSET d_off;		/* offset to start of sequence data        */
267 
268   int     rpl;			/* residues per data line for this file; -1 if unset, 0 if invalid */
269   int     lastrpl;		/* rpl on last line seen */
270   int     maxrpl;		/* max rpl on any line of the file */
271   int     bpl;			/* bytes per data line; -1 if unset, 0 if invalid */
272   int     lastbpl;		/* bpl on last line seen */
273   int     maxbpl;		/* max bpl on any line of the file */
274 
275   char   *seq;                  /* growing sequence during parse */
276   SQINFO *sqinfo;	        /* name, id, etc, gathered during parse */
277   char   *sp;
278   int     seqlen;		/* current sequence length */
279   int     maxseq;		/* current allocation length for seq */
280 
281   int     format;		/* format of seqfile we're reading. */
282   int     do_gzip;		/* TRUE if f is a pipe from gzip -dc */
283   int     do_stdin;		/* TRUE if f is stdin */
284 
285   /* An (important) hack for sequential access of multiple alignment files:
286    * we read the whole alignment in,
287    * and then copy it one sequence at a time into seq and sqinfo.
288    * It is active if msa is non NULL.
289    * msa->lastidx is reused/overloaded: used to keep track of what
290    * seq we'll return next.
291    * afp->format is the real format, while SQFILE->format is kMSA.
292    * Because we keep it in the SQFILE structure,
293    * ReadSeq() and friends are always reentrant for multiple seqfiles.
294    */
295   MSA      *msa;
296   MSAFILE  *afp;
297 };
298 typedef struct ReadSeqVars SQFILE;
299 
300 
301 /****************************************************
302  * Cluster analysis and phylogenetic tree support
303  ****************************************************/
304 
/* struct phylo_s - one node of a phylogenetic tree
 *
 * A tree over N sequences is generally stored as an array of phylo_s
 * nodes indexed 0..N-2, with element [0] being the root.
 *
 * Be careful with the child indexes `left` and `right`; they range
 * over 0..2N-2 and have two meanings:
 *   0..N-1   the child is an individual sequence with that index
 *   >= N     the child is the phylo_s node at (index - N)
 */
struct phylo_s {
  int    parent;   /* index of parent, N..2N-2, or -1 for root  */
  int    left;     /* index of one of the branches, 0..2N-2     */
  int    right;    /* index of the other branch, 0..2N-2        */
  float  diff;     /* difference score between seqs             */
  float  lblen;    /* left branch length                        */
  float  rblen;    /* right branch length                       */
  char  *is_in;    /* 0..N-1 flag array, 1 if seq included      */
  int    incnum;   /* number of seqs included at this node      */
};
326 
327 
/* Linkage rule used by cluster analysis: the distance between two
 * clusters is taken as the mean, the maximum, or the minimum distance.
 */
enum clust_strategy {
  CLUSTER_MEAN,   /* cluster by mean distance    */
  CLUSTER_MAX,    /* cluster by maximum distance */
  CLUSTER_MIN     /* cluster by minimum distance */
};
332 
333 /****************************************************
334  * Generic data structure support
335  ****************************************************/
336 
/* struct intstack_s: one element of a pushdown stack of single
 * integers, linked to the next element through `nxt`.
 */
struct intstack_s {
  int                data;  /* the stored integer       */
  struct intstack_s *nxt;   /* next element on the stack */
};
344 
345 /****************************************************
346  * Binary nucleotide alphabet support
347  ****************************************************/
348 
/* Binary encoding of the IUPAC code for nucleotides.
 *
 * Each symbol is a four-bit "word" with one bit per base, which
 * permits rapid degenerate matching:
 *
 *         A  C  G  T/U
 *         0  0  1  0      (this example is G)
 *
 * Degenerate symbols are the bitwise OR of the bases they stand for.
 */
#define NTA    8			/* 1000 */
#define NTC    4			/* 0100 */
#define NTG    2			/* 0010 */
#define NTT    1			/* 0001 */
#define NTU    NTT			/* U shares T's bit */

#define NTN    (NTA | NTC | NTG | NTT)	/* 15 */
#define NTR    (NTA | NTG)		/* 10 */
#define NTY    (NTC | NTT)		/* 5  */
#define NTM    (NTA | NTC)		/* 12 */
#define NTK    (NTG | NTT)		/* 3  */
#define NTS    (NTC | NTG)		/* 6  */
#define NTW    (NTA | NTT)		/* 9  */
#define NTH    (NTA | NTC | NTT)	/* 13 */
#define NTB    (NTC | NTG | NTT)	/* 7  */
#define NTV    (NTA | NTC | NTG)	/* 14 */
#define NTD    (NTA | NTG | NTT)	/* 11 */

#define NTGAP  16			/* GAP */
#define NTEND  0			/* null string terminator */
373 
/* ntmatch(probe, target): bitwise comparison of two binary-encoded
 * nucleotides. Evaluates to 1 if every bit set in target is also set
 * in probe, else 0.
 *
 * The test is sensitive to argument order: probe may be degenerate,
 * but target should not be.
 *
 * Both arguments are fully parenthesized so that compound expressions
 * such as ntmatch(p, NTA | NTC) are grouped as intended; target is
 * evaluated twice, so avoid arguments with side effects.
 */
#define ntmatch(probe, target)  (((probe) & (target)) == (target))
379 
380 /****************************************************
381  * Support for a portable, flexible Getopt()
382  ****************************************************/
383 
/* struct opt_s
 *
 * Describes one command line option that a main() accepts.
 */
struct opt_s {
  char *name;     /* name of option, e.g. "--option1" or "-o" */
  int   single;   /* TRUE if a single letter option           */
  int   argtype;  /* for typechecking, e.g. sqdARG_INT        */
};

/* Acceptable values for opt_s.argtype: */
#define sqdARG_NONE    0   /* no argument                       */
#define sqdARG_INT     1   /* something that atoi() can grok    */
#define sqdARG_FLOAT   2   /* something that atof() can grok    */
#define sqdARG_CHAR    3   /* require single character or digit */
#define sqdARG_STRING  4   /* anything goes                     */
399 
400 /****************************************************
401  * Support for convenient Perl-y regexp matching
402  * See hsregexp.c for copyright notice: this code is derived
403  * from Henry Spencer's freely distributed regexp library.
404  ****************************************************/
405 
/* Number of start/end pointer pairs kept in a sqd_regexp. */
#define NSUBEXP  10

/* Compiled regular expression. Only startp[] and endp[] are meant for
 * callers; the remaining members are for internal use.
 */
typedef struct sqd_regexp {
  char *startp[NSUBEXP];
  char *endp[NSUBEXP];
  char  regstart;    /* Internal use only. */
  char  reganch;     /* Internal use only. */
  char *regmust;     /* Internal use only. */
  int   regmlen;     /* Internal use only. */
  char  program[1];  /* Unwarranted chumminess with compiler. */
} sqd_regexp;
416 
/* Match results, defined and managed by Strparse():
 *   sqd_parse[0]     the substring that matched the whole pattern
 *   sqd_parse[1..9]  the substrings matched by ()'s
 */
extern char *sqd_parse[10];
422 
423 /****************************************************
424  * Portable detection of multiprocessor # of CPUs.
425  *      #include <unistd.h>
426  *      long foo = SQD_NPROC;
427  *      returns the number of available processors.
428  *      if foo == -1, we failed.
429  ****************************************************/
430 
/* POSIX apparently doesn't specify a standard way to make sysconf()
 * report the number of processors on-line. _SC_NPROCESSORS_ONLN is
 * specified by SVR4.0MP. Thanks to W. Gish for help here.
 *
 * SQD_NPROC expands to the matching sysconf() call when one of the
 * known selector names is defined, and to -1 otherwise.
 */
#undef SQD_NPROC
#if defined(_SC_NPROCESSORS_ONLN)	/* Sun Solaris, Digital UNIX */
#define SQD_NPROC  sysconf(_SC_NPROCESSORS_ONLN)
#elif defined(_SC_NPROC_ONLN)		/* Silicon Graphics IRIX */
#define SQD_NPROC  sysconf(_SC_NPROC_ONLN)
#else					/* no known sysconf() selector available */
#define SQD_NPROC  -1
#endif
446 
447 /****************************************************
448  * Three levels of debugging printf's and assert's
449  *      level 1: little impact on verbosity or performance
450  *      level 2: moderate impact
451  *      level 3: high impact
452  * Example:
453  *    SQD_DPRINTF3(("Matrix row %d col %d = %f\n", i, j, val));
454  * Note the double parentheses; these are important.
455  ****************************************************/
456 
/* DEBUGLEVEL defaults to 0 (all debugging macros compiled out). */
#if !defined(DEBUGLEVEL)
#define DEBUGLEVEL 0
#endif

/* Each SQD_DPRINTFn / SQD_DASSERTn pair is active when DEBUGLEVEL is
 * at least n and expands to nothing otherwise. The argument must carry
 * its own parentheses because it is pasted directly after
 * printf / assert.
 */

/* level 1 */
#if (DEBUGLEVEL < 1)
#define SQD_DPRINTF1(x)
#define SQD_DASSERT1(x)
#else
#define SQD_DPRINTF1(x)  printf x
#define SQD_DASSERT1(x)  assert x
#endif

/* level 2 */
#if (DEBUGLEVEL < 2)
#define SQD_DPRINTF2(x)
#define SQD_DASSERT2(x)
#else
#define SQD_DPRINTF2(x)  printf x
#define SQD_DASSERT2(x)  assert x
#endif

/* level 3 */
#if (DEBUGLEVEL < 3)
#define SQD_DPRINTF3(x)
#define SQD_DASSERT3(x)
#else
#define SQD_DPRINTF3(x)  printf x
#define SQD_DASSERT3(x)  assert x
#endif
482 
/* PANIC is for failures of Std C/POSIX functions rather than of
 * squid's own functions. It passes the current source file and line
 * to Panic(), which (per the original note) calls perror() and exits
 * abnormally.
 */
#define PANIC  Panic(__FILE__, __LINE__)

/* Allocation wrappers: forward to sre_malloc() / sre_realloc(),
 * adding the source file and line of the call site.
 */
#define MallocOrDie(x)      sre_malloc(__FILE__, __LINE__, (x))
#define ReallocOrDie(x, y)  sre_realloc(__FILE__, __LINE__, (x), (y))
493 
494 /****************************************************
495  * Miscellaneous macros and defines
496  ****************************************************/
497 
/* Mathematical constants e and pi. */
#define SQDCONST_E   2.71828182845904523536028747135
#define SQDCONST_PI  3.14159265358979323846264338328

/* SWAP(a,b): exchange a and b through a temporary named `swapfoo`,
 * which the caller must declare. Expands to a brace-enclosed block.
 */
#define SWAP(a,b)  { swapfoo = b; b = a; a = swapfoo; }

/* ScalarsEqual(a,b): true if a and b differ by less than 1e-7. */
#define ScalarsEqual(a,b)  (fabs((a) - (b)) < 1e-7)

/* MIN / MAX, unless already provided elsewhere. The selected argument
 * is evaluated twice.
 */
#if !defined(MIN)
#define MIN(a,b)  (((a) < (b)) ? (a) : (b))
#endif
#if !defined(MAX)
#define MAX(a,b)  (((a) > (b)) ? (a) : (b))
#endif
511 
/* Boolean constants, for convenience and (one hopes) clarity in
 * boolean tests. Defined only if not already available.
 */
#if !defined(TRUE)
#define TRUE 1
#endif
#if !defined(FALSE)
#define FALSE 0
#endif

/* Somewhere, there is a universe in which Unix vendors comply with
 * the ANSI C standard. Unfortunately, it is not ours, so supply the
 * exit codes when <stdlib.h> has not.
 */
#if !defined(EXIT_SUCCESS)
#define EXIT_SUCCESS 0
#endif
#if !defined(EXIT_FAILURE)
#define EXIT_FAILURE 1
#endif
530 
531 #include "sqfuncs.h"		/* squid function declarations */
532 #include "sre_random.h"         /* random number generator and samplers */
533 #include "vectorops.h"          /* vector operations  */
534 #endif /* SQUIDH_INCLUDED */
535