1 /* SQUID - A C function library for biological sequence analysis
2  * Copyright (C) 1992-1996 Sean R. Eddy
3  *
4  *    This source code is distributed under terms of the
5  *    GNU General Public License. See the files COPYING
6  *    and GNULICENSE for further details.
7  *
8  */
9 
10 #ifndef SQUIDH_INCLUDED
11 #define SQUIDH_INCLUDED
12 
13 /* squid.h
14  * last modified Sun Aug 15 12:05:58 1993
15  *
16  * Header file for my library of sequence functions.
17  *
18  */
19 
20 #include <stdio.h>
21 #include <math.h>
22 #include <stdlib.h>
23 #include <sys/types.h>		/* only for ntohl() and friends. */
24 #include <netinet/in.h>		/* only for ntohl() and friends. */
25 
26 /* Library version info is made available as a global to
27  * any interested program. These are defined in iupac.c
28  * with the other globals.
29  */
30 extern char squid_version[];	/* version number  */
31 extern char squid_date[];	/* date of release */
32 extern int  squid_errno;	/* error codes     */
33 
34 /****************************************************
35  * Error codes returned by squid library functions (squid_errno)
36  ****************************************************/
37 
38 #define SQERR_OK        0	/* no error                     */
39 #define SQERR_UNKNOWN   1       /* generic error, unidentified  */
40 #define SQERR_NODATA    2	/* unexpectedly NULL stream     */
41 #define SQERR_MEM       3	/* malloc or realloc failed     */
42 #define SQERR_NOFILE    4	/* file not found               */
43 #define SQERR_FORMAT    5	/* file format not recognized   */
44 #define SQERR_PARAMETER 6	/* bad parameter passed to func */
45 #define SQERR_DIVZERO   7	/* error in sre_math.c          */
46 
47 /****************************************************
48  * Single sequence information
49  ****************************************************/
50 #define SQINFO_NAMELEN 64
51 #define SQINFO_DESCLEN 128
52 
53 struct seqinfo_s {
54   int      flags;               /* what extra data are available         */
55   char     name[SQINFO_NAMELEN];/* up to 63 characters of name           */
56   char     id[SQINFO_NAMELEN];	/* up to 63 char of database identifier  */
57   char     acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */
58   char     desc[SQINFO_DESCLEN];/* up to 127 char of description         */
59   int      len;                 /* length of this seq                    */
60   int      start;		/* (1..len) start position on source seq */
61   int      stop;                /* (1..len) end position on source seq   */
62   int      olen;                /* original length of source seq         */
63   int      type;                /* kRNA, kDNA, kAmino, or kOther         */
64   char    *ss;                  /* 0..len-1 secondary structure string   */
65   char    *sa;			/* 0..len-1 % side chain surface access. */
66 };
67 typedef struct seqinfo_s SQINFO;
68 
69 #define SQINFO_NAME  (1 << 0)
70 #define SQINFO_ID    (1 << 1)
71 #define SQINFO_ACC   (1 << 2)
72 #define SQINFO_DESC  (1 << 3)
73 #define SQINFO_START (1 << 4)
74 #define SQINFO_STOP  (1 << 5)
75 #define SQINFO_LEN   (1 << 6)
76 #define SQINFO_TYPE  (1 << 7)
77 #define SQINFO_OLEN  (1 << 8)
78 #define SQINFO_SS    (1 << 9)
79 #define SQINFO_SA    (1 << 10)
80 
81 /****************************************************
82  * Database indexing (GSI index file format)
83  ****************************************************/
84 /* A GSI (generic sequence index) file is composed of
85  * recnum + nfiles + 1 records. Each record contains
86  * three fields; key, file number, and disk offset.
87  * Record 0 contains:
88  *   [ "GSI" ]  [ nfiles ]  [ recnum ]
89  * Records 1..nfiles map file names to file numbers, and contain:
90  *   [ filename ] [ file number, 1..nfiles ] [ 0 (unused) ]
91  * Records nfiles+1 to recnum+nfiles+1 provide disk offset
92  * and file number indices for every key:
93  *   [ key ] [ file number ] [ offset]
94  */
95 typedef unsigned int   sqd_uint32; /* 32 bit integer. */
96 typedef unsigned short sqd_uint16; /* 16 bit integer. */
97 struct gsi_s {
98   FILE        *gsifp;		/* open GSI index file            */
99   sqd_uint16   nfiles;		/* number of files = 16 bit int   */
100   sqd_uint32   recnum;		/* number of records = 32 bit int */
101 };
102 typedef struct gsi_s GSIFILE;
103 
104 #define GSI_KEYSIZE    32         /* keys are 32 bytes long */
105 #define GSI_RECSIZE    38	  /* 32 + 2 + 4 bytes       */
106 /* if ntohl() and friends are not available, you
107  * can slip replacements in by providing sre_ntohl()
108  * functions. (i.e., there is a possible portability problem here.)
109  */
110 #define sre_ntohl(x)  ntohl(x);
111 #define sre_ntohs(x)  ntohs(x);
112 
113 /****************************************************
114  * Sequence alphabet: see also iupac.c
115  ****************************************************/
116 				/* IUPAC symbols defined globally in iupac.c */
117 struct iupactype {
118   char       sym;		/* character representation */
119   char       symcomp;           /* complement (regular char */
120   char       code;		/* my binary rep */
121   char       comp;              /* binary encoded complement */
122 };
123 extern struct iupactype iupac[];
124 #define IUPACSYMNUM 17
125 
126 extern char    *stdcode1[];	/* 1-letter amino acid translation code */
127 extern char    *stdcode3[];	/* 3-letter amino acid translation code */
128 extern float   aafq[];		/* amino acid occurrence frequencies    */
129 extern char     aa_alphabet[];  /* amino acid alphabet                  */
130 extern int      aa_index[];     /* convert 0..19 indices to 0..26       */
131 
132 				/* valid symbols in IUPAC code */
133 #define NUCLEOTIDES    "ACGTUNRYMKSWHBVDacgtunrymkswhbvd"
134 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY"
135 #define DNA_ALPHABET   "ACGT"
136 #define RNA_ALPHABET   "ACGU"
137 #define WHITESPACE     " \t\n"
138 
139 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-')
140 
141 /****************************************************
142  * Alignment information
143  ****************************************************/
144 
145 /* Structure: aliinfo_s
146  *
147  * Purpose:   Optional information returned from an alignment file.
148  *
149  *            flags: always used. Flags for which info is valid/alloced.
150  *
151  *            alen: mandatory. Alignments are always flushed right
152  *                  with gaps so that all aseqs are the same length, alen.
153  *                  Available for all alignment formats.
154  *
155  *            nseq: mandatory. Aligned seqs are indexed 0..nseq-1.
156  *
157  *            wgt:  0..nseq-1 vector of sequence weights. Mandatory.
158  *                  If not explicitly set, weights are initialized to 1.0.
159  *
160  *            cs:   0..alen-1, just like the alignment. Contains single-letter
161  *                  secondary structure codes for consensus structure; "<>^+"
162  *                  for RNA, "EHL." for protein. May be NULL if unavailable
163  *                  from seqfile. Only available for SELEX format files.
164  *
165  *            rf:   0..alen-1, just like the alignment. rf is an arbitrary string
166  *                  of characters, used for annotating columns. Blanks are
167  *                  interpreted as non-canonical columns and anything else is
168  *                  considered canonical. Only available from SELEX files.
169  *
170  *            sqinfo: mandatory. Array of 0..nseq-1
171  *                  per-sequence information structures, carrying
172  *                  name, id, accession, coords.
173  *
174  */
175 struct aliinfo_s {
176   int               flags;      /* flags for what info is valid             */
177   int               alen;	/* length of alignment (columns)            */
178   int               nseq;       /* number of seqs in alignment              */
179   char              au[64];	/* "author" information                     */
180   float            *wgt;	/* sequence weights [0..nseq-1]             */
181   char             *cs;         /* consensus secondary structure string     */
182   char             *rf;         /* reference coordinate system              */
183   struct seqinfo_s *sqinfo;     /* name, id, coord info for each sequence   */
184 };
185 typedef struct aliinfo_s AINFO;
186 
187 #define AINFO_AUTH    (1 << 0)
188 #define AINFO_CS      (1 << 1)
189 #define AINFO_RF      (1 << 2)
190 
191 
192 /****************************************************
193  * Sequence i/o: originally from Don Gilbert's readseq
194  ****************************************************/
195 	/* buffer size for reading in lines from sequence files*/
196 #define LINEBUFLEN  4096
197 
198 /* sequence types parsed by Seqtype()                          */
199 /* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */
200 #define kOtherSeq   0
201 #define kDNA        1
202 #define kRNA        2
203 #define kAmino      3
204 
205 /* Sequence file formats recognized */
206 #define kUnknown        0   /* format not determinable */
207 #define kIG             1
208 #define kGenBank        2
209 #define kA2M            3	/* aligned FASTA (A2M is UCSC's terminology) */
210 #define kEMBL           4
211 #define kGCG            5
212 #define kStrider        6
213 #define kPearson        7
214 #define kZuker          8
215 #define kIdraw          9	/* idraw-style PostScript (write only)       */
216 #define kSelex          10	/* my flat text alignment format             */
217 #define kMSF		11	/* GCG MSF multiple alignment format         */
218 #define kPIR            12      /* PIR-CODATA format                         */
219 #define kRaw            13      /* unformatted, raw sequence (output only)   */
220 #define kSquid          14	/* my sequence database format               */
221 				/* 15 was kXPearson, extended Pearson        */
222 #define kGCGdata        16	/* GCG data library format                   */
223 #define kClustal        17	/* Clustal V or W multiple alignment format  */
224 
225 #define kMinFormat      1	/* SRE: kUnknown doesn't count */
226 #define kMaxFormat      17
227 #define kNumFormats     (kMaxFormat + 1)
228 #define kNoformat       -1	/* format not tested */
229 
230 struct ReadSeqVars {
231   FILE   *f;
232   char    sbuffer[LINEBUFLEN];	/* current line we're working on */
233 
234   char   *seq;
235   SQINFO *sqinfo;	/* name, id, etc. */
236   char   *sp;
237   int     seqlen;
238   int     maxseq;
239 
240   int     longline;		/* TRUE if last fgets() didn't have a \n */
241   int     dash_equals_n;        /* a hack - affects EMBL reading, to deal with Staden
242 				   experiment files */
243   int     do_gzip;		/* TRUE if f is a pipe from gzip -dc */
244   int     do_stdin;		/* TRUE if f is stdin */
245   int     format;		/* format of seqfile we're reading */
246 
247 
248   /* The ali_ section of the structure is a hack for sequential access
249    * of multiple alignment files: we read the whole alignment in,
250    * and then copy it one sequence at a time into seq and sqinfo.
251    * It is active if ali_num > 0. Because we keep it in the SQFILE structure,
252    * ReadSeq() and friends are always reentrant for multiple seqfiles.
253    */
254   char  **ali_aseqs;
255   AINFO   ali_ainfo;
256   int     ali_curridx;		/* next index to copy and return [0..ali_num-1]*/
257 };
258 typedef struct ReadSeqVars SQFILE;
259 
260 
261 /****************************************************
262  * Cluster analysis and phylogenetic tree support
263  ****************************************************/
264 
265 /* struct phylo_s - a phylogenetic tree
266  *
267  * For N sequences, there will generally be an array of 0..N-2
268  * phylo_s structures representing the nodes of a tree.
269  * [0] is the root. The indexes of left and
270  * right children are somewhat confusing so be careful. The
271  * indexes can have values of 0..2N-2. If they are 0..N-1, they
272  * represent pointers to individual sequences. If they are
273  * >= N, they represent pointers to a phylo_s structure
274  * at (index - N).
275  */
276 struct phylo_s {
277   int    parent;                /* index of parent, N..2N-2, or -1 for root */
278   int    left;			/* index of one of the branches, 0..2N-2 */
279   int    right;			/* index of other branch, 0..2N-2        */
280   float  diff;			/* difference score between seqs         */
281   float  lblen;      		/* left branch length                    */
282   float  rblen;                 /* right branch length                   */
283   char  *is_in;                 /* 0..N-1 flag array, 1 if seq included  */
284   int    incnum;                /* number of seqs included at this node  */
285 };
286 
287 
288 /* Strategies for cluster analysis; cluster by mean distance,
289  * minimum distance, or maximum distance.
290  */
291 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN };
292 
293 /****************************************************
294  * Generic data structure support
295  ****************************************************/
296 
297 /* a struct intstack_s implements a pushdown stack for storing
298  * single integers.
299  */
300 struct intstack_s {
301   int                data;
302   struct intstack_s *nxt;
303 };
304 
305 /****************************************************
306  * Binary nucleotide alphabet support
307  ****************************************************/
308 
309 /* Binary encoding of the IUPAC code for nucleotides
310  *
311  *    four-bit "word", permitting rapid degenerate matching
312  *         A  C  G  T/U
313  *         0  0  1  0
314  */
315 #define NTA 8
316 #define NTC 4
317 #define NTG 2
318 #define NTT 1
319 #define NTU 1
320 #define NTN 15			/* A|C|G|T */
321 #define NTR 10			/* A|G */
322 #define NTY 5			/* C|T */
323 #define NTM 12			/* A|C */
324 #define NTK 3			/* G|T */
325 #define NTS 6			/* C|G */
326 #define NTW 9			/* A|T */
327 #define NTH 13			/* A|C|T */
328 #define NTB 7			/* C|G|T */
329 #define NTV 14			/* A|C|G */
330 #define NTD 11			/* A|G|T */
331 #define NTGAP 16		/* GAP */
332 #define NTEND 0			/* null string terminator */
333 
334 /* ntmatch(): bitwise comparison of two nuc's
335  * note that it's sensitive to the order;
336  * probe may be degenerate but target should not be
337  */
338 #define ntmatch(probe, target)  ((probe & target) == target)
339 
340 /****************************************************
341  * Support for a portable, flexible Getopt()
342  ****************************************************/
343 
344 /* Structure: opt_s
345  *
346  * Structure for declaring options to a main().
347  */
348 struct opt_s {
349   char *name;			/* name of option, e.g. "--option1" or "-o" */
350   int   single;			/* TRUE if a single letter option           */
351   int   argtype;		/* for typechecking, e.g. sqdARG_INT        */
352 };
353 				/* acceptable argtype's...           */
354 #define sqdARG_NONE   0		/* no argument                       */
355 #define sqdARG_INT    1		/* something that atoi() can grok    */
356 #define sqdARG_FLOAT  2		/* something that atof() can grok    */
357 #define sqdARG_CHAR   3		/* require single character or digit */
358 #define sqdARG_STRING 4		/* anything goes                     */
359 
360 /****************************************************
361  * Support for convenient Perl-y regexp matching
362  *
363  * Strparse() defines and manages these.
364  * sqd_parse[0] contains the substring that matched the pattern.
365  * sqd_parse[1-9] contain substrings matched with ()'s.
366  ****************************************************/
367 
368 extern char *sqd_parse[10];
369 
370 /****************************************************
371  * Miscellaneous macros and defines
372  ****************************************************/
373 
374 #define CHOOSE(a)     ((int) (sre_random() * (a)))
375 				/* must declare swapfoo to use SWAP() */
376 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;}
377 
378 #define Free2DArray(ptr, n) \
379 { int fooidx;\
380   if (ptr != NULL) { \
381     for (fooidx = 0; fooidx < (n); fooidx++) if (ptr[fooidx] != NULL) free(ptr[fooidx]);\
382     free(ptr);\
383   } }
384 
385 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7)
386 
387 #ifndef MIN
388 #define MIN(a,b)         ((a<b)?a:b)
389 #endif
390 #ifndef MAX
391 #define MAX(a,b)         ((a>b)?a:b)
392 #endif
393 
394 /* For convenience and (one hopes) clarity in boolean tests:
395  */
396 #ifndef TRUE
397 #define TRUE 1
398 #endif
399 #ifndef FALSE
400 #define FALSE 0
401 #endif
402 
403 /* Somewhere, there is a universe in which Unix vendors comply
404  * with the ANSI C standard. Unfortunately, it is not ours:
405  */
406 #ifndef EXIT_SUCCESS
407 #define EXIT_SUCCESS 0
408 #endif
409 #ifndef EXIT_FAILURE
410 #define EXIT_FAILURE 1
411 #endif
412 
413 #include "sqfuncs.h"		/* squid function declarations */
414 #endif /* SQUIDH_INCLUDED */
415