1 /* SQUID - A C function library for biological sequence analysis 2 * Copyright (C) 1992-1996 Sean R. Eddy 3 * 4 * This source code is distributed under terms of the 5 * GNU General Public License. See the files COPYING 6 * and GNULICENSE for further details. 7 * 8 */ 9 10 #ifndef SQUIDH_INCLUDED 11 #define SQUIDH_INCLUDED 12 13 /* squid.h 14 * last modified Sun Aug 15 12:05:58 1993 15 * 16 * Header file for my library of sequence functions. 17 * 18 */ 19 20 #include <stdio.h> 21 #include <math.h> 22 #include <stdlib.h> 23 #include <sys/types.h> /* only for ntohl() and friends. */ 24 #include <netinet/in.h> /* only for ntohl() and friends. */ 25 26 /* Library version info is made available as a global to 27 * any interested program. These are defined in iupac.c 28 * with the other globals. 29 */ 30 extern char squid_version[]; /* version number */ 31 extern char squid_date[]; /* date of release */ 32 extern int squid_errno; /* error codes */ 33 34 /**************************************************** 35 * Error codes returned by squid library functions (squid_errno) 36 ****************************************************/ 37 38 #define SQERR_OK 0 /* no error */ 39 #define SQERR_UNKNOWN 1 /* generic error, unidentified */ 40 #define SQERR_NODATA 2 /* unexpectedly NULL stream */ 41 #define SQERR_MEM 3 /* malloc or realloc failed */ 42 #define SQERR_NOFILE 4 /* file not found */ 43 #define SQERR_FORMAT 5 /* file format not recognized */ 44 #define SQERR_PARAMETER 6 /* bad parameter passed to func */ 45 #define SQERR_DIVZERO 7 /* error in sre_math.c */ 46 47 /**************************************************** 48 * Single sequence information 49 ****************************************************/ 50 #define SQINFO_NAMELEN 64 51 #define SQINFO_DESCLEN 128 52 53 struct seqinfo_s { 54 int flags; /* what extra data are available */ 55 char name[SQINFO_NAMELEN];/* up to 63 characters of name */ 56 char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */ 57 char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */ 58 char desc[SQINFO_DESCLEN];/* up to 127 char of description */ 59 int len; /* length of this seq */ 60 int start; /* (1..len) start position on source seq */ 61 int stop; /* (1..len) end position on source seq */ 62 int olen; /* original length of source seq */ 63 int type; /* kRNA, kDNA, kAmino, or kOther */ 64 char *ss; /* 0..len-1 secondary structure string */ 65 char *sa; /* 0..len-1 % side chain surface access. */ 66 }; 67 typedef struct seqinfo_s SQINFO; 68 69 #define SQINFO_NAME (1 << 0) 70 #define SQINFO_ID (1 << 1) 71 #define SQINFO_ACC (1 << 2) 72 #define SQINFO_DESC (1 << 3) 73 #define SQINFO_START (1 << 4) 74 #define SQINFO_STOP (1 << 5) 75 #define SQINFO_LEN (1 << 6) 76 #define SQINFO_TYPE (1 << 7) 77 #define SQINFO_OLEN (1 << 8) 78 #define SQINFO_SS (1 << 9) 79 #define SQINFO_SA (1 << 10) 80 81 /**************************************************** 82 * Database indexing (GSI index file format) 83 ****************************************************/ 84 /* A GSI (generic sequence index) file is composed of 85 * recnum + nfiles + 1 records. Each record contains 86 * three fields; key, file number, and disk offset. 87 * Record 0 contains: 88 * [ "GSI" ] [ nfiles ] [ recnum ] 89 * Records 1..nfiles map file names to file numbers, and contain: 90 * [ filename ] [ file number, 1..nfiles ] [ 0 (unused) ] 91 * Records nfiles+1 to recnum+nfiles+1 provide disk offset 92 * and file number indices for every key: 93 * [ key ] [ file number ] [ offset] 94 */ 95 typedef unsigned int sqd_uint32; /* 32 bit integer. */ 96 typedef unsigned short sqd_uint16; /* 16 bit integer. */ 97 struct gsi_s { 98 FILE *gsifp; /* open GSI index file */ 99 sqd_uint16 nfiles; /* number of files = 16 bit int */ 100 sqd_uint32 recnum; /* number of records = 32 bit int */ 101 }; 102 typedef struct gsi_s GSIFILE; 103 104 #define GSI_KEYSIZE 32 /* keys are 32 bytes long */ 105 #define GSI_RECSIZE 38 /* 32 + 2 + 4 bytes */ 106 /* if ntohl() and friends are not available, you 107 * can slip replacements in by providing sre_ntohl() 108 * functions. (i.e., there is a possible portability problem here.) 109 */ 110 #define sre_ntohl(x) ntohl(x); 111 #define sre_ntohs(x) ntohs(x); 112 113 /**************************************************** 114 * Sequence alphabet: see also iupac.c 115 ****************************************************/ 116 /* IUPAC symbols defined globally in iupac.c */ 117 struct iupactype { 118 char sym; /* character representation */ 119 char symcomp; /* complement (regular char */ 120 char code; /* my binary rep */ 121 char comp; /* binary encoded complement */ 122 }; 123 extern struct iupactype iupac[]; 124 #define IUPACSYMNUM 17 125 126 extern char *stdcode1[]; /* 1-letter amino acid translation code */ 127 extern char *stdcode3[]; /* 3-letter amino acid translation code */ 128 extern float aafq[]; /* amino acid occurrence frequencies */ 129 extern char aa_alphabet[]; /* amino acid alphabet */ 130 extern int aa_index[]; /* convert 0..19 indices to 0..26 */ 131 132 /* valid symbols in IUPAC code */ 133 #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" 134 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" 135 #define DNA_ALPHABET "ACGT" 136 #define RNA_ALPHABET "ACGU" 137 #define WHITESPACE " \t\n" 138 139 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-') 140 141 /**************************************************** 142 * Alignment information 143 ****************************************************/ 144 145 /* Structure: aliinfo_s 146 * 147 * Purpose: Optional information returned from an alignment file. 148 * 149 * flags: always used. Flags for which info is valid/alloced. 150 * 151 * alen: mandatory. Alignments are always flushed right 152 * with gaps so that all aseqs are the same length, alen. 153 * Available for all alignment formats. 154 * 155 * nseq: mandatory. Aligned seqs are indexed 0..nseq-1. 156 * 157 * wgt: 0..nseq-1 vector of sequence weights. Mandatory. 158 * If not explicitly set, weights are initialized to 1.0. 159 * 160 * cs: 0..alen-1, just like the alignment. Contains single-letter 161 * secondary structure codes for consensus structure; "<>^+" 162 * for RNA, "EHL." for protein. May be NULL if unavailable 163 * from seqfile. Only available for SELEX format files. 164 * 165 * rf: 0..alen-1, just like the alignment. rf is an arbitrary string 166 * of characters, used for annotating columns. Blanks are 167 * interpreted as non-canonical columns and anything else is 168 * considered canonical. Only available from SELEX files. 169 * 170 * sqinfo: mandatory. Array of 0..nseq-1 171 * per-sequence information structures, carrying 172 * name, id, accession, coords. 173 * 174 */ 175 struct aliinfo_s { 176 int flags; /* flags for what info is valid */ 177 int alen; /* length of alignment (columns) */ 178 int nseq; /* number of seqs in alignment */ 179 char au[64]; /* "author" information */ 180 float *wgt; /* sequence weights [0..nseq-1] */ 181 char *cs; /* consensus secondary structure string */ 182 char *rf; /* reference coordinate system */ 183 struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */ 184 }; 185 typedef struct aliinfo_s AINFO; 186 187 #define AINFO_AUTH (1 << 0) 188 #define AINFO_CS (1 << 1) 189 #define AINFO_RF (1 << 2) 190 191 192 /**************************************************** 193 * Sequence i/o: originally from Don Gilbert's readseq 194 ****************************************************/ 195 /* buffer size for reading in lines from sequence files*/ 196 #define LINEBUFLEN 4096 197 198 /* sequence types parsed by Seqtype() */ 199 /* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */ 200 #define kOtherSeq 0 201 #define kDNA 1 202 #define kRNA 2 203 #define kAmino 3 204 205 /* Sequence file formats recognized */ 206 #define kUnknown 0 /* format not determinable */ 207 #define kIG 1 208 #define kGenBank 2 209 #define kA2M 3 /* aligned FASTA (A2M is UCSC's terminology) */ 210 #define kEMBL 4 211 #define kGCG 5 212 #define kStrider 6 213 #define kPearson 7 214 #define kZuker 8 215 #define kIdraw 9 /* idraw-style PostScript (write only) */ 216 #define kSelex 10 /* my flat text alignment format */ 217 #define kMSF 11 /* GCG MSF multiple alignment format */ 218 #define kPIR 12 /* PIR-CODATA format */ 219 #define kRaw 13 /* unformatted, raw sequence (output only) */ 220 #define kSquid 14 /* my sequence database format */ 221 /* 15 was kXPearson, extended Pearson */ 222 #define kGCGdata 16 /* GCG data library format */ 223 #define kClustal 17 /* Clustal V or W multiple alignment format */ 224 225 #define kMinFormat 1 /* SRE: kUnknown doesn't count */ 226 #define kMaxFormat 17 227 #define kNumFormats (kMaxFormat + 1) 228 #define kNoformat -1 /* format not tested */ 229 230 struct ReadSeqVars { 231 FILE *f; 232 char sbuffer[LINEBUFLEN]; /* current line we're working on */ 233 234 char *seq; 235 SQINFO *sqinfo; /* name, id, etc. */ 236 char *sp; 237 int seqlen; 238 int maxseq; 239 240 int longline; /* TRUE if last fgets() didn't have a \n */ 241 int dash_equals_n; /* a hack - affects EMBL reading, to deal with Staden 242 experiment files */ 243 int do_gzip; /* TRUE if f is a pipe from gzip -dc */ 244 int do_stdin; /* TRUE if f is stdin */ 245 int format; /* format of seqfile we're reading */ 246 247 248 /* The ali_ section of the structure is a hack for sequential access 249 * of multiple alignment files: we read the whole alignment in, 250 * and then copy it one sequence at a time into seq and sqinfo. 251 * It is active if ali_num > 0. Because we keep it in the SQFILE structure, 252 * ReadSeq() and friends are always reentrant for multiple seqfiles. 253 */ 254 char **ali_aseqs; 255 AINFO ali_ainfo; 256 int ali_curridx; /* next index to copy and return [0..ali_num-1]*/ 257 }; 258 typedef struct ReadSeqVars SQFILE; 259 260 261 /**************************************************** 262 * Cluster analysis and phylogenetic tree support 263 ****************************************************/ 264 265 /* struct phylo_s - a phylogenetic tree 266 * 267 * For N sequences, there will generally be an array of 0..N-2 268 * phylo_s structures representing the nodes of a tree. 269 * [0] is the root. The indexes of left and 270 * right children are somewhat confusing so be careful. The 271 * indexes can have values of 0..2N-2. If they are 0..N-1, they 272 * represent pointers to individual sequences. If they are 273 * >= N, they represent pointers to a phylo_s structure 274 * at (index - N). 275 */ 276 struct phylo_s { 277 int parent; /* index of parent, N..2N-2, or -1 for root */ 278 int left; /* index of one of the branches, 0..2N-2 */ 279 int right; /* index of other branch, 0..2N-2 */ 280 float diff; /* difference score between seqs */ 281 float lblen; /* left branch length */ 282 float rblen; /* right branch length */ 283 char *is_in; /* 0..N-1 flag array, 1 if seq included */ 284 int incnum; /* number of seqs included at this node */ 285 }; 286 287 288 /* Strategies for cluster analysis; cluster by mean distance, 289 * minimum distance, or maximum distance. 290 */ 291 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN }; 292 293 /**************************************************** 294 * Generic data structure support 295 ****************************************************/ 296 297 /* a struct intstack_s implements a pushdown stack for storing 298 * single integers. 299 */ 300 struct intstack_s { 301 int data; 302 struct intstack_s *nxt; 303 }; 304 305 /**************************************************** 306 * Binary nucleotide alphabet support 307 ****************************************************/ 308 309 /* Binary encoding of the IUPAC code for nucleotides 310 * 311 * four-bit "word", permitting rapid degenerate matching 312 * A C G T/U 313 * 0 0 1 0 314 */ 315 #define NTA 8 316 #define NTC 4 317 #define NTG 2 318 #define NTT 1 319 #define NTU 1 320 #define NTN 15 /* A|C|G|T */ 321 #define NTR 10 /* A|G */ 322 #define NTY 5 /* C|T */ 323 #define NTM 12 /* A|C */ 324 #define NTK 3 /* G|T */ 325 #define NTS 6 /* C|G */ 326 #define NTW 9 /* A|T */ 327 #define NTH 13 /* A|C|T */ 328 #define NTB 7 /* C|G|T */ 329 #define NTV 14 /* A|C|G */ 330 #define NTD 11 /* A|G|T */ 331 #define NTGAP 16 /* GAP */ 332 #define NTEND 0 /* null string terminator */ 333 334 /* ntmatch(): bitwise comparison of two nuc's 335 * note that it's sensitive to the order; 336 * probe may be degenerate but target should not be 337 */ 338 #define ntmatch(probe, target) ((probe & target) == target) 339 340 /**************************************************** 341 * Support for a portable, flexible Getopt() 342 ****************************************************/ 343 344 /* Structure: opt_s 345 * 346 * Structure for declaring options to a main(). 347 */ 348 struct opt_s { 349 char *name; /* name of option, e.g. "--option1" or "-o" */ 350 int single; /* TRUE if a single letter option */ 351 int argtype; /* for typechecking, e.g. sqdARG_INT */ 352 }; 353 /* acceptable argtype's... */ 354 #define sqdARG_NONE 0 /* no argument */ 355 #define sqdARG_INT 1 /* something that atoi() can grok */ 356 #define sqdARG_FLOAT 2 /* something that atof() can grok */ 357 #define sqdARG_CHAR 3 /* require single character or digit */ 358 #define sqdARG_STRING 4 /* anything goes */ 359 360 /**************************************************** 361 * Support for convenient Perl-y regexp matching 362 * 363 * Strparse() defines and manages these. 364 * sqd_parse[0] contains the substring that matched the pattern. 365 * sqd_parse[1-9] contain substrings matched with ()'s. 366 ****************************************************/ 367 368 extern char *sqd_parse[10]; 369 370 /**************************************************** 371 * Miscellaneous macros and defines 372 ****************************************************/ 373 374 #define CHOOSE(a) ((int) (sre_random() * (a))) 375 /* must declare swapfoo to use SWAP() */ 376 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;} 377 378 #define Free2DArray(ptr, n) \ 379 { int fooidx;\ 380 if (ptr != NULL) { \ 381 for (fooidx = 0; fooidx < (n); fooidx++) if (ptr[fooidx] != NULL) free(ptr[fooidx]);\ 382 free(ptr);\ 383 } } 384 385 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7) 386 387 #ifndef MIN 388 #define MIN(a,b) ((a<b)?a:b) 389 #endif 390 #ifndef MAX 391 #define MAX(a,b) ((a>b)?a:b) 392 #endif 393 394 /* For convenience and (one hopes) clarity in boolean tests: 395 */ 396 #ifndef TRUE 397 #define TRUE 1 398 #endif 399 #ifndef FALSE 400 #define FALSE 0 401 #endif 402 403 /* Somewhere, there is a universe in which Unix vendors comply 404 * with the ANSI C standard. Unfortunately, it is not ours: 405 */ 406 #ifndef EXIT_SUCCESS 407 #define EXIT_SUCCESS 0 408 #endif 409 #ifndef EXIT_FAILURE 410 #define EXIT_FAILURE 1 411 #endif 412 413 #include "sqfuncs.h" /* squid function declarations */ 414 #endif /* SQUIDH_INCLUDED */ 415