1 /* Generated automatically from squid.h.in by configure. */ 2 /***************************************************************** 3 * SQUID - a library of functions for biological sequence analysis 4 * Copyright (C) 1992-2002 Washington University School of Medicine 5 * 6 * This source code is freely distributed under the terms of the 7 * GNU General Public License. See the files COPYRIGHT and LICENSE 8 * for details. 9 *****************************************************************/ 10 11 #ifndef SQUIDH_INCLUDED 12 #define SQUIDH_INCLUDED 13 14 /* squid.h 15 * Header file for my library of sequence functions. 16 * 17 * CVS $Id: squid.h.in,v 1.5 2002/10/09 14:26:09 eddy Exp) 18 */ 19 20 #include <stdio.h> 21 #include <math.h> 22 #include <stdlib.h> 23 #include <unistd.h> /* for sysconf() #define's */ 24 25 26 #if DEBUGLEVEL > 0 27 #include <assert.h> /* for SQD_DASSERT1(), etc. */ 28 #endif 29 30 /* include clustal's config.h */ 31 #ifdef CLUSTALO 32 #include "config.h" 33 #define CLUSTALO 1 34 #else 35 #include "clustal-omega-config.h" 36 #define CLUSTALO 1 37 #endif 38 39 #ifdef CLUSTALO 40 /* we don't want squidconf.h but our own config header. but, there are 41 * some checks, espcially at the end of squidconf.h might be 42 * necessary for squid to work. They follow after the inclusion of 43 * config.h 44 */ 45 #undef DEBUG 46 47 /* squidconf.h checks: 48 */ 49 #if defined HAVE_NTOHL && defined HAVE_NTOHS && defined HAVE_HTONS && defined HAVE_HTONL 50 #define USE_HOST_BYTESWAP_FUNCTIONS 1 51 #endif 52 /* On 64-bit machines like Alphas, strtoull doesn't exist, strotul will work 53 */ 54 #if SIZEOF_UNSIGNED_LONG == 8 && defined HAVE_STRTOUL && ! defined HAVE_STRTOULL 55 #define strtoull strtoul 56 #endif 57 58 #if defined HAVE_FTELLO && defined HAVE_FSEEKO && SIZEOF_OFF_T == 8 59 #define HAS_64BIT_FILE_OFFSETS 1 60 #elif defined HAVE_FTELLO64 && defined HAVE_FSEEKO64 && SIZEOF_OFF64_T == 8 61 #define HAS_64BIT_FILE_OFFSETS 1 62 #elif defined HAVE_FTELL64 && defined HAVE_FSEEK64 63 #define HAS_64BIT_FILE_OFFSETS 1 64 #elif defined ARITHMETIC_FPOS_T && SIZEOF_FPOS_T == 8 65 #define HAS_64BIT_FILE_OFFSETS 1 66 #else 67 #undef HAS_64BIT_FILE_OFFSETS 68 #endif 69 70 /* The following check seems like nonsense to me (AW), therefore */ 71 #if 0 72 /* Stuff to work around Tru64 not having strtoull() - 73 * on systems with 64-bit longs, we can use strtoul() 74 */ 75 #undef HAVE_STRTOULL 76 #if ! defined HAVE_STRTOULL && SIZEOF_UNSIGNED_LONG == 8 77 #define strtoull strtoul 78 #endif 79 #endif 80 81 #else /* CLUSTALO */ 82 #include "squidconf.h" /* #define's generated by ./configure script */ 83 #endif 84 85 /***************************************************************** 86 * Integers of guaranteed size. (used for instance in gsi.c, gsi2.c) 87 * These are set by the ./configure script; if they show up as FIXME, 88 * they must be manually edited to appropriate type definitions. You 89 * do need 64-bit integers in the current code; email me if this 90 * prevents you from compiling SQUID and tell me your system (I don't 91 * know of any systems that don't have 64-bit integers these days). 92 *****************************************************************/ 93 typedef unsigned short sqd_uint16; 94 typedef unsigned int sqd_uint32; 95 typedef unsigned long sqd_uint64; 96 97 #ifdef USE_HOST_BYTESWAP_FUNCTIONS 98 #include <sys/types.h> /* only for ntohl() and friends. */ 99 #include <netinet/in.h> /* only for ntohl() and friends. */ 100 #define sre_ntoh16(x) ntohs(x); 101 #define sre_ntoh32(x) ntohl(x); 102 #define sre_hton16(x) htons(x); 103 #define sre_hton32(x) htonl(x); 104 #endif /* USE_HOST_BYTESWAP_FUNCTIONS */ 105 106 /* Library version info is made available as a global to 107 * any interested program. These are defined in iupac.c 108 * with the other globals. 109 */ 110 extern char squid_version[]; /* version number */ 111 extern char squid_date[]; /* date of release */ 112 extern int squid_errno; /* error codes */ 113 114 115 116 /**************************************************** 117 * Error codes returned by squid library functions (squid_errno) 118 ****************************************************/ 119 120 #define SQERR_OK 0 /* no error */ 121 #define SQERR_UNKNOWN 1 /* generic error, unidentified */ 122 #define SQERR_NODATA 2 /* unexpectedly NULL stream */ 123 #define SQERR_MEM 3 /* malloc or realloc failed */ 124 #define SQERR_NOFILE 4 /* file not found */ 125 #define SQERR_FORMAT 5 /* file format not recognized */ 126 #define SQERR_PARAMETER 6 /* bad parameter passed to func */ 127 #define SQERR_DIVZERO 7 /* error in sre_math.c */ 128 #define SQERR_INCOMPAT 8 /* incompatible parameters */ 129 #define SQERR_EOD 9 /* end-of-data (often normal) */ 130 131 /**************************************************** 132 * Single sequence information 133 ****************************************************/ 134 #define SQINFO_NAMELEN 128 /* increased 64 -> 128, FS, r274 -> */ 135 #define SQINFO_DESCLEN 128 136 137 struct seqinfo_s { 138 int flags; /* what extra data are available */ 139 char name[SQINFO_NAMELEN];/* up to 63 characters of name */ 140 char id[SQINFO_NAMELEN]; /* up to 63 char of database identifier */ 141 char acc[SQINFO_NAMELEN]; /* up to 63 char of database accession # */ 142 char desc[SQINFO_DESCLEN];/* up to 127 char of description */ 143 int len; /* length of this seq */ 144 int start; /* (1..len) start position on source seq */ 145 int stop; /* (1..len) end position on source seq */ 146 int olen; /* original length of source seq */ 147 int type; /* kRNA, kDNA, kAmino, or kOther */ 148 char *ss; /* 0..len-1 secondary structure string */ 149 char *sa; /* 0..len-1 % side chain surface access. */ 150 char *co; /* 0..len-1 secondary struct confidence */ 151 }; 152 typedef struct seqinfo_s SQINFO; 153 154 #define SQINFO_NAME (1 << 0) 155 #define SQINFO_ID (1 << 1) 156 #define SQINFO_ACC (1 << 2) 157 #define SQINFO_DESC (1 << 3) 158 #define SQINFO_START (1 << 4) 159 #define SQINFO_STOP (1 << 5) 160 #define SQINFO_LEN (1 << 6) 161 #define SQINFO_TYPE (1 << 7) 162 #define SQINFO_OLEN (1 << 8) 163 #define SQINFO_SS (1 << 9) 164 #define SQINFO_SA (1 << 10) 165 #define SQINFO_CO (1 << 11) 166 167 168 /**************************************************** 169 * Sequence alphabet: see also iupac.c 170 ****************************************************/ 171 /* IUPAC symbols defined globally in iupac.c */ 172 struct iupactype { 173 char sym; /* character representation */ 174 char symcomp; /* complement (regular char */ 175 char code; /* my binary rep */ 176 char comp; /* binary encoded complement */ 177 }; 178 extern struct iupactype iupac[]; 179 #define IUPACSYMNUM 17 180 181 extern char *stdcode1[]; /* 1-letter amino acid translation code */ 182 extern char *stdcode3[]; /* 3-letter amino acid translation code */ 183 extern float dnafq[]; /* nucleotide occurrence frequencies */ 184 extern float aafq[]; /* amino acid occurrence frequencies */ 185 extern char aa_alphabet[]; /* amino acid alphabet */ 186 extern int aa_index[]; /* convert 0..19 indices to 0..26 */ 187 188 /* valid symbols in IUPAC code */ 189 #define NUCLEOTIDES "ACGTUNRYMKSWHBVDacgtunrymkswhbvd" 190 #define AMINO_ALPHABET "ACDEFGHIKLMNPQRSTVWY" 191 #define DNA_ALPHABET "ACGT" 192 #define RNA_ALPHABET "ACGU" 193 #define WHITESPACE " \t\n" 194 195 #define isgap(c) ((c) == ' ' || (c) == '.' || (c) == '_' || (c) == '-' || (c) == '~') 196 197 198 /**************************************************** 199 * Sequence i/o: originally from Don Gilbert's readseq 200 ****************************************************/ 201 #include "msa.h" /* for multiple sequence alignment support */ 202 203 /* buffer size for reading in lines from sequence files*/ 204 #define LINEBUFLEN 4096 205 206 /* sequence types parsed by Seqtype() */ 207 /* note that these must match hmmAMINO and hmmNUCLEIC in HMMER */ 208 #define kOtherSeq 0 /* hmmNOTSETYET */ 209 #define kDNA 1 210 #define kRNA 2 /* hmmNUCLEIC */ 211 #define kAmino 3 /* hmmAMINO */ 212 213 /* Unaligned sequence file formats recognized 214 * Coexists with definitions of multiple alignment formats in msa.h: 215 * >100 reserved for alignment formats 216 * <100 reserved for unaligned formats 217 * 0 reserved for unknown 218 * 219 * Some "legacy" formats are supported only when explicitly 220 * requested; not autodetected by SeqfileFormat(). 221 * 222 * DON'T REASSIGN THESE CODES. They're written into 223 * GSI index files. You can use new ones, but reassigning 224 * the sense of old ones will break GSI indices. 225 * Alignment format codes were reassigned with the creation 226 * of msa.c, but before Stockholm format, there were no 227 * indexed alignment databases. 228 */ 229 #define SQFILE_UNKNOWN 0 /* unknown format */ 230 #define SQFILE_IG 1 /* Intelligenetics (!) */ 231 #define SQFILE_GENBANK 2 /* GenBank flatfile */ 232 /* 3 was A2M. Now an alignment format */ 233 #define SQFILE_EMBL 4 /* EMBL or Swissprot flatfile */ 234 #define SQFILE_GCG 5 /* GCG single sequence files */ 235 #define SQFILE_STRIDER 6 /* MacStrider (!!) */ 236 #define SQFILE_FASTA 7 /* FASTA format: default */ 237 #define SQFILE_ZUKER 8 /* Zuker MFOLD format (legacy) */ 238 #define SQFILE_IDRAW 9 /* Idraw-style PostScript (legacy) */ 239 /* 10 was SELEX. Now alignment format */ 240 /* 11 was MSF. Now alignment format */ 241 #define SQFILE_PIR 12 /* PIR format */ 242 #define SQFILE_RAW 13 /* raw sequence */ 243 #define SQFILE_SQUID 14 /* my obsolete squid format */ 244 /* 15 was kXPearson, extended FASTA; withdrawn */ 245 #define SQFILE_GCGDATA 16 /* GCG data library file */ 246 /* 17 was Clustal. Now alignment format*/ 247 #ifdef CLUSTALO 248 #define SQFILE_VIENNA 18 /* Vienna format: concatenated fasta */ 249 #define SQFILE_DUBLIN 19 /* unaligned version of Stockholm */ 250 #endif 251 #define IsUnalignedFormat(fmt) ((fmt) && (fmt) < 100) 252 253 #include "ssi.h" 254 255 struct ReadSeqVars { 256 FILE *f; /* open file pointer */ 257 char *fname; /* name of file; used for diagnostics */ 258 int linenumber; /* what line are we on in the file */ 259 260 char *buf; /* dynamically allocated sre_fgets() buffer */ 261 int buflen; /* allocation length for buf */ 262 263 int ssimode; /* SSI_OFFSET_I32 or SSI_OFFSET_I64 */ 264 SSIOFFSET ssioffset; /* disk offset to last line read into buf */ 265 SSIOFFSET r_off; /* offset to start of record */ 266 SSIOFFSET d_off; /* offset to start of sequence data */ 267 268 int rpl; /* residues per data line for this file; -1 if unset, 0 if invalid */ 269 int lastrpl; /* rpl on last line seen */ 270 int maxrpl; /* max rpl on any line of the file */ 271 int bpl; /* bytes per data line; -1 if unset, 0 if invalid */ 272 int lastbpl; /* bpl on last line seen */ 273 int maxbpl; /* max bpl on any line of the file */ 274 275 char *seq; /* growing sequence during parse */ 276 SQINFO *sqinfo; /* name, id, etc, gathered during parse */ 277 char *sp; 278 int seqlen; /* current sequence length */ 279 int maxseq; /* current allocation length for seq */ 280 281 int format; /* format of seqfile we're reading. */ 282 int do_gzip; /* TRUE if f is a pipe from gzip -dc */ 283 int do_stdin; /* TRUE if f is stdin */ 284 285 /* An (important) hack for sequential access of multiple alignment files: 286 * we read the whole alignment in, 287 * and then copy it one sequence at a time into seq and sqinfo. 288 * It is active if msa is non NULL. 289 * msa->lastidx is reused/overloaded: used to keep track of what 290 * seq we'll return next. 291 * afp->format is the real format, while SQFILE->format is kMSA. 292 * Because we keep it in the SQFILE structure, 293 * ReadSeq() and friends are always reentrant for multiple seqfiles. 294 */ 295 MSA *msa; 296 MSAFILE *afp; 297 }; 298 typedef struct ReadSeqVars SQFILE; 299 300 301 /**************************************************** 302 * Cluster analysis and phylogenetic tree support 303 ****************************************************/ 304 305 /* struct phylo_s - a phylogenetic tree 306 * 307 * For N sequences, there will generally be an array of 0..N-2 308 * phylo_s structures representing the nodes of a tree. 309 * [0] is the root. The indexes of left and 310 * right children are somewhat confusing so be careful. The 311 * indexes can have values of 0..2N-2. If they are 0..N-1, they 312 * represent pointers to individual sequences. If they are 313 * >= N, they represent pointers to a phylo_s structure 314 * at (index - N). 315 */ 316 struct phylo_s { 317 int parent; /* index of parent, N..2N-2, or -1 for root */ 318 int left; /* index of one of the branches, 0..2N-2 */ 319 int right; /* index of other branch, 0..2N-2 */ 320 float diff; /* difference score between seqs */ 321 float lblen; /* left branch length */ 322 float rblen; /* right branch length */ 323 char *is_in; /* 0..N-1 flag array, 1 if seq included */ 324 int incnum; /* number of seqs included at this node */ 325 }; 326 327 328 /* Strategies for cluster analysis; cluster by mean distance, 329 * minimum distance, or maximum distance. 330 */ 331 enum clust_strategy { CLUSTER_MEAN, CLUSTER_MAX, CLUSTER_MIN }; 332 333 /**************************************************** 334 * Generic data structure support 335 ****************************************************/ 336 337 /* a struct intstack_s implements a pushdown stack for storing 338 * single integers. 339 */ 340 struct intstack_s { 341 int data; 342 struct intstack_s *nxt; 343 }; 344 345 /**************************************************** 346 * Binary nucleotide alphabet support 347 ****************************************************/ 348 349 /* Binary encoding of the IUPAC code for nucleotides 350 * 351 * four-bit "word", permitting rapid degenerate matching 352 * A C G T/U 353 * 0 0 1 0 354 */ 355 #define NTA 8 356 #define NTC 4 357 #define NTG 2 358 #define NTT 1 359 #define NTU 1 360 #define NTN 15 /* A|C|G|T */ 361 #define NTR 10 /* A|G */ 362 #define NTY 5 /* C|T */ 363 #define NTM 12 /* A|C */ 364 #define NTK 3 /* G|T */ 365 #define NTS 6 /* C|G */ 366 #define NTW 9 /* A|T */ 367 #define NTH 13 /* A|C|T */ 368 #define NTB 7 /* C|G|T */ 369 #define NTV 14 /* A|C|G */ 370 #define NTD 11 /* A|G|T */ 371 #define NTGAP 16 /* GAP */ 372 #define NTEND 0 /* null string terminator */ 373 374 /* ntmatch(): bitwise comparison of two nuc's 375 * note that it's sensitive to the order; 376 * probe may be degenerate but target should not be 377 */ 378 #define ntmatch(probe, target) ((probe & target) == target) 379 380 /**************************************************** 381 * Support for a portable, flexible Getopt() 382 ****************************************************/ 383 384 /* Structure: opt_s 385 * 386 * Structure for declaring options to a main(). 387 */ 388 struct opt_s { 389 char *name; /* name of option, e.g. "--option1" or "-o" */ 390 int single; /* TRUE if a single letter option */ 391 int argtype; /* for typechecking, e.g. sqdARG_INT */ 392 }; 393 /* acceptable argtype's... */ 394 #define sqdARG_NONE 0 /* no argument */ 395 #define sqdARG_INT 1 /* something that atoi() can grok */ 396 #define sqdARG_FLOAT 2 /* something that atof() can grok */ 397 #define sqdARG_CHAR 3 /* require single character or digit */ 398 #define sqdARG_STRING 4 /* anything goes */ 399 400 /**************************************************** 401 * Support for convenient Perl-y regexp matching 402 * See hsregexp.c for copyright notice: this code is derived 403 * from Henry Spencer's freely distributed regexp library. 404 ****************************************************/ 405 406 #define NSUBEXP 10 407 typedef struct sqd_regexp { 408 char *startp[NSUBEXP]; 409 char *endp[NSUBEXP]; 410 char regstart; /* Internal use only. */ 411 char reganch; /* Internal use only. */ 412 char *regmust; /* Internal use only. */ 413 int regmlen; /* Internal use only. */ 414 char program[1]; /* Unwarranted chumminess with compiler. */ 415 } sqd_regexp; 416 417 /* Strparse() defines and manages these. 418 * sqd_parse[0] contains the substring that matched the pattern. 419 * sqd_parse[1-9] contain substrings matched with ()'s. 420 */ 421 extern char *sqd_parse[10]; 422 423 /**************************************************** 424 * Portable detection of multiprocessor # of CPUs. 425 * #include <unistd.h> 426 * long foo = SQD_NPROC; 427 * returns the number of available processors. 428 * if foo == -1, we failed. 429 ****************************************************/ 430 431 /* Our problem here is that POSIX apparently doesn't specify 432 * a standard for how to get sysconf() to report the number of 433 * processors on-line. _SC_NPROCESSORS_ONLN is specified 434 * by SVR4.0MP. Thanks to W. Gish for help here. 435 */ 436 #undef SQD_NPROC 437 #ifdef _SC_NPROCESSORS_ONLN /* Sun Solaris, Digital UNIX */ 438 #define SQD_NPROC sysconf(_SC_NPROCESSORS_ONLN) 439 #else 440 #ifdef _SC_NPROC_ONLN /* Silicon Graphics IRIX */ 441 #define SQD_NPROC sysconf(_SC_NPROC_ONLN) 442 #else /* FreeBSD, Linux don't support getting ncpu via sysconf() */ 443 #define SQD_NPROC -1 444 #endif 445 #endif 446 447 /**************************************************** 448 * Three levels of debugging printf's and assert's 449 * level 1: little impact on verbosity or performance 450 * level 2: moderate impact 451 * level 3: high impact 452 * Example: 453 * SQD_DPRINTF3(("Matrix row %d col %d = %f\n", i, j, val)); 454 * Note the double parentheses; these are important. 455 ****************************************************/ 456 457 #ifndef DEBUGLEVEL 458 #define DEBUGLEVEL 0 459 #endif 460 461 #if (DEBUGLEVEL >= 1) 462 #define SQD_DPRINTF1(x) printf x 463 #define SQD_DASSERT1(x) assert x 464 #else 465 #define SQD_DPRINTF1(x) 466 #define SQD_DASSERT1(x) 467 #endif 468 #if (DEBUGLEVEL >= 2) 469 #define SQD_DPRINTF2(x) printf x 470 #define SQD_DASSERT2(x) assert x 471 #else 472 #define SQD_DPRINTF2(x) 473 #define SQD_DASSERT2(x) 474 #endif 475 #if (DEBUGLEVEL >= 3) 476 #define SQD_DPRINTF3(x) printf x 477 #define SQD_DASSERT3(x) assert x 478 #else 479 #define SQD_DPRINTF3(x) 480 #define SQD_DASSERT3(x) 481 #endif 482 483 /* PANIC is called for failures of Std C/POSIX functions, 484 * instead of my own functions. Panic() calls perror() and exits 485 * abnormally. 486 */ 487 #define PANIC Panic(__FILE__, __LINE__) 488 489 /* Malloc/realloc calls are wrapped 490 */ 491 #define MallocOrDie(x) sre_malloc(__FILE__, __LINE__, (x)) 492 #define ReallocOrDie(x,y) sre_realloc(__FILE__, __LINE__, (x), (y)) 493 494 /**************************************************** 495 * Miscellaneous macros and defines 496 ****************************************************/ 497 498 #define SQDCONST_E 2.71828182845904523536028747135 499 #define SQDCONST_PI 3.14159265358979323846264338328 500 501 /* must declare swapfoo to use SWAP() */ 502 #define SWAP(a,b) {swapfoo = b; b = a; a = swapfoo;} 503 #define ScalarsEqual(a,b) (fabs((a)-(b)) < 1e-7) 504 505 #ifndef MIN 506 #define MIN(a,b) (((a)<(b))?(a):(b)) 507 #endif 508 #ifndef MAX 509 #define MAX(a,b) (((a)>(b))?(a):(b)) 510 #endif 511 512 /* For convenience and (one hopes) clarity in boolean tests: 513 */ 514 #ifndef TRUE 515 #define TRUE 1 516 #endif 517 #ifndef FALSE 518 #define FALSE 0 519 #endif 520 521 /* Somewhere, there is a universe in which Unix vendors comply 522 * with the ANSI C standard. Unfortunately, it is not ours: 523 */ 524 #ifndef EXIT_SUCCESS 525 #define EXIT_SUCCESS 0 526 #endif 527 #ifndef EXIT_FAILURE 528 #define EXIT_FAILURE 1 529 #endif 530 531 #include "sqfuncs.h" /* squid function declarations */ 532 #include "sre_random.h" /* random number generator and samplers */ 533 #include "vectorops.h" /* vector operations */ 534 #endif /* SQUIDH_INCLUDED */ 535