1 /******************************************************************* 2 * Maximum likelihood estimate of recombination rates using * 3 * coalescent trees. * 4 * * 5 * Mary Kuhner * 6 * Genetics 357360 * 7 * University of Washington * 8 * Seattle, WA 98195-7360, USA * 9 * mkkuhner@genetics.washington.edu * 10 * * 11 * With the help of Joe Felsenstein (joe@genetics.washington.edu), * 12 * Jon Yamato, and Sean Lamont. * 13 * Special THANKS to Peter Beerli (beerli@genetics.washington.edu) * 14 * * 15 * Original creation of this file by SeanLamont and JoeFelsenstein * 16 * * 17 * Substantial rework by PeterBeerli * 18 *******************************************************************/ 19 20 #define MEMDEBUG 21 22 #ifndef RECOMBINE_INCLUDE 23 #define RECOMBINE_INCLUDE 24 #endif 25 26 #define VERSION "0.40" 27 28 #ifdef GNUDOS 29 #define DJGPP 30 #define DOS 31 #endif /* GNUDOS */ 32 33 #ifdef THINK_C 34 #define MAC 35 #endif /* THINK_C */ 36 37 #ifdef __MWERKS__ 38 #undef HAVE_LGAMMA 39 #ifdef macintosh 40 #define MAC 41 #include <ansi_prefix.mac.h> /*fixes time problems*/ 42 /* #include "unix.mach.h" */ 43 #else /* macintosh */ 44 #define DOS 45 #endif /* windows */ 46 #endif /* MWERKS */ 47 48 /* for Metrowerks compiler 49 GENERATINGPOWERPC - Compiler is generating PowerPC 50 instructions 51 GENERATING68K - Compiler is generating 68k 52 family instructions 53 GENERATING68881 - Compiler is generating mc68881 54 floating point instructions 55 56 SystemSevenFiveOrLater - Compiled code will only be run on a 57 System 7.5 or later Macintosh 58 SystemSevenOrLater - Compiled code will only be run 59 on a System 7.0 or later Macintosh 60 SystemSixOrLater - Compiled code will only be run 61 on a System 6.0 or later Macintosh */ 62 #ifdef GENERATINGPOWERPC 63 #define MAC 64 #endif 65 66 #ifdef GENERATING68K 67 #define MAC 68 #endif 69 70 #ifdef GENERATING68881 71 #define MAC 72 #endif 73 74 #ifdef SystemSevenFiveOrLater 75 #define MAC 76 #endif 77 78 #ifdef SystemSevenOrLater 79 #define MAC 80 #endif 81 82 #ifdef SystemSixOrLater 83 #define MAC 84 #endif 85 86 #ifdef __CMS_OPEN 87 #define CMS 88 #define EBCDIC true 89 #define INFILE "infile data" 90 #define OUTFILE "outfile data" 91 #define TREEFILE "treefile data" 92 #define FONTFILE "fontfile data" 93 #define PLOTFILE "plotfile data" 94 #define INTREE "intree data" 95 #define OUTTREE "outtree data" 96 #else 97 #define EBCDIC false 98 #define INFILE "infile" 99 #define OUTFILE "outfile" 100 #define TREEFILE "treefile" 101 #define FONTFILE "fontfile" /* on unix this might be /usr/local/lib/fontfile */ 102 #define PLOTFILE "plotfile" 103 #define INTREE "intree" 104 #define OUTTREE "outtree" 105 #endif /* CMS_OPEN */ 106 107 #ifdef L_ctermid /* try and detect for sysV or V7. */ 108 #define SYSTEM_FIVE 109 #endif /* L_ctermid */ 110 111 #ifdef sequent 112 #define SYSTEM_FIVE 113 #endif /* sequent */ 114 115 #ifndef MAC 116 #ifndef SYSTEM_FIVE 117 # include<stdlib.h> 118 # if defined(_STDLIB_H_) || defined(_H_STDLIB) || defined(H_SCCSID) || defined(unix) 119 # define UNIX 120 # define MACHINE_TYPE "BSD Unix C" 121 # endif /* defined.....*/ 122 #endif /* SYSTEM_FIVE */ 123 #endif /* MAC */ 124 125 #ifdef __STDIO_LOADED 126 #define VMS 127 #define MACHINE_TYPE "VAX/VMS C" 128 #define printf vax_printf_is_broken 129 #define fprintf vax_fprintf_is_broken 130 void vax_printf_is_broken(const char *fmt,...); 131 void vax_fprintf_is_broken(FILE *fp,const char *fmt,...); 132 void vax_tweak_fmt(char *); 133 #endif /* __STDIO_LOADED */ 134 135 #ifdef __WATCOMC__ 136 #define QUICKC 137 #define WATCOM 138 #define DOS 139 #endif /* __WATCOMC__ */ 140 /* watcom-c has graphics library calls that are almost identical to * 141 * quick-c, so the "QUICKC" symbol name stays. */ 142 143 #ifdef _QC 144 #define MACHINE_TYPE "MS-DOS / Quick C" 145 #define QUICKC 146 #include "graph.h" 147 #define DOS 148 #endif /* _QC */ 149 150 #ifdef _DOS_MODE 151 #define MACHINE_TYPE "MS-DOS /Microsoft C " 152 #define DOS /* DOS is always defined if on a dos machine */ 153 #define MSC /* MSC is defined for microsoft C */ 154 #endif /* _DOS_MODE */ 155 156 #ifdef __MSDOS__ /* TURBO c compiler, ONLY (no other DOS C compilers) */ 157 #define DOS 158 #define TURBOC 159 #include<stdlib.h> 160 #include<graphics.h> 161 #endif /* __MSDOS__ */ 162 163 #ifdef DJGPP /* DJ's gnu C/C++ port */ 164 #include<graphics.h> 165 #endif 166 167 #ifndef MACHINE_TYPE 168 #define MACHINE_TYPE "ANSI C" 169 #endif 170 171 #ifdef DOS 172 #define MALLOCRETURN void 173 #else 174 #define MALLOCRETURN void 175 #endif /* DOS */ 176 177 #ifdef VMS 178 #define signed /* signed doesn't exist in VMS */ 179 #endif /* VMS */ 180 181 /* default screen types */ 182 #ifdef DOS 183 #define IBMCRT true 184 #define ANSICRT false 185 #else 186 #ifdef MAC 187 #define IBMCRT false 188 #define ANSICRT false 189 #else 190 #define IBMCRT false 191 #define ANSICRT true 192 #endif /* MAC */ 193 #endif /* DOS */ 194 195 #ifdef DJGPP 196 #undef MALLOCRETURN 197 #define MALLOCRETURN void 198 #endif /* DJGPP */ 199 200 201 /* includes: */ 202 #ifdef UNIX 203 #include<strings.h> 204 #include<string.h> 205 #else 206 #include<string.h> 207 #endif /* UNIX */ 208 209 #include <stddef.h> 210 #include <stdio.h> 211 #include <stdlib.h> 212 #include <math.h> 213 #include <ctype.h> 214 215 #ifdef MAC 216 #include "mac_interface.h" 217 #endif /* MAC */ 218 219 #include "constants.h" 220 221 #ifdef __MWERKS__ 222 #undef DBL_EPSILON 223 #undef DBL_MAX 224 #include <float.h> 225 #endif /* __MWERKS */ 226 227 #define FClose(file) if (file) fclose(file) ; file=NULL 228 229 typedef unsigned char boolean; 230 231 /* warning WARNING debug DEBUG */ 232 #define true 1 233 #define false 0 234 235 #ifndef TRUE 236 #define TRUE 1 237 #endif 238 239 #ifndef FALSE 240 #define FALSE 0 241 #endif 242 243 #define SETBITS 32 244 245 #ifndef DBL_EPSILON 246 #include <float.h> 247 #ifndef DBL_MAX 248 #define DBL_MAX ((double)1.7976931348623157e308) 249 #define DBL_EPSILON 2.2204460492503131e-16 250 #endif 251 #endif 252 253 typedef struct treerec { 254 long *kk; 255 double *kend; 256 long *eventtype; 257 double *actives; 258 long numcoals; 259 double numrecombs; /* double to allow fatal attraction avoidance */ 260 long *sitescore; 261 boolean hastings_adjust; 262 double tk, ts; 263 double llike; 264 } treerec; 265 266 #ifdef MAC 267 MALLOCRETURN *mymalloc(long); 268 #else 269 MALLOCRETURN *mymalloc(); 270 #endif 271 272 typedef unsigned int base; 273 typedef double sitelike[4]; 274 typedef sitelike *ratelike; 275 typedef ratelike *phenotype; 276 typedef double *contribarr; 277 typedef char naym[NMLNGTH]; 278 typedef long longer[3]; 279 typedef char **sequence; 280 281 282 /* holds the cumulative data likelihoods in the tree structure */ 283 typedef struct datalike_fmt { 284 double **a; /* by loci by MICRO_ALLELEMAX */ 285 double ****s; /* by site by category by slice by base */ 286 } datalike_fmt; 287 288 /* used in the tree structure */ 289 typedef struct _node { 290 struct _node *next, *back; 291 char type; 292 long id; 293 long number; 294 datalike_fmt *x; 295 char *nayme; 296 boolean top; 297 double v, tyme, length; 298 boolean updated; 299 long futileflag; 300 301 /* sequence specific stuff */ 302 long *ranges; 303 boolean *members; 304 long memberstrue; 305 306 /* recombination specific stuff */ 307 long recstart, recend; 308 long *coal; 309 310 /* migration specific stuff */ 311 long pop, actualpop; 312 double lxmax; 313 314 /* disease trait likelihood specific stuff */ 315 double *z; 316 317 } node; 318 319 320 /* this is a list describing the tree as a series of horizontal 321 slices of the tree. Each slice starts at a node, and extends 322 down until the next cut is encountered. */ 323 typedef struct tlist { 324 struct tlist *prev, *succ; 325 node *eventnode; /* points to a random tip for the tips */ 326 double age; /* tyme from tree top */ 327 long numbranch; 328 node **branchlist; 329 } tlist; 330 331 typedef struct creature { 332 long numhaplotypes; 333 node **haplotypes; 334 long numflipsites; 335 long *flipsites; 336 } creature; 337 338 typedef struct tree { 339 node **nodep; 340 creature *creatures; 341 double likelihood; 342 double coalprob; 343 node *root; 344 tlist *tymelist; 345 long numcoals; 346 long numrecombs; 347 } tree; 348 349 typedef struct reclist { 350 struct reclist *prev, *succ; 351 node *recnode; 352 long startsite, endsite; 353 boolean iscross; 354 } reclist; 355 356 typedef struct rlrec { 357 double *val; 358 } rlrec; 359 360 typedef struct valrec { 361 double rat_xi, rat_xv, zz, z1, y1, ww1, zz1, ww2, zz2, z1zz, z1yy, 362 xiz1, xiy1_xv, ww1zz1, vv1zz1, ww2zz2, vv2zz2; 363 } valrec; 364 365 typedef struct dnadata { 366 char *title; /* datafile title, if any */ 367 char **popnames; /* population names, if any */ 368 char ****indnames; /* individual sequence names */ 369 long numpop; /* number of populations */ 370 long numloci; /* number of loci */ 371 long *numseq; /* number of individuals per population */ 372 long *sites; /* number of base pairs per locus */ 373 long **sitecount; /* number of sites per locus per site */ 374 long **markersite; /* map of markers into sites: per locus per marker */ 375 376 int *segranges0; /* 4 special auxiliary arrays to speed up some */ 377 int *segranges1; /* brlist, hidden-passages, code; */ 378 int *segranges2; /* all referenced by site */ 379 int *segranges3; 380 381 char ****seqs; /* DNA sequences */ 382 double ***sspace; /* spacing between DNA sites per pop per locus 383 per site */ 384 char sdlm; /* spacing file delimiter for site/gap pairs */ 385 double freqa, freqc, freqg, freqt, freqr, freqy, freqar, freqcy, 386 freqgr, freqty; /* base frequencies */ 387 double xi; /* transition pool prob */ 388 double xv; /* transversion pool prob */ 389 double fracchange; /* probability of change */ 390 double ttratio; /* transition/transversion ratio */ 391 long *dnaweight; /* arbitrary dna sitewise weighting factor */ 392 } dnadata; 393 394 typedef struct msatdata { 395 char *title; /* datafile title, if any */ 396 char **popnames; /* population names, if any */ 397 char ****indnames; /* individual tip names */ 398 char dlm; /* input file delimiter for diploids */ 399 long numpop; /* number of populations */ 400 long numloci; /* number of loci */ 401 long *numind; /* number of individuals per population */ 402 long ****msats; /* micro satellite counts */ 403 double ***mspace; /* spacing after a particular msat locus */ 404 double **steps; /* table of transition probabilities from 405 state "x" to state "y". */ 406 } msatdata; 407 408 typedef struct data_fmt { 409 dnadata *dnaptr; 410 msatdata *msptr; 411 long *siteptr; /* site aliasing array */ 412 } data_fmt; 413 414 typedef struct option_struct { 415 boolean ctgry, watt, printdata, usertree, progress, 416 treeprint, interleaved, ibmpc, ansi, autocorr, 417 freqsfrom, plump, weights, spacing, newdata, same_ne, 418 interactive, mhmcsave, datatypeset, panel, map, fc, 419 haplotyping, norecsnp, profile, print_recbythmaxcurve; 420 char datatype; 421 long steps[NUM_TYPE_CHAINS], numchains[NUM_TYPE_CHAINS], 422 increm[NUM_TYPE_CHAINS], numout[NUM_TYPE_CHAINS], holding, categs; 423 long *numpanel; /* number of individuals in SNP panel per 424 population */ 425 double *ne_ratio, *rate, *probcat, lambda; 426 double mutrait, traitratio, pd; /* for mapper */ 427 long hapdrop; /* for haplotyping: strat 0 = fliphap, 1 = single flipdrop, 428 2 = double flipdrop */ 429 double happrob; /* probability of doing haplotype rearrangement */ 430 431 /* The next set of variables are for the "full" SNP option */ 432 boolean full; 433 double chance_seen; 434 435 /* these variables handle different temperature chains */ 436 long *temperature, ctemp, numtempchains; 437 438 /* these variables are for use in communicating the upper and lower 439 bounds in theta and rec-rate from the confidence interval calculator 440 to the likelihood curve printer */ 441 double thlb, thub, reclb, recub; 442 443 } option_struct; 444 445 #define FILEP(A,B) (MENU ? (A) : (B)) 446 #define ERRFILE FILEP(stderr,simlog) 447 #define REF_CHAIN(A) (((A) < op->numchains[0]) ? 0 : (A)+1-op->numchains[0]) 448 #define TYPE_CHAIN(A) (((A) < op->numchains[0]) ? 0 : 1) 449 #define MAX(A,B) (((A) > (B)) ? (A) : (B)) 450 #define MIN(A,B) (((A) < (B)) ? (A) : (B)) 451 #define EXP(A) (((A) > EXPMAX) ? printf("\n\nTOO BIG\n") : exp(A)) 452 #define NUM_CHROM(A) (((A) == 's') ? 1 : 2) 453 #define BOOLPRINT(A) ((A) ? "true" : "false") 454 455 #ifndef GETDATA_INCLUDE 456 #include "getdata.h" 457 #endif 458 459 void setupoption_struct(option_struct **op); 460 char lowercase(char c); 461 void openfile(FILE **fp, char *filename, char *mode, char *application, 462 char *perm); 463 boolean isrecomb(node *p); 464 boolean iscoal(node *p); 465 boolean branchsub(tlist *t, node *oldbranch, node *newbranch); 466 void insertaftertymelist(tlist *t, node *p); 467 void subtymelist(tlist *t, node *branchtop, boolean both); 468 void printtymelist(tlist *t); 469 node *findtop(node *p); 470 node *findunique(node *p); 471 node *otherdtr(node *p); 472 node *otherparent(node *p); 473 long findlink(node *p); 474 void free_z(option_struct *op, node *p); 475 void allocate_z(option_struct *op, data_fmt *data, node *p); 476 void free_x(option_struct *op, node *p); 477 void allocate_x(option_struct *op, data_fmt *data, node *p); 478 void VarMalloc(dnadata *dna, node *p, boolean allokate); 479 void init_coal_alloc(long **coal, long numcoalpairs); 480 void coal_Malloc(node *p, boolean allokate, long numcoalpairs); 481 void init_ranges_alloc(long **ranges, long numrangepairs); 482 void ranges_Malloc(node *p, boolean allokate, long numrangepairs); 483 void meld_adjacent_ranges(long *cranges, long newelem); 484 void addrange(long **newranges, long newstart, long newend); 485 boolean inrange(long *ranges, long site); 486 void subrangefc(long **newranges, long substart, long subend); 487 void printrange(long *ranges); 488 void newnode(node **p); 489 void freenode(node *p); 490 void newtymenode(tlist **t); 491 void freetymenode(tlist *t); 492 void freetymelist(tlist *t); 493 void hookup(node *p, node *q); 494 void atr(node *p); 495 void probatr(node *p); 496 tlist *gettymenode(tree *tr, long target); 497 double vtol(option_struct *op, data_fmt *data, double v); 498 void ltov(option_struct *op, data_fmt *data, node *p); 499 double findcoal_ltov(option_struct *op, data_fmt *data, double value); 500 void joinnode(option_struct *op, data_fmt *data, double length, 501 node *p, node *q); 502 void readparmfile(option_struct *op); 503 boolean whichopbool(option_struct *op, long i); 504 void rec_parmfilewrite(option_struct *op, long numloci, long numpop); 505 void readseedfile(void); 506 void print_menuheader(option_struct *op); 507 void print_startmenu(option_struct *op, boolean writeout); 508 void print_datamenu(option_struct *op); 509 void print_searchmenu(option_struct *op); 510 void print_menuend(void); 511 void initoptions(option_struct *op); 512 void workmenu1(option_struct *op, char ch, boolean *menu1, 513 boolean *writeout); 514 void workmenu2(option_struct *op, char ch, boolean *menu1, 515 long *numloci, long *numpop); 516 void checkparmfile(option_struct *op); 517 void getoptions(option_struct *op); 518 void firstinit(option_struct *op, data_fmt *data); 519 void popinit(option_struct *op, data_fmt *data); 520 void locusinit(option_struct *op, data_fmt *data); 521 void inputcategories(option_struct *op, data_fmt *data); 522 void end_of_population_free(option_struct *op, data_fmt *data); 523 void end_of_locus_free(option_struct *op, data_fmt *data); 524 void freetree(option_struct *op, data_fmt *data, tree *target); 525 void makesiteptr(option_struct *op, data_fmt *data); 526 527 node *allocate_nodelet(long num, char type); 528 void allocate_root(option_struct *op, data_fmt *data, tree *tr); 529 void allocate_tip(option_struct *op, data_fmt *data, tree *tr, 530 long num); 531 void allocate_interior(option_struct *op, data_fmt *data, tree *tr, long num); 532 void init_creature(creature *cr, long numhaplotypes); 533 void treesetup(option_struct *op, data_fmt *data); 534 535 void read_line(FILE *source, char *line); 536 void read_header(option_struct *op, long *numpop, long *numloci, 537 long **numsites, char *title, char *dlm); 538 void read_popheader(long *numind, char *poptitle); 539 540 void getinput(option_struct *op, data_fmt *data); 541 void orient(tree *tr, option_struct *op, data_fmt *data, node *p); 542 void plumptree(option_struct *op, data_fmt *data, double th0); 543 void finishsetup(option_struct *op, data_fmt *data, node *p); 544 void initbranchlist(option_struct *op, data_fmt *data); 545 void inittable(option_struct *op, dnadata *dna); 546 void initweightrat(option_struct *op, data_fmt *data); 547 void treeout(node *p, long s, FILE **usefile); 548 boolean nuview_usebranch(data_fmt *data, node *p, long site); 549 double prob_micro(msatdata *ms, double t, long diff); 550 void calcrange(option_struct *op, data_fmt *data, node *p, node *q, 551 node *r, long whichtree, long start, long finish, long numcategs); 552 void nuview(option_struct *op, data_fmt *data, node *p, long indexsite, 553 long startsite, long endsite); 554 void nuview_micro(option_struct *op, data_fmt *data, node *p, long indexsite, 555 long startsite, long endsite); 556 node *findcoal(option_struct *op, data_fmt *data, node *p, double *v); 557 void localsmooth(option_struct *op, data_fmt *data, node *p, 558 long indexsite, long startsite, long endsite); 559 void snpsmooth(option_struct *op, data_fmt *data, tree *tr, 560 long indexsite, long startsite, long endsite); 561 void localeval(option_struct *op, data_fmt *data, node *p, boolean first); 562 double micro_evaluate(option_struct *op, msatdata *ms, tree *tr, 563 long start, long end); 564 void seekch(char c); 565 void getch(char *c); 566 void processlength(node *p); 567 void addelement(option_struct *op, data_fmt *data, node *p, long *nextnode); 568 void treeread(option_struct *op, data_fmt *data); 569 void finddnasubtrees(option_struct *op, data_fmt *data, tlist *tstart, 570 long *sranges); 571 void findsubtrees_node(option_struct *op, data_fmt *data, 572 tlist *tlast, tree *tr, long **coal); 573 void drop_findsubtrees(option_struct *op, data_fmt *data, tree *tr, 574 node *p); 575 void findsubtrees(option_struct *op, data_fmt *data, tlist *tstart, 576 long *sranges); 577 void findsubtreemarkers(option_struct *op, data_fmt *data, long tstart, 578 long tend, long *mstart, long *mend); 579 long markertosite(option_struct *op, data_fmt *data, long marker); 580 long sitetorightmarker(option_struct *op, data_fmt *data, long site); 581 long sitetomarker(option_struct *op, data_fmt *data, long site); 582 boolean sameranges(long *range1, long *range2); 583 void copyranges(node *source, node *target); 584 void copycoal(node *source, node *target); 585 void copylikes(option_struct *op, data_fmt *data, node *source, 586 node *target); 587 void copynode(option_struct *op, data_fmt *data, node *source, 588 node *target); 589 void addnode(option_struct *op, data_fmt *data, node *p, node *q); 590 node *getrecnodelet(node *copy, node *orig); 591 void make_tree_copy(option_struct *op, data_fmt *data, tree *source, 592 tree *target); 593 void copycreature(option_struct *op, data_fmt *data, creature *source, 594 creature *target, tree *tr); 595 void copycreatures(option_struct *op, data_fmt *data, tree *source, 596 tree *target); 597 tree *copytree(option_struct *op, data_fmt *data, tree *source); 598 void constructtree(option_struct *op, data_fmt *data, double branchlength); 599 boolean rearrange(option_struct *op, data_fmt *data, long whichchain); 600 void temptreeswap(option_struct *op, data_fmt *data, boolean *changed, 601 long chain); 602 void maketree(option_struct *op, data_fmt *data); 603 void freenodelet(option_struct *op, data_fmt *data, node *p); 604 void finalfree(option_struct *op, data_fmt *data); 605 double randum(void); 606 double lengthof(node *p); 607 void fixlength(option_struct *op, data_fmt *data, node *p); 608 double watterson(option_struct *op, data_fmt *data); 609 void invardatacheck(option_struct *op, data_fmt *data); 610 void buildtymelist(tree *tr, option_struct *op, data_fmt *data, node *p); 611 double eval_calcrange(option_struct *op, data_fmt *data, tree *tr, 612 boolean first, long start, long finish, long numcategs); 613 double snp_eval_calcrange(option_struct *op, data_fmt *data, tree *tr, 614 boolean first, long start, long finish, long numcategs); 615 double panel_eval_calcrange(option_struct *op, data_fmt *data, tree *tr, 616 boolean first, long start, long finish, long numcategs); 617 double evaluate(option_struct *op, data_fmt *data, tree *tr, 618 double llike); 619 double snp_evaluate(option_struct *op, data_fmt *data, tree *tr, 620 double llike, long start, long finish); 621 boolean hasrec_chain(option_struct *op, long chain); 622 void chainendcheck(option_struct *op, data_fmt *data, tree *tr, 623 long chain, boolean locusend); 624 double min_theta_calc(option_struct *op, data_fmt *data, double th); 625 long boolcheck(char ch); 626 long countbranches(option_struct *op, data_fmt *data, tree *tr); 627 int main(int argc, char *argv[]); 628 boolean booleancheck(option_struct *op, char *var, char *value); 629 boolean numbercheck(option_struct *op, char *var, char *value); 630 boolean sitecompare(dnadata *dna, long site1, long site2); 631 boolean testratio(option_struct *op, data_fmt *data, tree *oldtree, 632 tree *newtree, char ratiotype); 633 634 boolean istip(node *p); 635 void makeinvarvalues(dnadata *dna, long numcategs, tree *tr); 636 void getnodata(option_struct *op, data_fmt *data); 637 void rec_outtree(node *p, boolean first, FILE **usefile); 638 639 int eof(FILE *f); 640 int eoln(FILE *f); 641