1 /*------------------------------------------------------
2 Maximum likelihood estimation
3 of migration rate and effectice population size
4 using a Metropolis-Hastings Monte Carlo algorithm
5 -------------------------------------------------------
6 S E Q U E N C E S R O U T I N E S
7
8
9 Peter Beerli 1996, Seattle
10 beerli@fsu.edu
11
12 Copyright 2002 Peter Beerli and Joseph Felsenstein
13
14 Permission is hereby granted, free of charge, to any person obtaining
15 a copy of this software and associated documentation files (the
16 "Software"), to deal in the Software without restriction, including
17 without limitation the rights to use, copy, modify, merge, publish,
18 distribute, sublicense, and/or sell copies of the Software, and to
19 permit persons to whom the Software is furnished to do so, subject
20 to the following conditions:
21
22 The above copyright notice and this permission notice shall be
23 included in all copies or substantial portions of the Software.
24
25 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
28 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
29 ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
30 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
31 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32
33
34 $Id: sequence.c 2171 2013-09-22 22:05:02Z beerli $
35
36 -------------------------------------------------------*/
37 /* \file sequence.c
38
39 */
40 #include "migration.h"
41 #include "sighandler.h"
42 #include "data.h"
43 #include "tools.h"
44 #include "migrate_mpi.h"
45 #include "watterson.h"
46 #include "tree.h"
47
48 #ifdef DMALLOC_FUNC_CHECK
49 #include <dmalloc.h>
50 #endif
51 /* prototypes ------------------------------------------- */
52 void make_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
53 long locus);
54 void init_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
55 long locus);
56 void init_sequences2 (world_fmt * world, seqmodel_fmt * seq, long locus);
57 void initratio (option_fmt * options);
58 void initfreqs (MYREAL *freqa, MYREAL *freqc, MYREAL *freqg, MYREAL *freqt);
59 void initcatn (long *categs);
60 boolean initcategs (long categs, MYREAL *rate, MYREAL *probcat);
61 void initprobcat (long categs, MYREAL *probsum, MYREAL *probcat);
62 void init_tbl (world_fmt * world, long locus);
63 void print_weights (FILE * outfile, world_fmt * world, option_fmt * options,
64 long locus);
65 void print_tbl (FILE * outfile, world_fmt * world, option_fmt * options,
66 long locus);
67 MYREAL treelike_seq (world_fmt * world, long locus);
68 MYREAL treelike_snp (world_fmt * world, long locus);
69 /*private functions */
70 void getbasefreqs (option_fmt * options, seqmodel_fmt * seq, long locus);
71 void empiricalfreqs (world_fmt * world, option_fmt * options,
72 seqmodel_fmt * seq, long locus);
73 void makeweights (world_fmt * world, data_fmt * data, option_fmt *options, long locus);
74 void makevalues_seq (world_fmt * world, option_fmt * options, data_fmt * data,
75 long locus);
76 void make_invarsites (world_fmt * world, data_fmt * data, long locus);
77 void make_invarsites_unlinked (world_fmt * world, data_fmt * data,
78 long locus);
79 void sitecombine2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites,
80 long locus);
81 void sitesort2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus);
82 void sitescrunch2 (world_fmt * world, long sites, long i, long j, long locus);
83 void inputoptions (world_fmt *earth, world_fmt * world, option_fmt * options, data_fmt * data,
84 long locus);
85 void inputweights (world_fmt * world, data_fmt * data, long chars);
86 void inputcategs (long a, long b, world_fmt * world, option_fmt * options,
87 data_fmt * data, world_fmt *earth);
88 void initlambda (option_fmt * options);
89 void printweights (FILE * outfile, world_fmt * world, option_fmt * options,
90 short inc, long chars, short *weight, char *letters);
91 void print_seqfreqs (FILE * outfile, world_fmt * world, option_fmt * options);
92
93 void snp_invariants (contribarr invariants, world_fmt *world, long locus, phenotype x1, MYREAL *scaler);
94
95 void makevalues_snp (world_fmt * world, option_fmt * options, data_fmt * data,
96 long locus);
97
98 void init_sequences_aliases (world_fmt * earth, world_fmt * world, option_fmt * options,
99 data_fmt * data, long locus);
100
101
102 void check_basefreq (option_fmt * options);
103
104 extern void swap (void *, void *);
105
106
107
108 /* allocation things */
109 void
init_sequences(world_fmt * world,option_fmt * options,data_fmt * data,long locus)110 init_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
111 long locus)
112 {
113
114 long sites = world->data->seq[0]->sites[locus];
115 if (options->datatype == 'u')
116 sites *= 5;
117 if (options->datatype == 'n')
118 sites += 4;
119 if (!world->data->seq[0]->done || world->data->seq[0]->alias == NULL)
120 {
121 world->data->seq[0]->done = TRUE;
122 world->data->seq[0]->alias = (long *) mycalloc (1, sizeof (long) * sites);
123 world->data->seq[0]->ally = (long *) mycalloc (1, sizeof (long) * sites);
124 world->data->seq[0]->savealiasweight = (long *) mycalloc (sites, sizeof (long));
125 world->data->seq[0]->aliasweight = (long *) mycalloc (1, sizeof (long) * sites);
126 world->data->seq[0]->location = (long *) mycalloc (1, sizeof (long) * sites);
127 world->data->seq[0]->category = (long *) mycalloc (1, sizeof (long) * sites);
128 world->data->seq[0]->weight = (short *) mycalloc (1, sizeof (short) * sites);
129 }
130 else
131 {
132 world->data->seq[0]->alias =
133 (long *) myrealloc (world->data->seq[0]->alias, sizeof (long) * sites);
134 world->data->seq[0]->ally =
135 (long *) myrealloc (world->data->seq[0]->ally, sizeof (long) * sites);
136 world->data->seq[0]->savealiasweight =
137 (long *) myrealloc (world->data->seq[0]->savealiasweight,
138 sizeof (long) * sites);
139 world->data->seq[0]->aliasweight =
140 (long *) myrealloc (world->data->seq[0]->aliasweight,
141 sizeof (long) * sites);
142 world->data->seq[0]->location =
143 (long *) myrealloc (world->data->seq[0]->location, sizeof (long) * sites);
144 world->data->seq[0]->category =
145 (long *) myrealloc (world->data->seq[0]->category, sizeof (long) * sites);
146 world->data->seq[0]->weight =
147 (short *) myrealloc (world->data->seq[0]->weight, sizeof (short) * sites);
148 }
149 }
150
151 void
init_sequences_aliases(world_fmt * earth,world_fmt * world,option_fmt * options,data_fmt * data,long locus)152 init_sequences_aliases (world_fmt *earth, world_fmt * world, option_fmt * options,
153 data_fmt * data, long locus)
154 {
155 inputoptions (earth, world, options, data, locus);
156 if (!options->freqsfrom)
157 getbasefreqs (options, world->data->seq[0], locus);
158 makeweights (world, data, options, locus);
159 }
160
161
162 /* menu material ----------------------------------------- */
163 void
initratio(option_fmt * options)164 initratio (option_fmt * options)
165 {
166 long z = 0;
167 char *tmp;
168 char input[LINESIZE];
169 printf
170 ("Transition/transversion ratio?\nEnter a value for each locus, spaced by blanks or commas\n[Minium value > 0.5]");
171 FGETS (input, LINESIZE, stdin);
172 tmp = strtok (input, " ,\n");
173 while (tmp != NULL)
174 {
175 options->ttratio[z++] = atof (tmp);
176 tmp = strtok (NULL, " ,;\n");
177 options->ttratio =
178 (MYREAL *) myrealloc (options->ttratio, sizeof (MYREAL) * (z + 1));
179 options->ttratio[z] = 0.0;
180 }
181 }
182
183 void
initfreqs(MYREAL * freqa,MYREAL * freqc,MYREAL * freqg,MYREAL * freqt)184 initfreqs (MYREAL *freqa, MYREAL *freqc, MYREAL *freqg, MYREAL *freqt)
185 {
186 char input[LINESIZE];
187 int scanned;
188 MYREAL summ = 0;
189
190 printf
191 ("Base frequencies for A, C, G, T/U\n (use blanks to separate, if all are equal use a sign [=])?\n");
192 for (;;)
193 {
194 FGETS (input, LINESIZE, stdin);
195 if (input[0] == '=')
196 {
197 scanned = 4;
198 *freqa = *freqc = *freqg = *freqt = 0.25;
199 }
200 else
201 #ifdef USE_MYREAL_FLOAT
202 scanned = sscanf (input, "%f%f%f%f%*[^\n]", freqa, freqc, freqg, freqt);
203 #else
204 scanned = sscanf (input, "%lf%lf%lf%lf%*[^\n]", freqa, freqc, freqg, freqt);
205 #endif
206 if (scanned == 4)
207 break;
208 else
209 printf ("Please enter exactly 4 values.\n");
210 };
211 // adjust frequencies to a total of 1
212 summ = *freqa + *freqc + *freqg + *freqt;
213 if (summ != 1.0)
214 printf ("Frequency values were adjusted to add up to 1.0.\n");
215 *freqa /= summ;
216 *freqc /= summ;
217 *freqg /= summ;
218 *freqt /= summ;
219 printf ("Nucleotide frequencies: A=%f, C=%f, G=%f, T=%f\n", *freqa, *freqc,
220 *freqg, *freqt);
221 }
222
223
224 void
initcatn(long * categs)225 initcatn (long *categs)
226 { /* initialize category number */
227 int retval;
228 do
229 {
230 printf ("Number of categories (1-%d)?\n", MAXCATEGS);
231 retval = scanf ("%ld%*[^\n]", categs);
232 getchar ();
233 }
234 while (*categs > MAXCATEGS || *categs < 1);
235 }
236
237
238 boolean
initcategs(long categs,MYREAL * rate,MYREAL * probcat)239 initcategs (long categs, MYREAL *rate, MYREAL *probcat)
240 { /* initialize rate categories */
241 long i;
242 char input[LINESIZE];
243 char rest[LINESIZE];
244 int scanned;
245 boolean done;
246
247 for (;;)
248 {
249 printf
250 ("Either enter the Shape parameter alpha for Gamma deviated rates\n*OR* enter the rates for each category (use a space to separate)\n===>");fflush(stdout);
251 FGETS (input, LINESIZE, stdin);
252 done = TRUE;
253 if (count_words (input) == 1)
254 {
255 gamma_rates (rate, probcat, categs, input);
256 return TRUE;
257 }
258 for (i = 0; i < categs; i++)
259 {
260 #ifdef USE_MYREAL_FLOAT
261 scanned = sscanf (input, "%f %[^\n]", &rate[i], rest);
262 #else
263 scanned = sscanf (input, "%lf %[^\n]", &rate[i], rest);
264 #endif
265 if ((scanned < 2 && i < (categs - 1))
266 || (scanned < 1 && i == (categs - 1)))
267 {
268 printf ("Please enter exactly %ld values.\n", categs);
269 done = FALSE;
270 break;
271 }
272 strcpy (input, rest);
273 }
274 if (done)
275 break;
276 }
277 return FALSE;
278 }
279
280 void
initprobcat(long categs,MYREAL * probsum,MYREAL * probcat)281 initprobcat (long categs, MYREAL *probsum, MYREAL *probcat)
282 {
283 long i;
284 boolean done;
285 char input[LINESIZE];
286 char rest[LINESIZE];
287 int scanned;
288
289 do
290 {
291 printf ("Probability for each category?");
292 printf (" (use a space to separate)\n");
293 FGETS (input, LINESIZE, stdin);
294 done = TRUE;
295 for (i = 0; i < categs; i++)
296 {
297 #ifdef USE_MYREAL_FLOAT
298 scanned = sscanf (input, "%f %[^\n]", &probcat[i], rest);
299 #else
300 scanned = sscanf (input, "%lf %[^\n]", &probcat[i], rest);
301 #endif
302 if ((scanned < 2 && i < (categs - 1))
303 || (scanned < 1 && i == (categs - 1)))
304 {
305 done = FALSE;
306 printf ("Please enter exactly %ld values.\n", categs);
307 break;
308 }
309 strcpy (input, rest);
310 }
311 if (!done)
312 continue;
313 *probsum = 0.0;
314 for (i = 0; i < categs; i++)
315 *probsum += probcat[i];
316
317 if (fabs (1.0 - (*probsum)) > 0.001)
318 {
319 for (i = 0; i < categs; i++)
320 probcat[i] /= *probsum;
321 printf ("Probabilities were adjusted to add up to one\n");
322 for (i = 0; i < categs; i++)
323 printf (" %li> %f\n", i + 1, probcat[i]);
324 printf ("\n\n");
325 }
326 }
327 while (!done);
328 }
329
330 ///
331 /// constrains the arbitrary site rate variation to an average of 1.0
constrain_rates(long categs,MYREAL * rate,MYREAL * probcat)332 void constrain_rates(long categs, MYREAL *rate, MYREAL *probcat)
333 {
334 char input[LINESIZE];
335 long i;
336 MYREAL mean;
337 printf("By default the rates will be constrained to an have an average rate of 1.0\nPress return if that is OK (preferred option) or enter the word RAW\n===>"); fflush(stdout);
338 FGETS (input, LINESIZE, stdin);
339 if(input[0] == '\0')
340 {
341 mean = 0.0;
342 for (i = 0; i < categs; i++)
343 {
344 mean += probcat[i] * rate[i];
345 }
346 for (i = 0; i < categs; i++)
347 {
348 rate[i] /= mean;
349 }
350 }
351 }
352
353 /*data read material ===================================== */
354 ///
355 /// read sequence data and linked SNP data
356 void
make_sequences(world_fmt * world,option_fmt * options,data_fmt * data,long locus)357 make_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
358 long locus)
359 {
360 if (world->sumtips<=1)
361 {
362 data->skiploci[locus] = TRUE;
363 world->data->skiploci[locus] = TRUE;
364 world->skipped += 1;
365 }
366
367 makevalues_seq (world, options, data, locus);
368 if (options->freqsfrom)
369 {
370 empiricalfreqs (world, options, world->data->seq[0], locus);
371 getbasefreqs (options, world->data->seq[0], locus);
372 }
373 #ifdef BEAGLE
374 if(world->mutationmodels[world->sublocistarts[locus]].basefreqs==NULL)
375 world->mutationmodels[world->sublocistarts[locus]].basefreqs = (double*) calloc(4,sizeof(double));
376 world->mutationmodels[world->sublocistarts[locus]].basefreqs[0] = world->data->seq[0]->basefrequencies[0];
377 world->mutationmodels[world->sublocistarts[locus]].basefreqs[1] = world->data->seq[0]->basefrequencies[1];
378 world->mutationmodels[world->sublocistarts[locus]].basefreqs[2] = world->data->seq[0]->basefrequencies[2];
379 world->mutationmodels[world->sublocistarts[locus]].basefreqs[3] = world->data->seq[0]->basefrequencies[3];
380 world->mutationmodels[world->sublocistarts[locus]].numstates=4;
381 #endif
382
383 }
384
385 ///
386 /// read unlinked SNP data (each locus must be only 1 site)
387 void
make_snp(world_fmt * world,option_fmt * options,data_fmt * data,long locus)388 make_snp (world_fmt * world, option_fmt * options, data_fmt * data,
389 long locus)
390 {
391 makevalues_snp (world, options, data, locus);
392 /*if (options->freqsfrom)
393 {
394 empiricalfreqs (world, world->data->seq, locus); */
395 options->freqsfrom = FALSE;
396 getbasefreqs (options, world->data->seq[0], locus);
397 /* }
398 */
399 }
400
401 /* private functions================================== */
402 void
getbasefreqs(option_fmt * options,seqmodel_fmt * seq,long locus)403 getbasefreqs (option_fmt * options, seqmodel_fmt * seq, long locus)
404 {
405 long l;
406
407 register MYREAL freqa, freqc, freqg, freqt, freqr, freqy;
408 register MYREAL /*freqar, freqcy,*/ freqgr, freqty;
409
410 MYREAL aa, bb;
411 if (locus == 0)
412 seq->ttratio = options->ttratio[0];
413 else
414 {
415 for (l = 1; l <= locus; l++)
416 {
417 if (options->ttratio[l] == 0.0)
418 {
419 seq->ttratio = options->ttratio[l - 1];
420 break;
421 }
422 seq->ttratio = options->ttratio[l];
423 }
424 if (l > locus)
425 seq->ttratio = options->ttratio[locus];
426 }
427 check_basefreq (options);
428
429 seq->basefrequencies[NUC_A] = options->freqa;
430 seq->basefrequencies[NUC_C] = options->freqc;
431 seq->basefrequencies[NUC_G] = options->freqg;
432 seq->basefrequencies[NUC_T] = options->freqt;
433 freqa = seq->basefrequencies[NUC_A];
434 freqc = seq->basefrequencies[NUC_C];
435 freqg = seq->basefrequencies[NUC_G];
436 freqt = seq->basefrequencies[NUC_T];
437 seq->basefrequencies[NUC_R] = freqa + freqg;
438 seq->basefrequencies[NUC_Y] = freqc + freqt;
439 freqr = seq->basefrequencies[NUC_R];
440 freqy = seq->basefrequencies[NUC_Y];
441 seq->basefrequencies[NUC_AR]= freqa / freqr;
442 seq->basefrequencies[NUC_CY]= freqc / freqy;
443 seq->basefrequencies[NUC_GR]= freqg / freqr;
444 seq->basefrequencies[NUC_TY]= freqt / freqy;
445 //freqar = seq->basefrequencies[NUC_AR];
446 //freqcy = seq->basefrequencies[NUC_CY];
447 freqgr = seq->basefrequencies[NUC_GR];
448 freqty = seq->basefrequencies[NUC_TY];
449
450 aa =
451 seq->ttratio * (freqr) * (freqy) - freqa * freqg -
452 freqc * freqt;
453 bb = freqa * (freqgr) + freqc * (freqty);
454 seq->xi = aa / (aa + bb);
455 seq->xv = 1.0 - seq->xi;
456 if (seq->xi <= 0.0)
457 {
458 warning ("This transition/transversion ratio (%f)\n",seq->ttratio);
459 warning ("is impossible with these base frequencies (%f, %f, %f, %f)!\n",freqa,freqc,freqg,freqt);
460 seq->xi = 0.00001; // do not set this to zero because of the 1/(fracchange=xi*(...))
461 seq->xv = 0.99999;
462 seq->ttratio =
463 (freqa * freqg +
464 freqc * freqt) / ((freqr) * (freqy));
465
466 warning (" Transition/transversion parameter reset\n");
467 warning (" so transition/transversion ratio is %10.6f\n\n",
468 (seq->ttratio));
469 }
470 // use 1/frac as precomputation speed up
471 seq->fracchange = 1. / (
472 (seq->xi) * (2. * freqa * (freqgr) +
473 2. * freqc * (freqty)) + (seq->xv) * (1.0 -
474 freqa *
475 freqa -
476 freqc *
477 freqc -
478 freqg *
479 freqg -
480 freqt *
481 freqt));
482 }
483
484 /*===================================================*/
485
486 void
makeweights(world_fmt * world,data_fmt * data,option_fmt * options,long locus)487 makeweights (world_fmt * world, data_fmt * data, option_fmt *options, long locus)
488 {
489 /* make up weights vector to avoid duplicate computations */
490 long i;
491 seqmodel_fmt *seq = world->data->seq[0];
492 world->data->seq[0]->endsite = 1;
493 for (i = 0; i < seq->sites[locus]; i++)
494 {
495 seq->alias[i] = i + 1;
496 seq->ally[i] = 0;
497 seq->aliasweight[i] = seq->weight[i];
498 seq->location[i] = 0;
499 }
500 sitesort2 (world, data, options, seq->sites[locus], locus);
501 sitecombine2 (world, data, options, seq->sites[locus], locus);
502 sitescrunch2 (world, seq->sites[locus], 1, 2, locus);
503 for (i = 1; i <= seq->sites[locus]; i++)
504 {
505 if (seq->aliasweight[i - 1] > 0)
506 seq->endsite = i;
507 }
508 for (i = 1; i <= seq->endsite; i++)
509 {
510 seq->location[seq->alias[i - 1] - 1] = i;
511 seq->ally[seq->alias[i - 1] - 1] =
512 seq->alias[i - 1];
513 }
514 init_sequences2 (world, seq, locus);
515 memcpy(seq->savealiasweight, seq->aliasweight, sizeof(long) * seq->endsite);
516 } /* makeweights */
517
518 void
init_sequences2(world_fmt * world,seqmodel_fmt * seq,long locus)519 init_sequences2 (world_fmt * world, seqmodel_fmt * seq, long locus)
520 {
521 if (world->contribution == NULL)
522 {
523 world->contribution =
524 (contribarr *) mymalloc ((4 + seq->endsite) * sizeof (contribarr));
525 // printf("%i> temp=%f size=%li: world->contribution allocated\n",myID, world->heat,(4 + seq->endsite) * sizeof (contribarr));
526 }
527 else
528 {
529 world->contribution =
530 (contribarr *) myrealloc (world->contribution,
531 (4 + seq->endsite) * sizeof (contribarr));
532 // printf("%i> temp=%f size=%li: world->contribution REallocated\n",myID, world->heat,(4 + seq->endsite) * sizeof (contribarr));
533 }
534 }
535
536
set_nucleotide(MYREAL * treedata,const char nucleotide,const MYREAL seqerr)537 void set_nucleotide(MYREAL *treedata, const char nucleotide, const MYREAL seqerr)
538 {
539 long b;
540 const MYREAL seqerr3 = seqerr / 3.;
541 const MYREAL seqerr23 = 2. * seqerr / 3.;
542 const MYREAL oneseqerr = 1.0 - seqerr;
543 const MYREAL oneseqerr23 = 1.0 - seqerr23;
544 const MYREAL oneseqerr3 = 1.0 - seqerr3;
545
546 for (b = 0; b < 4; b++)
547 treedata[b] = 0.0 + seqerr3;
548 switch (nucleotide)
549 {
550 case 'A':
551 treedata[NUC_A] = oneseqerr;
552 break;
553
554 case 'C':
555 treedata[NUC_C] = oneseqerr;
556 break;
557
558 case 'G':
559 treedata[NUC_G] = oneseqerr;
560 break;
561
562 case 'T':
563 treedata[NUC_T] = oneseqerr;
564 break;
565
566 case 'U':
567 treedata[NUC_T] = oneseqerr;
568 break;
569
570 case 'M':
571 treedata[NUC_A] = oneseqerr;
572 treedata[NUC_C] = oneseqerr;
573 break;
574
575 case 'R':
576 treedata[NUC_A] = oneseqerr23;
577 treedata[NUC_G] = oneseqerr23;
578 break;
579
580 case 'W':
581 treedata[NUC_A] = oneseqerr23;
582 treedata[NUC_T] = oneseqerr23;
583 break;
584
585 case 'S':
586 treedata[NUC_C] = oneseqerr23;
587 treedata[NUC_G] = oneseqerr23;
588 break;
589
590 case 'Y':
591 treedata[NUC_C] = oneseqerr23;
592 treedata[NUC_T] = oneseqerr23;
593 break;
594
595 case 'K':
596 treedata[NUC_G] = oneseqerr23;
597 treedata[NUC_T] = oneseqerr23;
598 break;
599
600 case 'B':
601 treedata[NUC_A] = seqerr;
602 treedata[NUC_C] = oneseqerr3;
603 treedata[NUC_G] = oneseqerr3;
604 treedata[NUC_T] = oneseqerr3;
605 break;
606
607 case 'D':
608 treedata[NUC_A] = oneseqerr3;
609 treedata[NUC_C] = seqerr;
610 treedata[NUC_G] = oneseqerr3;
611 treedata[NUC_T] = oneseqerr3;
612 break;
613
614 case 'H':
615 treedata[NUC_A] = oneseqerr3;
616 treedata[NUC_C] = oneseqerr3;
617 treedata[NUC_G] = seqerr;
618 treedata[NUC_T] = oneseqerr3;
619 break;
620
621 case 'V':
622 treedata[NUC_A] = oneseqerr3;
623 treedata[NUC_C] = oneseqerr3;
624 treedata[NUC_G] = oneseqerr3;
625 treedata[NUC_T] = seqerr;
626 break;
627
628 case 'N':
629 for (b = 0; b < 4; b++)
630 treedata[b] = 1.0;
631 break;
632
633 case 'X':
634 for (b = 0; b < 4; b++)
635 treedata[b] = 1.0;
636 break;
637
638 case '?':
639 for (b = 0; b < 4; b++)
640 treedata[b] = 1.0;
641 break;
642
643 case 'O':
644 for (b = 0; b < 4; b++)
645 treedata[b] = 1.0;
646 break;
647 #ifdef GAP
648 case '-':
649 treedata[4] = 1.0;
650 break;
651 #else
652 case '-':
653 for (b = 0; b < 4; b++)
654 treedata[b] = 1.0;
655 break;
656 #endif
657 }
658 }
659
660
661 void
makevalues_seq(world_fmt * world,option_fmt * options,data_fmt * data,long locus)662 makevalues_seq (world_fmt * world, option_fmt * options, data_fmt * data,
663 long locus)
664 {
665 seqmodel_fmt *seq = world->data->seq[0];
666 long ii, ind, j, k, l, pop, top;
667 long z=0;
668 node **treenode = world->nodep;
669 MYREAL seqerr = options->seqerror;
670 long zpop;
671 // we need to recalculate the sumtips
672 world->sumtips = 0;
673
674 for (pop = 0; pop < data->numpop; pop++)
675 {
676 zpop = 0;
677 top = max_shuffled_individuals(options, data, pop, locus);
678 for (ii = 0; ii < top; ii++)
679 {
680 ind = data->shuffled[pop][locus][ii];
681 zpop++;
682 // if(options->include_unknowns)
683 // {
684 if (!options->usertree)
685 {
686 allocate_tip (world, options, &(treenode[z]), pop, locus, z, ind,
687 data->indnames[pop][ind]);
688 //treenode[z]->tyme = find_tipdate(treenode[z]->nayme, pop,world);
689 world->data->sampledates[pop][locus][ind].id = treenode[z]->id;
690 z++;
691 }
692 for (k = 0; k < seq->endsite; k++)
693 {
694 j = seq->alias[k] - 1;
695 for (l = 0; l < world->options->rcategs; l++)
696 {
697 set_nucleotide(treenode[z-1]->x.s[k][l],data->yy[pop][ind][locus][0][j], seqerr);
698 }
699 }
700 }
701 data->numalleles[pop][locus] = zpop;
702 world->sumtips += zpop;
703 }
704 }
705
706
707
708 void
makevalues_snp(world_fmt * world,option_fmt * options,data_fmt * data,long locus)709 makevalues_snp (world_fmt * world, option_fmt * options, data_fmt * data,
710 long locus)
711 {
712 long i, ii, j, k, l, pop;
713 long b, f;
714 long ind;
715 seqmodel_fmt *seq = world->data->seq[0];
716 node **treenode = world->nodep;
717 f = seq->addon;
718 for (k = 0; k < seq->endsite * (f + 1);
719 k += (1 + seq->addon))
720 {
721 j = seq->alias[k / (f + 1)] - 1;
722 i = -1;
723 for (pop = 0; pop < data->numpop; pop++)
724 {
725 for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
726 {
727 ind = data->shuffled[pop][locus][ii];
728 i++;
729 if (k==0 && !options->usertree)
730 {
731 strcpy (treenode[i]->nayme, data->indnames[pop][ind][locus]);
732 treenode[i]->tyme = find_tipdate(treenode[i]->nayme, pop,world);
733 }
734 for (l = 0; l < world->options->rcategs; l++)
735 {
736 if (pop == 0)
737 {
738 // treenode[i]->x.s[k][l] = vec_float_one();
739 // reset all values for the snp columns
740 // treenode[i]->x.s[k][l] = vec_zero();
741 for (b = 1; b <= 4; b++)
742 {
743 memset (treenode[i]->x.s[k + b][l], 0,
744 sizeof (MYREAL) * 4);
745 treenode[i]->x.s[k + b][l][b - 1] = 1.0;
746 }
747 continue;
748 }
749 else
750 {
751 for (b = 0; b <= 4; b++)
752 memset (treenode[i]->x.s[k + b][l], 0,
753 sizeof (MYREAL) * 4);
754 }
755
756 switch (data->yy[pop][ind][locus][0][j])
757 {
758 case 'A':
759 treenode[i]->x.s[k][l][NUC_A] = 1.0;
760 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
761 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
762 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
763 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
764 break;
765
766 case 'C':
767 treenode[i]->x.s[k][l][NUC_C] = 1.0;
768 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
769 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
770 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
771 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
772 break;
773
774 case 'G':
775 treenode[i]->x.s[k][l][NUC_G] = 1.0;
776 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
777 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
778 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
779 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
780 break;
781 case 'T':
782 case 'U':
783 treenode[i]->x.s[k][l][NUC_T] = 1.0;
784 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
785 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
786 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
787 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
788 break;
789
790 case 'M':
791 treenode[i]->x.s[k][l][NUC_A] = 1.0;
792 treenode[i]->x.s[k][l][NUC_C] = 1.0;
793 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
794 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
795 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
796 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
797 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
798 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
799 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
800 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
801 break;
802
803 case 'R':
804 treenode[i]->x.s[k][l][NUC_A] = 1.0;
805 treenode[i]->x.s[k][l][NUC_G] = 1.0;
806 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
807 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
808 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
809 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
810 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
811 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
812 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
813 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
814 break;
815
816 case 'W':
817 treenode[i]->x.s[k][l][NUC_A] = 1.0;
818 treenode[i]->x.s[k][l][NUC_T] = 1.0;
819 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
820 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
821 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
822 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
823 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
824 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
825 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
826 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
827 break;
828
829 case 'S':
830 treenode[i]->x.s[k][l][NUC_C] = 1.0;
831 treenode[i]->x.s[k][l][NUC_G] = 1.0;
832 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
833 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
834 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
835 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
836 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
837 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
838 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
839 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
840 break;
841
842 case 'Y':
843 treenode[i]->x.s[k][l][NUC_C] = 1.0;
844 treenode[i]->x.s[k][l][NUC_T] = 1.0;
845 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
846 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
847 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
848 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
849 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
850 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
851 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
852 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
853 break;
854
855 case 'K':
856 treenode[i]->x.s[k][l][NUC_G] = 1.0;
857 treenode[i]->x.s[k][l][NUC_T] = 1.0;
858 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
859 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
860 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
861 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
862 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
863 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
864 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
865 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
866 break;
867
868 case 'B':
869 treenode[i]->x.s[k][l][NUC_C] = 1.0;
870 treenode[i]->x.s[k][l][NUC_G] = 1.0;
871 treenode[i]->x.s[k][l][NUC_T] = 1.0;
872 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
873 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
874 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
875 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
876 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
877 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
878 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
879 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
880 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
881 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
882 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
883 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
884 break;
885
886 case 'D':
887 treenode[i]->x.s[k][l][NUC_A] = 1.0;
888 treenode[i]->x.s[k][l][NUC_G] = 1.0;
889 treenode[i]->x.s[k][l][NUC_T] = 1.0;
890 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
891 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
892 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
893 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
894 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
895 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
896 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
897 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
898 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
899 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
900 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
901 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
902 break;
903
904 case 'H':
905 treenode[i]->x.s[k][l][NUC_A] = 1.0;
906 treenode[i]->x.s[k][l][NUC_C] = 1.0;
907 treenode[i]->x.s[k][l][NUC_T] = 1.0;
908 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
909 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
910 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
911 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
912 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
913 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
914 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
915 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
916 treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
917 treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
918 treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
919 treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
920 break;
921
922 case 'V':
923 treenode[i]->x.s[k][l][NUC_A] = 1.0;
924 treenode[i]->x.s[k][l][NUC_C] = 1.0;
925 treenode[i]->x.s[k][l][NUC_G] = 1.0;
926 treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
927 treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
928 treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
929 treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
930 treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
931 treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
932 treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
933 treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
934 treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
935 treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
936 treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
937 treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
938 break;
939
940 case 'N':
941 case 'X':
942 case '-':
943 case 'O':
944 case '?':
945 for (b = 0; b < 4; b++)
946 {
947 treenode[i]->x.s[k][l][b] = 1.0;
948 treenode[i]->x.s[k + 1][l][b] = 1.0;
949 treenode[i]->x.s[k + 2][l][b] = 1.0;
950 treenode[i]->x.s[k + 3][l][b] = 1.0;
951 treenode[i]->x.s[k + 4][l][b] = 1.0;
952 }
953 break;
954 default:
955 break;
956
957 }
958 }
959 }
960 }
961 }
962 }
963
964
965 void
make_invarsites(world_fmt * world,data_fmt * data,long locus)966 make_invarsites (world_fmt * world, data_fmt * data, long locus)
967 {
968 long i, k, z, ii, pop, l;
969 long endsite = world->data->seq[0]->endsite;
970 node **treenode = world->nodep;
971 i = -1;
972 for (pop = 0; pop < data->numpop; pop++)
973 {
974 for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
975 {
976 i++;
977 z = 0;
978 for (k = endsite; k < endsite + 4; k++)
979 {
980 for (l = 0; l < world->options->rcategs; l++)
981 {
982 memset (treenode[i]->x.s[k][l], 0, sizeof (MYREAL) * 4);
983 treenode[i]->x.s[k][l][z] = 1.0;
984 }
985 z++;
986 }
987 }
988 }
989 }
990
991 void
make_invarsites_unlinked(world_fmt * world,data_fmt * data,long locus)992 make_invarsites_unlinked (world_fmt * world, data_fmt * data, long locus)
993 {
994 long i, k, z, ii, pop, l;
995 long endsite = world->data->seq[0]->endsite;
996 node **treenode = world->nodep;
997 i = -1;
998 for (pop = 0; pop < data->numpop; pop++)
999 {
1000 for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
1001 {
1002 i++;
1003 z = 0;
1004 if (pop == PANEL)
1005 {
1006 for (k = endsite * 5; k < endsite * 5 + 4; k++)
1007 {
1008 for (l = 0; l < world->options->rcategs; l++)
1009 {
1010 #ifdef ALTIVEC
1011 memset (treenode[i]->x.s[k][l].f, 0, sizeof (MYREAL) * 4);
1012 treenode[i]->x.s[k][l].f[z] = 1.0;
1013 #else
1014 memset (treenode[i]->x.s[k][l], 0, sizeof (MYREAL) * 4);
1015 treenode[i]->x.s[k][l][z] = 1.0;
1016 #endif
1017 }
1018 z++;
1019 }
1020 }
1021 else
1022 {
1023 for (k = endsite * 5; k < endsite * 5 + 4; k++)
1024 {
1025 for (l = 0; l < world->options->rcategs; l++)
1026 {
1027 #ifdef ALTIVEC
1028 treenode[i]->x.s[k][l].f[0] = 1.0; //putting ? for data
1029 treenode[i]->x.s[k][l].f[1] = 1.0;
1030 treenode[i]->x.s[k][l].f[2] = 1.0;
1031 treenode[i]->x.s[k][l].f[3] = 1.0;
1032 #else
1033 treenode[i]->x.s[k][l][0] = 1.0; //putting ? for data
1034 treenode[i]->x.s[k][l][1] = 1.0;
1035 treenode[i]->x.s[k][l][2] = 1.0;
1036 treenode[i]->x.s[k][l][3] = 1.0;
1037 #endif
1038 }
1039 }
1040 }
1041 }
1042 }
1043 }
1044
1045 void
sitesort2(world_fmt * world,data_fmt * data,option_fmt * options,long sites,long locus)1046 sitesort2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus)
1047 {
1048 long gap, i, i1, j, jj, jg, k, kk, kkk, itemp, pop, z = 0;
1049 boolean flip, tied, samewt;
1050 seqmodel_fmt *seq;
1051 long *tempsum, *temppop;
1052 long numind;
1053 MYREAL a1,a2;
1054 tempsum = (long *) mycalloc (1, sizeof (long) * data->numpop);
1055 temppop = (long *) mycalloc (1, sizeof (long) * data->numpop);
1056 for (i = 0; i < data->numpop; i++)
1057 {
1058 if(options->randomsubset > 0)
1059 {
1060 numind = (options->randomsubset < data->numind[i][locus] ? options->randomsubset : data->numind[i][locus]);
1061 }
1062 else
1063 {
1064 numind = data->numind[i][locus] ;
1065 }
1066 // if (numind > 0)
1067 // {
1068 temppop[z] = i;
1069 if (z == 0)
1070 tempsum[z] = numind;
1071 else
1072 tempsum[z] = tempsum[z - 1] + numind;
1073 z++;
1074 // }
1075 }
1076 seq = world->data->seq[0];
1077 gap = sites / 2;
1078 while (gap > 0)
1079 {
1080 for (i = gap + 1; i <= sites; i++)
1081 {
1082 j = i - gap;
1083 flip = TRUE;
1084 while (j > 0 && flip)
1085 {
1086 jj = seq->alias[j - 1];
1087 jg = seq->alias[j + gap - 1];
1088 samewt = ((seq->weight[jj - 1] != 0)
1089 && (seq->weight[jg - 1] != 0))
1090 || ((seq->weight[jj - 1] == 0) && (seq->weight[jg - 1] == 0));
1091 tied = samewt
1092 && (seq->category[jj - 1] == seq->category[jg - 1]);
1093 flip = ((!samewt) && (seq->weight[jj - 1] == 0))
1094 || (samewt
1095 && (seq->category[jj - 1] > seq->category[jg - 1]));
1096 k = 0;
1097 pop = 0;
1098 kk = -1;
1099 while (k < world->sumtips && tied)
1100 {
1101 if (k == tempsum[pop])
1102 {
1103 kk = 0;
1104 pop++;
1105 }
1106 else
1107 {
1108 kk++;
1109 }
1110 i1 = temppop[pop];
1111 //debug1
1112 if (data->numind[i1][locus]>0)
1113 {
1114 kkk = data->shuffled[i1][locus][kk];
1115 a1 = data->yy[i1][kkk][locus][0][jj - 1];
1116 a2 = data->yy[i1][kkk][locus][0][jg - 1];
1117 flip =
1118 (a1 > a2);
1119 tied = (tied
1120 && a1 == a2);
1121 }
1122 k++;
1123 }
1124 if (!flip)
1125 break;
1126 itemp = seq->alias[j - 1];
1127 seq->alias[j - 1] = seq->alias[j + gap - 1];
1128 seq->alias[j + gap - 1] = itemp;
1129 itemp = seq->aliasweight[j - 1];
1130 seq->aliasweight[j - 1] = seq->aliasweight[j + gap - 1];
1131 seq->aliasweight[j + gap - 1] = itemp;
1132 j -= gap;
1133 }
1134 }
1135 gap /= 2;
1136 }
1137 myfree(tempsum);
1138 myfree(temppop);
1139 } /* sitesort2 */
1140
1141
1142 void
sitecombine2(world_fmt * world,data_fmt * data,option_fmt * options,long sites,long locus)1143 sitecombine2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus)
1144 {
1145 long i, i1, j, k, kk, pop, z = 0;
1146 boolean tied, samewt;
1147 seqmodel_fmt *seq;
1148 long *tempsum, *temppop;
1149 long numind;
1150 long kkk;
1151 tempsum = (long *) mycalloc (1, sizeof (long) * data->numpop);
1152 temppop = (long *) mycalloc (1, sizeof (long) * data->numpop);
1153 if(options->randomsubset > 0)
1154 {
1155 tempsum[0] = (options->randomsubset < data->numind[0][locus] ? options->randomsubset : data->numind[0][locus]);
1156 }
1157 else
1158 {
1159 tempsum[0] = data->numind[0][locus];
1160 }
1161
1162 for (i = 0; i < data->numpop; i++)
1163 {
1164 numind = ((options->randomsubset > 0) && (options->randomsubset < data->numind[i][locus])) ? options->randomsubset : data->numind[i][locus];
1165 //debug2
1166 // if (numind > 0)
1167 // {
1168 temppop[z] = i;
1169 if (z == 0)
1170 tempsum[z] = numind;
1171 else
1172 tempsum[z] = tempsum[z - 1] + numind;
1173 z++;
1174 // }
1175 }
1176
1177 seq = world->data->seq[0];
1178 i = 1;
1179 while (i < sites)
1180 {
1181 j = i + 1;
1182 tied = TRUE;
1183 while (j <= sites && tied)
1184 {
1185 samewt = ((seq->aliasweight[i - 1] != 0)
1186 && (seq->aliasweight[j - 1] != 0))
1187 || ((seq->aliasweight[i - 1] == 0)
1188 && (seq->aliasweight[j - 1] == 0));
1189 tied = samewt
1190 && (seq->category[seq->alias[i - 1] - 1] ==
1191 seq->category[seq->alias[j - 1] - 1]);
1192 k = 0;
1193 pop = 0;
1194 kk = -1;
1195 while (k < world->sumtips && tied)
1196 {
1197 if (k == tempsum[pop])
1198 {
1199 kk = 0;
1200 pop++;
1201 }
1202 else
1203 {
1204 kk++;
1205 }
1206 i1 = temppop[pop];
1207 if (data->numind[i1][locus]>0)
1208 {
1209 kkk = data->shuffled[i1][locus][kk];
1210 tied = (tied
1211 && data->yy[i1][kkk][locus][0][seq->
1212 alias[i - 1] -
1213 1] ==
1214 data->yy[i1][kkk][locus][0][seq->alias[j - 1] -
1215 1]);
1216 }
1217 k++;
1218 }
1219 if (!tied)
1220 break;
1221 seq->aliasweight[i - 1] += seq->aliasweight[j - 1];
1222 seq->aliasweight[j - 1] = 0;
1223 seq->ally[seq->alias[j - 1] - 1] = seq->alias[i - 1];
1224 j++;
1225 }
1226 i = j;
1227 }
1228 myfree(temppop);
1229 myfree(tempsum);
1230 } /* sitecombine2 */
1231
1232
1233 void
sitescrunch2(world_fmt * world,long sites,long i,long j,long locus)1234 sitescrunch2 (world_fmt * world, long sites, long i, long j, long locus)
1235 {
1236 /* move so positively weighted sites come first */
1237 /* used by dnainvar, dnaml, dnamlk, & restml */
1238 long itemp;
1239 boolean done, found;
1240 seqmodel_fmt *seq;
1241 seq = world->data->seq[0];
1242 done = FALSE;
1243 while (!done)
1244 {
1245 //found = FALSE;
1246 if (seq->aliasweight[i - 1] > 0)
1247 i++;
1248 else
1249 {
1250 if (j <= i)
1251 j = i + 1;
1252 if (j <= sites)
1253 {
1254 //found = FALSE;
1255 do
1256 {
1257 found = (seq->aliasweight[j - 1] > 0);
1258 j++;
1259 }
1260 while (!(found || j > sites));
1261 if (found)
1262 {
1263 j--;
1264 itemp = seq->alias[i - 1];
1265 seq->alias[i - 1] = seq->alias[j - 1];
1266 seq->alias[j - 1] = itemp;
1267 itemp = seq->aliasweight[i - 1];
1268 seq->aliasweight[i - 1] = seq->aliasweight[j - 1];
1269 seq->aliasweight[j - 1] = itemp;
1270 }
1271 else
1272 done = TRUE;
1273 }
1274 else
1275 done = TRUE;
1276 }
1277 done = (done || i >= sites);
1278 }
1279 } /* sitescrunch2 */
1280
1281 void
inputoptions(world_fmt * earth,world_fmt * world,option_fmt * options,data_fmt * data,long locus)1282 inputoptions (world_fmt *earth, world_fmt * world, option_fmt * options, data_fmt * data,
1283 long locus)
1284 {
1285 long i;
1286 long sites = world->data->seq[0]->sites[locus];
1287 for (i = 0; i < sites; i++)
1288 world->data->seq[0]->category[i] = 1;
1289 for (i = 0; i < sites; i++)
1290 world->data->seq[0]->weight[i] = 1;
1291 if (options->weights)
1292 inputweights (world, data, sites);
1293 world->data->seq[0]->weightsum = 0;
1294 for (i = 0; i < sites; i++)
1295 world->data->seq[0]->weightsum += world->data->seq[0]->weight[i];
1296 if (world->options->categs > 1)
1297 {
1298 inputcategs (0, sites, world, options, data, earth);
1299 }
1300 } /* inputoptions */
1301
1302 void
inputweights(world_fmt * world,data_fmt * data,long chars)1303 inputweights (world_fmt * world, data_fmt * data, long chars)
1304 {
1305 /* input the character weights, 0-9 and A-Z for weights 0 - 35 */
1306 char ch;
1307 long i;
1308 char input[1024];
1309
1310 ch = getc (data->weightfile);
1311 while (ch == '#')
1312 {
1313 FGETS(input, LINESIZE,data->weightfile);
1314 ch = getc (data->weightfile);
1315 }
1316 ungetc (ch, data->weightfile);
1317 for(i=0;i<chars;i++)
1318 {
1319 world->data->seq[0]->weight[i] = 1;
1320 ch = getc (data->weightfile);
1321
1322 if (isdigit ((int) ch))
1323 world->data->seq[0]->weight[i] = ch - '0';
1324 else
1325 {
1326 if (isalpha ((int) ch))
1327 {
1328 ch = uppercase (ch);
1329 world->data->seq[0]->weight[i] = (short) (ch - 'A' + 10);
1330 }
1331 else
1332 {
1333 if(isspace((int) ch))
1334 {
1335 i--;
1336 continue;
1337 }
1338 else
1339 printf ("ERROR: Bad weight character: %c\n", ch);
1340 exit (EXIT_FAILURE);
1341 }
1342 }
1343 }
1344 } /* inputweights */
1345
1346 void
inputcategs(long a,long b,world_fmt * world,option_fmt * options,data_fmt * data,world_fmt * earth)1347 inputcategs (long a, long b, world_fmt * world, option_fmt * options,
1348 data_fmt * data, world_fmt *earth)
1349 {
1350 /* input the categories, 1-9 */
1351 char ch;
1352 long i;
1353 char input[LINESIZE];
1354 int retval;
1355 if(data->catfile!=NULL)
1356 {
1357 ch = getc (data->catfile);
1358 while (ch == '#')
1359 {
1360 FGETS(input, LINESIZE,data->catfile);
1361 ch = getc (data->catfile);
1362 }
1363 ungetc (ch, data->catfile);
1364 retval = fscanf (data->catfile, "%ld", &options->categs);
1365 options->rate =
1366 (MYREAL *) myrealloc (options->rate, sizeof (MYREAL) * options->categs);
1367 for (i = 0; i < options->categs; i++)
1368 {
1369 #ifdef USE_MYREAL_FLOAT
1370 retval = fscanf (data->catfile, "%f", &options->rate[i]);
1371 #else
1372 retval = fscanf (data->catfile, "%lf", &options->rate[i]);
1373 #endif
1374 }
1375 for (i = a; i < b; i++)
1376 {
1377 if ((ch >= '1') && (ch <= ('0' + world->options->categs)))
1378 world->data->seq[0]->category[i] = ch - '0';
1379 else
1380 {
1381 if(isspace((int) ch))
1382 {
1383 i--;
1384 continue;
1385 }
1386 else
1387 {
1388 printf
1389 ("BAD CATEGORY CHARACTER: %c -- CATEGORIES ARE CURRENTLY 1-%ld\n",
1390 ch, world->options->categs);
1391 exit (EXIT_FAILURE);
1392 }
1393 }
1394 }
1395 fclose(data->catfile);
1396 data->catfile=NULL;
1397 }
1398 else
1399 {
1400 memcpy(world->data->seq[0]->category,earth->data->seq[0]->category,sizeof(int)*b);
1401 }
1402 } /* inputcategs */
1403
1404
1405
1406
1407 void
empiricalfreqs(world_fmt * world,option_fmt * options,seqmodel_fmt * seq,long locus)1408 empiricalfreqs (world_fmt * world, option_fmt * options, seqmodel_fmt * seq,
1409 long locus)
1410 {
1411 /* Get empirical base frequencies from the data */
1412 long i, j, k;
1413 MYREAL summ, suma, sumc, sumg, sumt, w;
1414 long snps = world->options->datatype == 'u' ? 5 : 1;
1415 options->freqa = 0.25;
1416 options->freqc = 0.25;
1417 options->freqg = 0.25;
1418 options->freqt = 0.25;
1419 for (k = 1; k <= 8; k++)
1420 {
1421 suma = 0.0;
1422 sumc = 0.0;
1423 sumg = 0.0;
1424 sumt = 0.0;
1425 for (i = 0; i < world->sumtips; i++)
1426 {
1427 for (j = 0; j < seq->endsite * snps; j += snps)
1428 {
1429 w = (MYREAL) seq->aliasweight[j / snps];
1430 summ = (options->freqa) * world->nodep[i]->x.s[j][0][NUC_A] + (options->freqc) * world->nodep[i]->x.s[j][0][NUC_C]
1431 + (options->freqg) * world->nodep[i]->x.s[j][0][NUC_G]
1432 + (options->freqt) * world->nodep[i]->x.s[j][0][NUC_T];
1433 suma += w * (options->freqa) * world->nodep[i]->x.s[j][0][NUC_A] / summ;
1434 sumc += w * (options->freqc) * world->nodep[i]->x.s[j][0][NUC_C] / summ;
1435 sumg += w * (options->freqg) * world->nodep[i]->x.s[j][0][NUC_G] / summ;
1436 sumt += w * (options->freqt) * world->nodep[i]->x.s[j][0][NUC_T] / summ;
1437
1438 }
1439 }
1440 summ = suma + sumc + sumg + sumt;
1441 options->freqa = suma / summ;
1442 options->freqc = sumc / summ;
1443 options->freqg = sumg / summ;
1444 options->freqt = sumt / summ;
1445 }
1446 } /* empiricalfreqs */
1447
1448
1449 void
initlambda(option_fmt * options)1450 initlambda (option_fmt * options)
1451 {
1452 int retval;
1453 while (options->lambda <= 1.0)
1454 {
1455 printf
1456 ("Mean block length of sites having the same rate\n (needs to be greater than 1)?\n");
1457 #ifdef USE_MYREAL_FLOAT
1458 retval = scanf ("%f%*[^\n]", &options->lambda);
1459 #else
1460 retval = scanf ("%lf%*[^\n]", &options->lambda);
1461 #endif
1462 getchar ();
1463 }
1464 options->lambda = 1.0 / options->lambda;
1465 }
1466
1467
1468
1469 void
init_tbl(world_fmt * world,long locus)1470 init_tbl (world_fmt * world, long locus)
1471 {
1472 /* Define a lookup table. Precompute values and print them out in tables */
1473 long i, j;
1474 MYREAL sumrates;
1475 long categs = world->options->categs;
1476 long rcategs = world->options->rcategs;
1477 world->tbl = (valrec ***) mymalloc (rcategs * sizeof (valrec **));
1478 for (i = 0; i < rcategs; i++)
1479 {
1480 world->tbl[i] = (valrec **) mymalloc (categs * sizeof (valrec *));
1481 for (j = 0; j < categs; j++)
1482 world->tbl[i][j] = (valrec *) mymalloc (sizeof (valrec));
1483 }
1484 for (i = 0; i < rcategs; i++)
1485 {
1486 for (j = 0; j < categs; j++)
1487 {
1488 world->tbl[i][j]->rat =
1489 world->options->rrate[i] * world->options->rate[j];
1490 world->tbl[i][j]->ratxi =
1491 world->tbl[i][j]->rat * world->data->seq[0]->xi;
1492 world->tbl[i][j]->ratxv =
1493 world->tbl[i][j]->rat * world->data->seq[0]->xv;
1494 }
1495 }
1496 sumrates = 0.0;
1497 for (i = 0; i < world->data->seq[0]->oldsite; i++)
1498 {
1499 for (j = 0; j < rcategs; j++)
1500 sumrates +=
1501 world->data->seq[0]->aliasweight[i] * world->options->probcat[j] *
1502 world->tbl[j][world->data->seq[0]->
1503 category[world->data->seq[0]->alias[i] - 1] - 1]->rat;
1504 }
1505 sumrates /= (MYREAL) world->data->seq[0]->sites[locus];
1506 for (i = 0; i < rcategs; i++)
1507 for (j = 0; j < categs; j++)
1508 {
1509 world->tbl[i][j]->rat /= sumrates;
1510 world->tbl[i][j]->ratxi /= sumrates;
1511 world->tbl[i][j]->ratxv /= sumrates;
1512 }
1513 } /* inittable */
1514
1515 void
print_weights(FILE * outfile,world_fmt * world,option_fmt * options,long locus)1516 print_weights (FILE * outfile, world_fmt * world, option_fmt * options,
1517 long locus)
1518 {
1519 if (options->weights)
1520 {
1521 if ((options->printdata) || (options->progress && outfile == stdout))
1522 {
1523 printweights (outfile, world, options, 0,
1524 world->data->seq[0]->sites[locus],
1525 world->data->seq[0]->weight, "Sites");
1526 }
1527 }
1528 }
1529
1530 void
print_tbl(FILE * outfile,world_fmt * world,option_fmt * options,long locus)1531 print_tbl (FILE * outfile, world_fmt * world, option_fmt * options,
1532 long locus)
1533 {
1534 long i;
1535
1536 option_fmt *opt;
1537
1538
1539 opt = options;
1540 if (opt->rcategs > 1)
1541 {
1542 FPRINTF (outfile, "\nRegion type Rate of change Probability\n");
1543 FPRINTF (outfile, "---------------------------------------------\n");
1544 for (i = 0; i < opt->rcategs; i++)
1545 FPRINTF (outfile, "%9ld%16.3f%17.3f\n", i + 1, opt->rrate[i],
1546 opt->probcat[i]);
1547 FPRINTF (outfile,"\n");
1548 if (opt->autocorr)
1549 FPRINTF (outfile,
1550 "Expected length of a patch of sites having the same rate = %8.3f\n",
1551 1. / opt->lambda);
1552 FPRINTF (outfile,"\n");
1553 }
1554 if (opt->categs > 1)
1555 {
1556 FPRINTF (outfile, "Site category Rate of change\n");
1557 FPRINTF (outfile, "------------------------------\n");
1558 for (i = 0; i < opt->categs; i++)
1559 FPRINTF (outfile, "%9ld%16.3f\n", i + 1, opt->rate[i]);
1560 }
1561 if ((opt->rcategs > 1) || (opt->categs > 1))
1562 FPRINTF (outfile, "\n");
1563 }
1564
1565
1566 void
printweights(FILE * outfile,world_fmt * world,option_fmt * options,short inc,long chars,short * weight,char * letters)1567 printweights (FILE * outfile, world_fmt * world, option_fmt * options,
1568 short inc, long chars, short *weight, char *letters)
1569 {
1570 /* print out the weights of sites */
1571 long i, j;
1572 FPRINTF (outfile, "\n %s are weighted as follows:\n", letters);
1573 for (i = 0; i < chars; i++)
1574 {
1575 if (i % 60 == 0)
1576 {
1577 FPRINTF (outfile, "\n");
1578 for (j = 1; j <= options->nmlength + 3; j++)
1579 FPRINTF (outfile, " ");
1580 }
1581 FPRINTF (outfile, "%hd", weight[i + inc]);
1582 if ((i + 1) % 10 == 0 && (i + 1) % 60 != 0)
1583 FPRINTF (outfile, " ");
1584 }
1585 FPRINTF (outfile, "\n\n");
1586 } /* printweights */
1587
1588
1589
1590 void
print_seqfreqs(FILE * outfile,world_fmt * world,option_fmt * options)1591 print_seqfreqs (FILE * outfile, world_fmt * world, option_fmt * options)
1592 {
1593 if(outfile==NULL)
1594 return;
1595
1596 if (world->locus == 0 || outfile == stdout)
1597 {
1598 if (options->freqsfrom)
1599 FPRINTF (outfile, "\nEmpirical ");
1600 FPRINTF (outfile, "Base Frequencies\n");
1601 FPRINTF (outfile,
1602 "------------------------------------------------------------\n");
1603 FPRINTF (outfile,
1604 "Locus Nucleotide Transition/\n");
1605 FPRINTF (outfile,
1606 " ------------------------------ Transversion ratio\n");
1607 FPRINTF (outfile, " A C G T(U)\n");
1608 FPRINTF (outfile,
1609 "------------------------------------------------------------\n");
1610 }
1611 FPRINTF (outfile, "%4li %6.4f %6.4f %6.4f %6.4f %10.5f\n",
1612 world->locus + 1, options->freqa, options->freqc, options->freqg,
1613 options->freqt, world->data->seq[0]->ttratio);
1614 if (outfile == stdout)
1615 FPRINTF (outfile, "\n");
1616
1617 }
1618
1619 MYREAL
treelike_seq(world_fmt * world,long locus)1620 treelike_seq (world_fmt * world, long locus)
1621 {
1622 const seqmodel_fmt *seq = world->data->seq[0];
1623 const MYREAL freqa = seq->basefrequencies[NUC_A];
1624 const MYREAL freqc = seq->basefrequencies[NUC_C];
1625 const MYREAL freqg = seq->basefrequencies[NUC_G];
1626 const MYREAL freqt = seq->basefrequencies[NUC_T];
1627
1628 contribarr tterm;
1629 contribarr like;
1630 contribarr nulike;
1631 contribarr clai;
1632 // long size = sizeof(MYREAL) * world->options->rcategs;
1633 MYREAL summ, sum2, sumc, sumterm, lterm;
1634 long i, j, k, lai;
1635 MYREAL scale;
1636 node *p;
1637 sitelike *x1;
1638 worldoption_fmt *opt;
1639 opt = world->options;
1640 p = crawlback (world->root->next);
1641 summ = 0.0;
1642
1643 if (opt->rcategs == 1)
1644 {
1645 for (i = 0; i < seq->endsite; i++)
1646 {
1647 x1 = &(p->x.s[i][0]);
1648 scale = p->scale[i];
1649 tterm[0] =
1650 freqa * (*x1)[0] + freqc * (*x1)[1] +
1651 freqg * (*x1)[2] + freqt * (*x1)[3];
1652 summ += seq->aliasweight[i] * (LOG (tterm[0]) + scale);
1653 }
1654 }
1655 else
1656 {
1657 for (i = 0; i < seq->endsite; i++)
1658 {
1659 scale = p->scale[i];
1660 //k = seq->category[seq->alias[i] - 1] - 1;
1661 for (j = 0; j < opt->rcategs; j++)
1662 {
1663 x1 = &(p->x.s[i][j]);
1664 tterm[j] =
1665 freqa * (*x1)[0] + freqc * (*x1)[1] +
1666 freqg * (*x1)[2] + freqt * (*x1)[3];
1667 }
1668 sumterm = 0.0;
1669 for (j = 0; j < opt->rcategs; j++)
1670 sumterm += opt->probcat[j] * tterm[j];
1671 lterm = LOG (sumterm) + scale;
1672 for (j = 0; j < opt->rcategs; j++)
1673 clai[j] = tterm[j] / sumterm;
1674 swap (clai, world->contribution[i]);
1675 // memcpy (world->contribution[i], clai, size);
1676 summ += seq->aliasweight[i] * lterm;
1677 if(MYISNAN((float)summ))
1678 {
1679 // error("summ is not a number, should not happen\n");
1680 summ = -HUGE;
1681 }
1682
1683 }
1684 for (j = 0; j < opt->rcategs; j++)
1685 like[j] = 1.0;
1686 for (i = 0; i < seq->sites[locus]; i++)
1687 {
1688 sumc = 0.0;
1689 for (k = 0; k < opt->rcategs; k++)
1690 sumc += opt->probcat[k] * like[k];
1691 sumc *= opt->lambda;
1692 if ((seq->ally[i] > 0) && (seq->location[seq->ally[i] - 1] > 0))
1693 {
1694 lai = seq->location[seq->ally[i] - 1];
1695 swap (world->contribution[lai - 1], clai);
1696 //memcpy (clai, world->contribution[lai - 1], size);
1697 for (j = 0; j < opt->rcategs; j++)
1698 nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc) * clai[j];
1699 }
1700 else
1701 {
1702 for (j = 0; j < opt->rcategs; j++)
1703 nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc);
1704 }
1705 swap (nulike, like);
1706 //memcpy (like, nulike, size);
1707 }
1708 sum2 = 0.0;
1709 for (i = 0; i < opt->rcategs; i++)
1710 sum2 += opt->probcat[i] * like[i];
1711 summ += LOG (sum2);
1712 }
1713 return summ;
1714 } /* treelikelihood */
1715
1716 void
snp_invariants_original(contribarr invariants,long endsite,long rcategs,seqmodel_fmt * seq,phenotype x1)1717 snp_invariants_original (contribarr invariants, long endsite, long rcategs,
1718 seqmodel_fmt * seq, phenotype x1)
1719 {
1720 long i, j;
1721 MYREAL *val;
1722 register MYREAL freqa, freqc, freqg, freqt;
1723 freqa = seq->basefrequencies[NUC_A];
1724 freqc = seq->basefrequencies[NUC_C];
1725 freqg = seq->basefrequencies[NUC_G];
1726 freqt = seq->basefrequencies[NUC_T];
1727
1728 memset (invariants, 0, sizeof (contribarr));
1729 for (j = 0; j < rcategs; j++)
1730 {
1731 for (i = endsite; i < endsite + 4; i++)
1732 {
1733 val = x1[i][j];
1734 invariants[j] +=
1735 (freqa * val[0] + freqc * val[1] +
1736 freqg * val[2] + freqt * val[3]);
1737 }
1738 }
1739 for (j = 0; j < rcategs; j++)
1740 invariants[j] = 1. - invariants[j];
1741 }
1742
1743 void
snp_invariants(contribarr invariants,world_fmt * world,long locus,phenotype x1,MYREAL * scaler)1744 snp_invariants (contribarr invariants, world_fmt *world, long locus, phenotype x1, MYREAL *scaler)
1745 {
1746 worldoption_fmt *opt;
1747 seqmodel_fmt *seq = world->data->seq[0];
1748 //MYREAL summ;
1749
1750 sitelike *val;
1751 long i, j;
1752 register MYREAL freqa, freqc, freqg, freqt;
1753 freqa = seq->basefrequencies[NUC_A];
1754 freqc = seq->basefrequencies[NUC_C];
1755 freqg = seq->basefrequencies[NUC_G];
1756 freqt = seq->basefrequencies[NUC_T];
1757
1758 opt = world->options;
1759 //summ = 0.0;
1760 /* snp invariants */
1761 for (j = 0; j < opt->rcategs; j++)
1762 {
1763 invariants[j] = 0.0;
1764 for (i = seq->endsite - seq->addon ; i < seq->endsite; i++)
1765 {
1766 val = &(x1[i][j]);
1767 invariants[j] +=
1768 exp(scaler[i]) * ((freqa * (*val)[0] + freqc * (*val)[1] +
1769 freqg * (*val)[2] + freqt * (*val)[3]));
1770 }
1771 invariants[j] = 1.0 - invariants[j];
1772 }
1773 // printf("invariant0=%f\n",invariants[0]);
1774 } /* snp_invariants*/
1775
1776
1777 MYREAL
treelike_snp(world_fmt * world,long locus)1778 treelike_snp (world_fmt * world, long locus)
1779 {
1780 worldoption_fmt *opt;
1781 seqmodel_fmt *seq = world->data->seq[0];
1782 MYREAL scale;
1783 contribarr tterm;
1784 contribarr invariants;
1785 contribarr like;
1786 contribarr nulike;
1787 MYREAL summ, sum2, sumc, sumterm, lterm;
1788 long i, j, k, lai;
1789
1790 node *p;
1791 sitelike *x1;
1792 register MYREAL freqa, freqc, freqg, freqt;
1793 freqa = seq->basefrequencies[NUC_A];
1794 freqc = seq->basefrequencies[NUC_C];
1795 freqg = seq->basefrequencies[NUC_G];
1796 freqt = seq->basefrequencies[NUC_T];
1797
1798 opt = world->options;
1799 p = crawlback (world->root->next);
1800 summ = 0.0;
1801 /* snp invariants */
1802 snp_invariants (invariants,world, locus, p->x.s, p->scale);
1803 for (j = 0; j < opt->rcategs; j++)
1804 {
1805 if(invariants[j] <= 0.0)
1806 {
1807 #ifdef DEBUG
1808 printf("invariants are funny %f\n", invariants[j]);
1809 #endif
1810 return -MYREAL_MAX;
1811 }
1812 }
1813 if(opt->rcategs==1)
1814 {
1815 for (i = 0; i < seq->endsite - seq->addon; i++)
1816 {
1817 scale = p->scale[i];
1818 x1 = &(p->x.s[i][j]);
1819 tterm[0] = (freqa * (*x1)[0] + freqc * (*x1)[1] + freqg * (*x1)[2] + freqt * (*x1)[3]);
1820 summ += seq->aliasweight[i] * (LOG (tterm[0]) + scale - log(invariants[0]));
1821 }
1822 return summ;
1823 }
1824 else
1825 {
1826 for (i = 0; i < seq->endsite - seq->addon; i++)
1827 {
1828 scale = p->scale[i];
1829 for (j = 0; j < opt->rcategs; j++)
1830 {
1831 x1 = &(p->x.s[i][j]);
1832 tterm[j] =
1833 (freqa * (*x1)[0] + freqc * (*x1)[1] +
1834 freqg * (*x1)[2] + freqt * (*x1)[3]) /invariants[j];
1835 }
1836 sumterm = 0.0;
1837 for (j = 0; j < opt->rcategs; j++)
1838 sumterm += opt->probcat[j] * tterm[j];
1839 lterm = LOG (sumterm) + scale;
1840 for (j = 0; j < opt->rcategs; j++)
1841 world->contribution[i][j] = tterm[j] / sumterm;
1842
1843 summ += seq->aliasweight[i] * lterm;
1844 } /* over endsite - 4[snp-invariants] */
1845 for (j = 0; j < opt->rcategs; j++)
1846 like[j] = 1.0;
1847 for (i = 0; i < seq->sites[locus]; i++)
1848 {
1849 sumc = 0.0;
1850 for (k = 0; k < opt->rcategs; k++)
1851 sumc += opt->probcat[k] * like[k];
1852 sumc *= opt->lambda;
1853 if ((seq->ally[i] > 0) && (seq->location[seq->ally[i] - 1] > 0))
1854 {
1855 lai = seq->location[seq->ally[i] - 1];
1856 for (j = 0; j < opt->rcategs; j++)
1857 nulike[j] =
1858 ((1.0 - opt->lambda) * like[j] +
1859 sumc) * world->contribution[lai - 1][j];
1860 }
1861 else
1862 {
1863 for (j = 0; j < opt->rcategs; j++)
1864 nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc);
1865 }
1866 swap (nulike, like);
1867 }
1868 sum2 = 0.0;
1869 for (i = 0; i < opt->rcategs; i++)
1870 sum2 += opt->probcat[i] * like[i];
1871 summ += LOG (sum2);
1872 return summ;
1873 }
1874 } /* treelike_snp */
1875
1876
1877 MYREAL
treelike_snp_unlinked(world_fmt * world,long locus)1878 treelike_snp_unlinked (world_fmt * world, long locus)
1879 {
1880 //worldoption_fmt *opt;
1881 seqmodel_fmt *seq = world->data->seq[0];
1882 MYREAL scale;
1883 contribarr tterm, invariants;
1884 MYREAL summ, datasum = 0, lterm, result = 0;
1885 long i, ii;
1886
1887 node *p;
1888 sitelike *x1;
1889
1890 register MYREAL freqa, freqc, freqg, freqt;
1891
1892 freqa = seq->basefrequencies[NUC_A];
1893 freqc = seq->basefrequencies[NUC_C];
1894 freqg = seq->basefrequencies[NUC_G];
1895 freqt = seq->basefrequencies[NUC_T];
1896
1897 //opt = world->options;
1898 //seq = world->data->seq[0];
1899 p = crawlback (world->root->next);
1900 summ = 0.0;
1901 /* snp invariants */
1902 snp_invariants (invariants, world, locus, p->x.s, p->scale);
1903 /* no rate categories used */
1904 for (i = 0; i < seq->endsite; i++)
1905 {
1906 ii = i / 5;
1907 scale = p->scale[i];
1908 x1 = &(p->x.s[i][0]);
1909 tterm[0] =
1910 (freqa * (*x1)[0] + freqc * (*x1)[1] +
1911 freqg * (*x1)[2] + freqt * (*x1)[3]);
1912 if (i % 5 == 0)
1913 {
1914 lterm = LOG (tterm[0]) + scale;
1915 summ = 0;
1916 datasum = seq->aliasweight[ii] * lterm;
1917 }
1918 else
1919 summ += pow (tterm[0], (MYREAL) seq->aliasweight[ii]);
1920 if (((i + 1) % 5) == 0 && i != 0)
1921 {
1922 if(summ > 0.)
1923 {
1924 result +=
1925 datasum + LOG ((1 - EXP (LOG (summ) - datasum)) / invariants[0]);
1926 }
1927 }
1928 }
1929 return result;
1930 } /* treelike_snp_unlinked */
1931
1932 void
check_basefreq(option_fmt * options)1933 check_basefreq (option_fmt * options)
1934 {
1935
1936 if (options->freqa == 0. || options->freqc == 0. || options->freqt == 0.
1937 || options->freqg == 0.)
1938 {
1939 options->freqa = 0.25;
1940 options->freqc = 0.25;
1941 options->freqg = 0.25;
1942 options->freqt = 0.25;
1943 }
1944 }
1945
1946
1947 void
copy_seq(world_fmt * original,world_fmt * kopie)1948 copy_seq (world_fmt * original, world_fmt * kopie)
1949 {
1950 long sites;
1951 seqmodel_fmt *kseq;
1952 seqmodel_fmt *oseq;
1953 kseq = kopie->data->seq[0];
1954 oseq = original->data->seq[0];
1955 sites = oseq->sites[original->locus];
1956 memcpy(kseq->basefrequencies,oseq->basefrequencies,sizeof(MYREAL)*BASEFREQLENGTH);
1957 kseq->aa = oseq->aa;
1958 kseq->bb = oseq->bb;
1959 kseq->endsite = oseq->endsite;
1960 kseq->xi = oseq->xi;
1961 kseq->xv = oseq->xv;
1962 kseq->ttratio = oseq->ttratio;
1963 kseq->fracchange = oseq->fracchange;
1964 memcpy (kseq->sites, oseq->sites, sizeof (long) * original->loci);
1965 memcpy (kseq->alias, oseq->alias, sizeof (long) * sites);
1966 memcpy (kseq->ally, oseq->ally, sizeof (long) * sites);
1967 memcpy (kseq->category, oseq->category, sizeof (long) * sites);
1968 memcpy (kseq->weight, oseq->weight, sizeof (short) * sites);
1969 kseq->weightsum = oseq->weightsum;
1970 memcpy (kseq->aliasweight, oseq->aliasweight, sizeof (long) * sites);
1971 memcpy (kseq->location, oseq->location, sizeof (long) * sites);
1972 kseq->addon = oseq->addon;
1973 }
1974
1975
find_rates_fromdata(data_fmt * data,option_fmt * options,world_fmt * world)1976 void find_rates_fromdata(data_fmt * data, option_fmt * options, world_fmt *world)
1977 {
1978 long locus;
1979 long pop;
1980 long ind;
1981 long sites;
1982 long i;
1983 long n;
1984 long s;
1985 long **v;
1986 long maxsites = 1;
1987 long *numind;
1988 char *reference;
1989 char *indseq;
1990 long loci = data->loci;
1991 long numpop = data->numpop;
1992 long segreg;
1993 MYREAL mean=0.;
1994 MYREAL delta = 0.;
1995
1996 numind = (long *) mycalloc (data->loci, sizeof (long));
1997 options->segregs = (long *) mycalloc (data->loci, sizeof (long));
1998 options->wattersons = (MYREAL *) mycalloc (data->loci, sizeof (MYREAL));
1999 for (locus=0; locus < loci; locus++)
2000 {
2001 maxsites = (data->seq[0]->sites[locus] > maxsites ? data->seq[0]->sites[locus] : maxsites);
2002 for(pop=0;pop < data->numpop; pop++)
2003 {
2004 numind[locus] += data->numind[pop][locus];
2005 }
2006 }
2007
2008 v = (long **) mycalloc (loci, sizeof (long *));
2009 v[0] = (long *) mycalloc (loci * maxsites, sizeof (long));
2010 for (i = 1; i < loci; i++)
2011 {
2012 v[i] = v[0] + maxsites * i;
2013 }
2014
2015 for (locus=0; locus< loci; locus++)
2016 {
2017 reference = data->yy[0][0][locus][0];
2018 for(pop=0;pop < numpop; pop++)
2019 {
2020 for(ind=0; ind < data->numind[pop][locus];ind++)
2021 {
2022 indseq=data->yy[pop][ind][locus][0];
2023 if(strcmp(indseq,reference)!=0)
2024 {
2025 for(sites=0; sites < data->seq[0]->sites[locus]; sites++)
2026 {
2027 if(v[locus][sites] == 0)
2028 {
2029 s = (long)(indseq[sites] != reference[sites]);
2030 v[locus][sites] += s;
2031 }
2032 }
2033 }
2034 }
2035 }
2036 }
2037 //n = 0;
2038 //mean = 0.;
2039 if(options->mu_rates == NULL)
2040 {
2041 options->mu_rates = (MYREAL *) mycalloc( loci, sizeof(MYREAL));
2042 // printf("%i> opt murate size %li\n",myID,loci * sizeof (MYREAL));
2043 }
2044 n = 0;
2045 mean = 0. ;
2046 options->muloci = loci;
2047 for (locus=0; locus < loci; locus++)
2048 {
2049 segreg = 0;
2050 for(sites=0; sites < data->seq[0]->sites[locus]; sites++)
2051 {
2052 segreg += (long) (v[locus][sites] > 0);
2053 }
2054 options->segregs[locus] = segreg;
2055 if(segreg==0)
2056 {
2057 if (options->onlyvariable)
2058 {
2059 data->skiploci[locus] = TRUE;
2060 data->locusweight[locus] = 0.0;
2061 }
2062 else
2063 {
2064 if (options->has_variableandone)
2065 {
2066 if (options->firstinvariant == -1)
2067 {
2068 options->firstinvariant=locus;
2069 data->locusweight[locus] = 1.0;
2070 }
2071 else
2072 {
2073 data->skiploci[locus] = TRUE;
2074 data->locusweight[locus] = 0.0;
2075 data->locusweight[options->firstinvariant] +=1;
2076 }
2077 }
2078 }
2079 }
2080 // originally I used just the plain watterson estimates but
2081 // this leads to strange results with highly unequal sequences
2082 // now the watterson's theta is adjusted per site
2083 // and that is used to generate the relative rate from the data
2084 // PB April 3 2011
2085 options->wattersons[locus] = watterson(segreg,numind[locus]);
2086 options->mu_rates[locus] = (options->wattersons[locus] + 0.0000001)/data->seq[0]->sites[locus];
2087 if (data->seq[0]->sites[locus]>0)
2088 options->wattersons[locus] /= data->seq[0]->sites[locus];
2089 n = n + 1;
2090 delta = options->mu_rates[locus] - mean;
2091 mean += delta/n;
2092 }
2093 if(mean > 0.0)
2094 {
2095 for (locus=0; locus < loci; locus++)
2096 {
2097 options->mu_rates[locus] /= mean;
2098 }
2099 }
2100 else
2101 {
2102 options->murates = FALSE;
2103 options->murates_fromdata = FALSE;
2104 warning("Relative mutation rates estimation from data was requested but attempt failed --> reset to all-equal rates");
2105 }
2106 myfree(v[0]);
2107 myfree(v);
2108 myfree(numind);
2109 }
2110
find_rates_fromdata_alleles(data_fmt * data,option_fmt * options,world_fmt * world,MYREAL mean)2111 void find_rates_fromdata_alleles(data_fmt * data, option_fmt * options, world_fmt *world, MYREAL mean)
2112 {
2113 long locus;
2114 options->segregs = (long *) mycalloc(data->loci, sizeof(long));
2115 for (locus=0;locus < data->loci; locus++)
2116 {
2117 options->segregs[locus] = (long) (options->mu_rates[locus] * mean + 0.5);
2118 }
2119 }
2120
2121
free_seq(seqmodel_fmt ** seq,long seqnum)2122 void free_seq(seqmodel_fmt **seq, long seqnum)
2123 {
2124 long i;
2125 for(i=0;i<seqnum;i++)
2126 {
2127 if(seq[i]->links!=NULL)
2128 {
2129 myfree(seq[i]->links);
2130 }
2131 myfree(seq[i]->sites);
2132 }
2133 myfree(seq[0]);
2134 myfree(seq);
2135 }
2136