1 /*------------------------------------------------------
2  Maximum likelihood estimation
3  of migration rate  and effectice population size
4  using a Metropolis-Hastings Monte Carlo algorithm
5  -------------------------------------------------------
6  S E Q U E N C E S   R O U T I N E S
7 
8 
9  Peter Beerli 1996, Seattle
10  beerli@fsu.edu
11 
12  Copyright 2002 Peter Beerli and Joseph Felsenstein
13 
14  Permission is hereby granted, free of charge, to any person obtaining
15  a copy of this software and associated documentation files (the
16  "Software"), to deal in the Software without restriction, including
17  without limitation the rights to use, copy, modify, merge, publish,
18  distribute, sublicense, and/or sell copies of the Software, and to
19  permit persons to whom the Software is furnished to do so, subject
20  to the following conditions:
21 
22  The above copyright notice and this permission notice shall be
23  included in all copies or substantial portions of the Software.
24 
25  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
28  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
29  ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
30  CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
31  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 
33 
34  $Id: sequence.c 2171 2013-09-22 22:05:02Z beerli $
35 
36 -------------------------------------------------------*/
37 /* \file sequence.c
38 
39 */
40 #include "migration.h"
41 #include "sighandler.h"
42 #include "data.h"
43 #include "tools.h"
44 #include "migrate_mpi.h"
45 #include "watterson.h"
46 #include "tree.h"
47 
48 #ifdef DMALLOC_FUNC_CHECK
49 #include <dmalloc.h>
50 #endif
51 /* prototypes ------------------------------------------- */
52 void make_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
53                      long locus);
54 void init_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
55                      long locus);
56 void init_sequences2 (world_fmt * world, seqmodel_fmt * seq, long locus);
57 void initratio (option_fmt * options);
58 void initfreqs (MYREAL *freqa, MYREAL *freqc, MYREAL *freqg, MYREAL *freqt);
59 void initcatn (long *categs);
60 boolean initcategs (long categs, MYREAL *rate, MYREAL *probcat);
61 void initprobcat (long categs, MYREAL *probsum, MYREAL *probcat);
62 void init_tbl (world_fmt * world, long locus);
63 void print_weights (FILE * outfile, world_fmt * world, option_fmt * options,
64                     long locus);
65 void print_tbl (FILE * outfile, world_fmt * world, option_fmt * options,
66                 long locus);
67 MYREAL treelike_seq (world_fmt * world, long locus);
68 MYREAL treelike_snp (world_fmt * world, long locus);
69 /*private functions */
70 void getbasefreqs (option_fmt * options, seqmodel_fmt * seq, long locus);
71 void empiricalfreqs (world_fmt * world, option_fmt * options,
72                      seqmodel_fmt * seq, long locus);
73 void makeweights (world_fmt * world, data_fmt * data, option_fmt *options, long locus);
74 void makevalues_seq (world_fmt * world, option_fmt * options, data_fmt * data,
75                      long locus);
76 void make_invarsites (world_fmt * world, data_fmt * data, long locus);
77 void make_invarsites_unlinked (world_fmt * world, data_fmt * data,
78                                long locus);
79 void sitecombine2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites,
80                    long locus);
81 void sitesort2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus);
82 void sitescrunch2 (world_fmt * world, long sites, long i, long j, long locus);
83 void inputoptions (world_fmt *earth, world_fmt * world, option_fmt * options, data_fmt * data,
84                    long locus);
85 void inputweights (world_fmt * world, data_fmt * data, long chars);
86 void inputcategs (long a, long b, world_fmt * world, option_fmt * options,
87                   data_fmt * data, world_fmt *earth);
88 void initlambda (option_fmt * options);
89 void printweights (FILE * outfile, world_fmt * world, option_fmt * options,
90                    short inc, long chars, short *weight, char *letters);
91 void print_seqfreqs (FILE * outfile, world_fmt * world, option_fmt * options);
92 
93 void snp_invariants (contribarr invariants, world_fmt *world, long locus, phenotype x1, MYREAL *scaler);
94 
95 void makevalues_snp (world_fmt * world, option_fmt * options, data_fmt * data,
96                      long locus);
97 
98 void init_sequences_aliases (world_fmt * earth, world_fmt * world, option_fmt * options,
99                              data_fmt * data, long locus);
100 
101 
102 void check_basefreq (option_fmt * options);
103 
104 extern void swap (void *, void *);
105 
106 
107 
108 /* allocation things */
109 void
init_sequences(world_fmt * world,option_fmt * options,data_fmt * data,long locus)110 init_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
111                 long locus)
112 {
113 
114     long sites = world->data->seq[0]->sites[locus];
115     if (options->datatype == 'u')
116         sites *= 5;
117     if (options->datatype == 'n')
118         sites += 4;
119     if (!world->data->seq[0]->done || world->data->seq[0]->alias == NULL)
120     {
121         world->data->seq[0]->done = TRUE;
122         world->data->seq[0]->alias = (long *) mycalloc (1, sizeof (long) * sites);
123         world->data->seq[0]->ally = (long *) mycalloc (1, sizeof (long) * sites);
124         world->data->seq[0]->savealiasweight = (long *) mycalloc (sites, sizeof (long));
125         world->data->seq[0]->aliasweight = (long *) mycalloc (1, sizeof (long) * sites);
126         world->data->seq[0]->location = (long *) mycalloc (1, sizeof (long) * sites);
127         world->data->seq[0]->category = (long *) mycalloc (1, sizeof (long) * sites);
128         world->data->seq[0]->weight = (short *) mycalloc (1, sizeof (short) * sites);
129     }
130     else
131     {
132         world->data->seq[0]->alias =
133             (long *) myrealloc (world->data->seq[0]->alias, sizeof (long) * sites);
134         world->data->seq[0]->ally =
135             (long *) myrealloc (world->data->seq[0]->ally, sizeof (long) * sites);
136         world->data->seq[0]->savealiasweight =
137             (long *) myrealloc (world->data->seq[0]->savealiasweight,
138                               sizeof (long) * sites);
139         world->data->seq[0]->aliasweight =
140             (long *) myrealloc (world->data->seq[0]->aliasweight,
141                               sizeof (long) * sites);
142         world->data->seq[0]->location =
143             (long *) myrealloc (world->data->seq[0]->location, sizeof (long) * sites);
144         world->data->seq[0]->category =
145             (long *) myrealloc (world->data->seq[0]->category, sizeof (long) * sites);
146         world->data->seq[0]->weight =
147             (short *) myrealloc (world->data->seq[0]->weight, sizeof (short) * sites);
148     }
149 }
150 
151 void
init_sequences_aliases(world_fmt * earth,world_fmt * world,option_fmt * options,data_fmt * data,long locus)152 init_sequences_aliases (world_fmt *earth, world_fmt * world, option_fmt * options,
153                         data_fmt * data, long locus)
154 {
155   inputoptions (earth, world, options, data, locus);
156     if (!options->freqsfrom)
157         getbasefreqs (options, world->data->seq[0], locus);
158     makeweights (world, data, options, locus);
159 }
160 
161 
162 /* menu material ----------------------------------------- */
163 void
initratio(option_fmt * options)164 initratio (option_fmt * options)
165 {
166     long z = 0;
167     char *tmp;
168     char input[LINESIZE];
169     printf
170     ("Transition/transversion ratio?\nEnter a value for each locus, spaced by blanks or commas\n[Minium value > 0.5]");
171     FGETS (input, LINESIZE, stdin);
172     tmp = strtok (input, " ,\n");
173     while (tmp != NULL)
174     {
175         options->ttratio[z++] = atof (tmp);
176         tmp = strtok (NULL, " ,;\n");
177         options->ttratio =
178             (MYREAL *) myrealloc (options->ttratio, sizeof (MYREAL) * (z + 1));
179         options->ttratio[z] = 0.0;
180     }
181 }
182 
183 void
initfreqs(MYREAL * freqa,MYREAL * freqc,MYREAL * freqg,MYREAL * freqt)184 initfreqs (MYREAL *freqa, MYREAL *freqc, MYREAL *freqg, MYREAL *freqt)
185 {
186     char input[LINESIZE];
187     int scanned;
188     MYREAL summ = 0;
189 
190     printf
191     ("Base frequencies for A, C, G, T/U\n (use blanks to separate, if all are equal use a sign [=])?\n");
192     for (;;)
193     {
194         FGETS (input, LINESIZE, stdin);
195         if (input[0] == '=')
196         {
197             scanned = 4;
198             *freqa = *freqc = *freqg = *freqt = 0.25;
199         }
200         else
201 #ifdef USE_MYREAL_FLOAT
202             scanned = sscanf (input, "%f%f%f%f%*[^\n]", freqa, freqc, freqg, freqt);
203 #else
204         scanned = sscanf (input, "%lf%lf%lf%lf%*[^\n]", freqa, freqc, freqg, freqt);
205 #endif
206         if (scanned == 4)
207             break;
208         else
209             printf ("Please enter exactly 4 values.\n");
210     };
211     // adjust frequencies to a total of 1
212     summ = *freqa + *freqc + *freqg + *freqt;
213     if (summ != 1.0)
214         printf ("Frequency values were adjusted to add up to 1.0.\n");
215     *freqa /= summ;
216     *freqc /= summ;
217     *freqg /= summ;
218     *freqt /= summ;
219     printf ("Nucleotide frequencies: A=%f, C=%f, G=%f, T=%f\n", *freqa, *freqc,
220             *freqg, *freqt);
221 }
222 
223 
224 void
initcatn(long * categs)225 initcatn (long *categs)
226 {    /* initialize category number */
227   int retval;
228     do
229     {
230         printf ("Number of categories (1-%d)?\n", MAXCATEGS);
231         retval = scanf ("%ld%*[^\n]", categs);
232         getchar ();
233     }
234     while (*categs > MAXCATEGS || *categs < 1);
235 }
236 
237 
238 boolean
initcategs(long categs,MYREAL * rate,MYREAL * probcat)239 initcategs (long categs, MYREAL *rate, MYREAL *probcat)
240 {    /* initialize rate categories */
241     long i;
242     char input[LINESIZE];
243     char rest[LINESIZE];
244     int scanned;
245     boolean done;
246 
247     for (;;)
248     {
249         printf
250 	  ("Either enter the Shape parameter alpha for Gamma deviated rates\n*OR* enter the rates for each category (use a space to separate)\n===>");fflush(stdout);
251         FGETS (input, LINESIZE, stdin);
252         done = TRUE;
253         if (count_words (input) == 1)
254         {
255             gamma_rates (rate, probcat, categs, input);
256             return TRUE;
257         }
258         for (i = 0; i < categs; i++)
259         {
260 #ifdef USE_MYREAL_FLOAT
261             scanned = sscanf (input, "%f %[^\n]", &rate[i], rest);
262 #else
263             scanned = sscanf (input, "%lf %[^\n]", &rate[i], rest);
264 #endif
265             if ((scanned < 2 && i < (categs - 1))
266                     || (scanned < 1 && i == (categs - 1)))
267             {
268                 printf ("Please enter exactly %ld values.\n", categs);
269                 done = FALSE;
270                 break;
271             }
272             strcpy (input, rest);
273         }
274         if (done)
275             break;
276     }
277     return FALSE;
278 }
279 
280 void
initprobcat(long categs,MYREAL * probsum,MYREAL * probcat)281 initprobcat (long categs, MYREAL *probsum, MYREAL *probcat)
282 {
283     long i;
284     boolean done;
285     char input[LINESIZE];
286     char rest[LINESIZE];
287     int scanned;
288 
289     do
290     {
291         printf ("Probability for each category?");
292         printf (" (use a space to separate)\n");
293         FGETS (input, LINESIZE, stdin);
294         done = TRUE;
295         for (i = 0; i < categs; i++)
296         {
297 #ifdef USE_MYREAL_FLOAT
298             scanned = sscanf (input, "%f %[^\n]", &probcat[i], rest);
299 #else
300             scanned = sscanf (input, "%lf %[^\n]", &probcat[i], rest);
301 #endif
302             if ((scanned < 2 && i < (categs - 1))
303                     || (scanned < 1 && i == (categs - 1)))
304             {
305                 done = FALSE;
306                 printf ("Please enter exactly %ld values.\n", categs);
307                 break;
308             }
309             strcpy (input, rest);
310         }
311         if (!done)
312             continue;
313         *probsum = 0.0;
314         for (i = 0; i < categs; i++)
315             *probsum += probcat[i];
316 
317         if (fabs (1.0 - (*probsum)) > 0.001)
318         {
319             for (i = 0; i < categs; i++)
320                 probcat[i] /= *probsum;
321             printf ("Probabilities were adjusted to add up to one\n");
322             for (i = 0; i < categs; i++)
323                 printf ("  %li> %f\n", i + 1, probcat[i]);
324             printf ("\n\n");
325         }
326     }
327     while (!done);
328 }
329 
330 ///
331 /// constrains the arbitrary site rate variation to an average of 1.0
constrain_rates(long categs,MYREAL * rate,MYREAL * probcat)332 void constrain_rates(long categs, MYREAL *rate, MYREAL *probcat)
333 {
334   char input[LINESIZE];
335   long i;
336   MYREAL mean;
337   printf("By default the rates will be constrained to an have an average rate of 1.0\nPress return if that is OK (preferred option) or enter the word RAW\n===>"); fflush(stdout);
338   FGETS (input, LINESIZE, stdin);
339   if(input[0] == '\0')
340     {
341       mean = 0.0;
342       for (i = 0; i < categs; i++)
343 	{
344 	  mean += probcat[i] * rate[i];
345 	}
346       for (i = 0; i < categs; i++)
347 	{
348 	  rate[i] /= mean;
349 	}
350     }
351 }
352 
353 /*data read material ===================================== */
354 ///
355 /// read sequence data and linked SNP data
356 void
make_sequences(world_fmt * world,option_fmt * options,data_fmt * data,long locus)357 make_sequences (world_fmt * world, option_fmt * options, data_fmt * data,
358                 long locus)
359 {
360     if (world->sumtips<=1)
361       {
362 	data->skiploci[locus] = TRUE;
363         world->data->skiploci[locus] = TRUE;
364         world->skipped += 1;
365       }
366 
367   makevalues_seq (world, options, data, locus);
368   if (options->freqsfrom)
369     {
370       empiricalfreqs (world, options, world->data->seq[0], locus);
371       getbasefreqs (options, world->data->seq[0], locus);
372     }
373 #ifdef BEAGLE
374   if(world->mutationmodels[world->sublocistarts[locus]].basefreqs==NULL)
375     world->mutationmodels[world->sublocistarts[locus]].basefreqs = (double*) calloc(4,sizeof(double));
376   world->mutationmodels[world->sublocistarts[locus]].basefreqs[0] =  world->data->seq[0]->basefrequencies[0];
377   world->mutationmodels[world->sublocistarts[locus]].basefreqs[1] =  world->data->seq[0]->basefrequencies[1];
378   world->mutationmodels[world->sublocistarts[locus]].basefreqs[2] =  world->data->seq[0]->basefrequencies[2];
379   world->mutationmodels[world->sublocistarts[locus]].basefreqs[3] =  world->data->seq[0]->basefrequencies[3];
380   world->mutationmodels[world->sublocistarts[locus]].numstates=4;
381 #endif
382 
383 }
384 
385 ///
386 /// read unlinked SNP data (each locus must be only 1 site)
387 void
make_snp(world_fmt * world,option_fmt * options,data_fmt * data,long locus)388 make_snp (world_fmt * world, option_fmt * options, data_fmt * data,
389           long locus)
390 {
391   makevalues_snp (world, options, data, locus);
392   /*if (options->freqsfrom)
393     {
394     empiricalfreqs (world, world->data->seq, locus); */
395   options->freqsfrom = FALSE;
396   getbasefreqs (options, world->data->seq[0], locus);
397   /* }
398    */
399 }
400 
401 /* private functions================================== */
402 void
getbasefreqs(option_fmt * options,seqmodel_fmt * seq,long locus)403 getbasefreqs (option_fmt * options, seqmodel_fmt * seq, long locus)
404 {
405   long l;
406 
407   register MYREAL freqa, freqc, freqg, freqt, freqr, freqy;
408   register MYREAL /*freqar, freqcy,*/ freqgr, freqty;
409 
410   MYREAL aa, bb;
411   if (locus == 0)
412     seq->ttratio = options->ttratio[0];
413   else
414     {
415       for (l = 1; l <= locus; l++)
416         {
417 	  if (options->ttratio[l] == 0.0)
418             {
419 	      seq->ttratio = options->ttratio[l - 1];
420 	      break;
421             }
422 	  seq->ttratio = options->ttratio[l];
423         }
424       if (l > locus)
425 	seq->ttratio = options->ttratio[locus];
426     }
427   check_basefreq (options);
428 
429   seq->basefrequencies[NUC_A] = options->freqa;
430   seq->basefrequencies[NUC_C] = options->freqc;
431   seq->basefrequencies[NUC_G] = options->freqg;
432   seq->basefrequencies[NUC_T] = options->freqt;
433   freqa = seq->basefrequencies[NUC_A];
434   freqc = seq->basefrequencies[NUC_C];
435   freqg = seq->basefrequencies[NUC_G];
436   freqt = seq->basefrequencies[NUC_T];
437   seq->basefrequencies[NUC_R] = freqa + freqg;
438   seq->basefrequencies[NUC_Y] = freqc + freqt;
439   freqr = seq->basefrequencies[NUC_R];
440   freqy = seq->basefrequencies[NUC_Y];
441   seq->basefrequencies[NUC_AR]= freqa / freqr;
442   seq->basefrequencies[NUC_CY]= freqc / freqy;
443   seq->basefrequencies[NUC_GR]= freqg / freqr;
444   seq->basefrequencies[NUC_TY]= freqt / freqy;
445   //freqar = seq->basefrequencies[NUC_AR];
446   //freqcy = seq->basefrequencies[NUC_CY];
447   freqgr = seq->basefrequencies[NUC_GR];
448   freqty = seq->basefrequencies[NUC_TY];
449 
450   aa =
451     seq->ttratio * (freqr) * (freqy) - freqa * freqg -
452     freqc * freqt;
453   bb = freqa * (freqgr) + freqc * (freqty);
454   seq->xi = aa / (aa + bb);
455   seq->xv = 1.0 - seq->xi;
456   if (seq->xi <= 0.0)
457     {
458       warning ("This transition/transversion ratio (%f)\n",seq->ttratio);
459       warning ("is impossible with these base frequencies (%f, %f, %f, %f)!\n",freqa,freqc,freqg,freqt);
460       seq->xi = 0.00001; // do not set this to zero because of the 1/(fracchange=xi*(...))
461       seq->xv = 0.99999;
462       seq->ttratio =
463 	(freqa * freqg +
464 	 freqc * freqt) / ((freqr) * (freqy));
465 
466       warning (" Transition/transversion parameter reset\n");
467       warning ("  so transition/transversion ratio is %10.6f\n\n",
468 	       (seq->ttratio));
469     }
470   // use 1/frac as precomputation speed up
471   seq->fracchange = 1. / (
472 			  (seq->xi) * (2. * freqa * (freqgr) +
473 				       2. * freqc * (freqty)) + (seq->xv) * (1.0 -
474 										      freqa *
475 										      freqa -
476 										      freqc *
477 										      freqc -
478 										      freqg *
479 										      freqg -
480 										      freqt *
481 										      freqt));
482 }
483 
484 /*===================================================*/
485 
486 void
makeweights(world_fmt * world,data_fmt * data,option_fmt * options,long locus)487 makeweights (world_fmt * world, data_fmt * data, option_fmt *options, long locus)
488 {
489     /* make up weights vector to avoid duplicate computations */
490     long i;
491     seqmodel_fmt *seq = world->data->seq[0];
492     world->data->seq[0]->endsite = 1;
493     for (i = 0; i < seq->sites[locus]; i++)
494     {
495         seq->alias[i] = i + 1;
496         seq->ally[i] = 0;
497         seq->aliasweight[i] = seq->weight[i];
498         seq->location[i] = 0;
499     }
500     sitesort2 (world, data, options, seq->sites[locus], locus);
501     sitecombine2 (world, data, options, seq->sites[locus], locus);
502     sitescrunch2 (world, seq->sites[locus], 1, 2, locus);
503     for (i = 1; i <= seq->sites[locus]; i++)
504     {
505         if (seq->aliasweight[i - 1] > 0)
506             seq->endsite = i;
507     }
508     for (i = 1; i <= seq->endsite; i++)
509     {
510         seq->location[seq->alias[i - 1] - 1] = i;
511         seq->ally[seq->alias[i - 1] - 1] =
512             seq->alias[i - 1];
513     }
514     init_sequences2 (world, seq, locus);
515     memcpy(seq->savealiasweight, seq->aliasweight, sizeof(long) * seq->endsite);
516 }    /* makeweights */
517 
518 void
init_sequences2(world_fmt * world,seqmodel_fmt * seq,long locus)519 init_sequences2 (world_fmt * world, seqmodel_fmt * seq, long locus)
520 {
521     if (world->contribution == NULL)
522       {
523         world->contribution =
524             (contribarr *) mymalloc ((4 + seq->endsite) * sizeof (contribarr));
525 	//	printf("%i> temp=%f size=%li: world->contribution allocated\n",myID, world->heat,(4 + seq->endsite) * sizeof (contribarr));
526       }
527     else
528       {
529         world->contribution =
530 	  (contribarr *) myrealloc (world->contribution,
531                                     (4 + seq->endsite) * sizeof (contribarr));
532 	//	printf("%i> temp=%f size=%li: world->contribution REallocated\n",myID, world->heat,(4 + seq->endsite) * sizeof (contribarr));
533       }
534 }
535 
536 
set_nucleotide(MYREAL * treedata,const char nucleotide,const MYREAL seqerr)537 void set_nucleotide(MYREAL *treedata, const char nucleotide, const MYREAL seqerr)
538 {
539   long b;
540   const MYREAL seqerr3 = seqerr / 3.;
541   const MYREAL seqerr23 = 2. * seqerr / 3.;
542   const MYREAL oneseqerr = 1.0 - seqerr;
543   const MYREAL oneseqerr23 = 1.0 - seqerr23;
544   const MYREAL oneseqerr3 = 1.0 - seqerr3;
545 
546   for (b = 0; b < 4; b++)
547     treedata[b] = 0.0 + seqerr3;
548   switch (nucleotide)
549     {
550     case 'A':
551       treedata[NUC_A] = oneseqerr;
552       break;
553 
554     case 'C':
555       treedata[NUC_C] = oneseqerr;
556       break;
557 
558     case 'G':
559       treedata[NUC_G] = oneseqerr;
560       break;
561 
562     case 'T':
563       treedata[NUC_T] = oneseqerr;
564       break;
565 
566     case 'U':
567       treedata[NUC_T] = oneseqerr;
568       break;
569 
570     case 'M':
571       treedata[NUC_A] = oneseqerr;
572       treedata[NUC_C] = oneseqerr;
573       break;
574 
575     case 'R':
576       treedata[NUC_A] = oneseqerr23;
577       treedata[NUC_G] = oneseqerr23;
578       break;
579 
580     case 'W':
581       treedata[NUC_A] = oneseqerr23;
582       treedata[NUC_T] = oneseqerr23;
583       break;
584 
585     case 'S':
586       treedata[NUC_C] = oneseqerr23;
587       treedata[NUC_G] = oneseqerr23;
588       break;
589 
590     case 'Y':
591       treedata[NUC_C] = oneseqerr23;
592       treedata[NUC_T] = oneseqerr23;
593       break;
594 
595     case 'K':
596       treedata[NUC_G] = oneseqerr23;
597       treedata[NUC_T] = oneseqerr23;
598       break;
599 
600     case 'B':
601       treedata[NUC_A] = seqerr;
602       treedata[NUC_C] = oneseqerr3;
603       treedata[NUC_G] = oneseqerr3;
604       treedata[NUC_T] = oneseqerr3;
605       break;
606 
607     case 'D':
608       treedata[NUC_A] = oneseqerr3;
609       treedata[NUC_C] = seqerr;
610       treedata[NUC_G] = oneseqerr3;
611       treedata[NUC_T] = oneseqerr3;
612       break;
613 
614     case 'H':
615       treedata[NUC_A] = oneseqerr3;
616       treedata[NUC_C] = oneseqerr3;
617       treedata[NUC_G] = seqerr;
618       treedata[NUC_T] = oneseqerr3;
619       break;
620 
621     case 'V':
622       treedata[NUC_A] = oneseqerr3;
623       treedata[NUC_C] = oneseqerr3;
624       treedata[NUC_G] = oneseqerr3;
625       treedata[NUC_T] = seqerr;
626       break;
627 
628     case 'N':
629       for (b = 0; b < 4; b++)
630 	treedata[b] = 1.0;
631       break;
632 
633     case 'X':
634       for (b = 0; b < 4; b++)
635 	treedata[b] = 1.0;
636       break;
637 
638     case '?':
639       for (b = 0; b < 4; b++)
640 	treedata[b] = 1.0;
641       break;
642 
643     case 'O':
644       for (b = 0; b < 4; b++)
645 	treedata[b] = 1.0;
646       break;
647 #ifdef GAP
648     case '-':
649       treedata[4] = 1.0;
650       break;
651 #else
652     case '-':
653       for (b = 0; b < 4; b++)
654 	treedata[b] = 1.0;
655       break;
656 #endif
657     }
658 }
659 
660 
661 void
makevalues_seq(world_fmt * world,option_fmt * options,data_fmt * data,long locus)662 makevalues_seq (world_fmt * world, option_fmt * options, data_fmt * data,
663                 long locus)
664 {
665   seqmodel_fmt *seq = world->data->seq[0];
666   long ii, ind, j, k, l, pop, top;
667   long z=0;
668   node **treenode = world->nodep;
669   MYREAL seqerr = options->seqerror;
670   long zpop;
671   // we need to recalculate the sumtips
672   world->sumtips = 0;
673 
674   for (pop = 0; pop < data->numpop; pop++)
675     {
676       zpop = 0;
677       top = max_shuffled_individuals(options, data, pop, locus);
678       for (ii = 0; ii < top; ii++)
679 	{
680 	  ind = data->shuffled[pop][locus][ii];
681 	  zpop++;
682 	  //	  if(options->include_unknowns)
683 	  // {
684 	      if (!options->usertree)
685 		{
686 		  allocate_tip (world, options, &(treenode[z]), pop, locus, z, ind,
687 				data->indnames[pop][ind]);
688 		  //treenode[z]->tyme = find_tipdate(treenode[z]->nayme, pop,world);
689 		  world->data->sampledates[pop][locus][ind].id = treenode[z]->id;
690 		  z++;
691 		}
692 	      for (k = 0; k < seq->endsite; k++)
693 		{
694 		  j = seq->alias[k] - 1;
695 		  for (l = 0; l < world->options->rcategs; l++)
696 		    {
697 		      set_nucleotide(treenode[z-1]->x.s[k][l],data->yy[pop][ind][locus][0][j], seqerr);
698 		    }
699 		}
700         }
701       data->numalleles[pop][locus] = zpop;
702       world->sumtips += zpop;
703     }
704 }
705 
706 
707 
708 void
makevalues_snp(world_fmt * world,option_fmt * options,data_fmt * data,long locus)709 makevalues_snp (world_fmt * world, option_fmt * options, data_fmt * data,
710                 long locus)
711 {
712   long i, ii, j, k, l, pop;
713   long b, f;
714   long ind;
715   seqmodel_fmt *seq = world->data->seq[0];
716   node **treenode = world->nodep;
717   f = seq->addon;
718   for (k = 0; k < seq->endsite * (f + 1);
719        k += (1 + seq->addon))
720     {
721       j = seq->alias[k / (f + 1)] - 1;
722       i = -1;
723       for (pop = 0; pop < data->numpop; pop++)
724 	{
725 	  for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
726 	    {
727 	      ind = data->shuffled[pop][locus][ii];
728 	      i++;
729 	      if (k==0 && !options->usertree)
730 		{
731 		  strcpy (treenode[i]->nayme, data->indnames[pop][ind][locus]);
732 		  treenode[i]->tyme = find_tipdate(treenode[i]->nayme, pop,world);
733 		}
734 	      for (l = 0; l < world->options->rcategs; l++)
735 		{
736 		  if (pop == 0)
737 		    {
738 		      //                        treenode[i]->x.s[k][l] = vec_float_one();
739 		      // reset all values for the snp columns
740 		      //  treenode[i]->x.s[k][l] = vec_zero();
741 		      for (b = 1; b <= 4; b++)
742 			{
743 			  memset (treenode[i]->x.s[k + b][l], 0,
744 				  sizeof (MYREAL) * 4);
745 			  treenode[i]->x.s[k + b][l][b - 1] = 1.0;
746 			}
747 		      continue;
748 		    }
749 		  else
750 		      {
751                         for (b = 0; b <= 4; b++)
752 			  memset (treenode[i]->x.s[k + b][l], 0,
753 				  sizeof (MYREAL) * 4);
754 		      }
755 
756                     switch (data->yy[pop][ind][locus][0][j])
757 		      {
758 		      case 'A':
759                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
760                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
761                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
762                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
763                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
764                         break;
765 
766                     case 'C':
767                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
768                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
769                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
770                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
771                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
772                         break;
773 
774                     case 'G':
775                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
776                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
777                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
778                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
779                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
780                         break;
781                     case 'T':
782                     case 'U':
783                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
784                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
785                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
786                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
787                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
788                         break;
789 
790                     case 'M':
791                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
792                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
793                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
794                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
795                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
796                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
797                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
798                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
799                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
800                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
801                         break;
802 
803                     case 'R':
804                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
805                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
806                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
807                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
808                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
809                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
810                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
811                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
812                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
813                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
814                         break;
815 
816                     case 'W':
817                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
818                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
819                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
820                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
821                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
822                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
823                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
824                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
825                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
826                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
827                         break;
828 
829                     case 'S':
830                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
831                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
832                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
833                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
834                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
835                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
836                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
837                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
838                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
839                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
840                         break;
841 
842                     case 'Y':
843                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
844                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
845                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
846                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
847                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
848                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
849                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
850                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
851                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
852                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
853                         break;
854 
855                     case 'K':
856                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
857                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
858                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
859                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
860                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
861                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
862                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
863                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
864                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
865                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
866                         break;
867 
868                     case 'B':
869                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
870                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
871                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
872                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
873                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
874                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
875                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
876                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
877                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
878                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
879                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
880                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
881                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
882                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
883                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
884                         break;
885 
886                     case 'D':
887                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
888                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
889                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
890                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
891                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
892                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
893                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
894                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
895                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
896                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
897                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
898                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
899                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
900                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
901                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
902                         break;
903 
904                     case 'H':
905                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
906                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
907                         treenode[i]->x.s[k][l][NUC_T] = 1.0;
908                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
909                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
910                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
911                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
912                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
913                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
914                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
915                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
916                         treenode[i]->x.s[k + 1][l][NUC_T] = 1.0;
917                         treenode[i]->x.s[k + 2][l][NUC_T] = 1.0;
918                         treenode[i]->x.s[k + 3][l][NUC_T] = 1.0;
919                         treenode[i]->x.s[k + 4][l][NUC_T] = 1.0;
920                         break;
921 
922                     case 'V':
923                         treenode[i]->x.s[k][l][NUC_A] = 1.0;
924                         treenode[i]->x.s[k][l][NUC_C] = 1.0;
925                         treenode[i]->x.s[k][l][NUC_G] = 1.0;
926                         treenode[i]->x.s[k + 1][l][NUC_A] = 1.0;
927                         treenode[i]->x.s[k + 2][l][NUC_A] = 1.0;
928                         treenode[i]->x.s[k + 3][l][NUC_A] = 1.0;
929                         treenode[i]->x.s[k + 4][l][NUC_A] = 1.0;
930                         treenode[i]->x.s[k + 1][l][NUC_C] = 1.0;
931                         treenode[i]->x.s[k + 2][l][NUC_C] = 1.0;
932                         treenode[i]->x.s[k + 3][l][NUC_C] = 1.0;
933                         treenode[i]->x.s[k + 4][l][NUC_C] = 1.0;
934                         treenode[i]->x.s[k + 1][l][NUC_G] = 1.0;
935                         treenode[i]->x.s[k + 2][l][NUC_G] = 1.0;
936                         treenode[i]->x.s[k + 3][l][NUC_G] = 1.0;
937                         treenode[i]->x.s[k + 4][l][NUC_G] = 1.0;
938                         break;
939 
940                     case 'N':
941                     case 'X':
942                     case '-':
943                     case 'O':
944                     case '?':
945                         for (b = 0; b < 4; b++)
946                         {
947                             treenode[i]->x.s[k][l][b] = 1.0;
948                             treenode[i]->x.s[k + 1][l][b] = 1.0;
949                             treenode[i]->x.s[k + 2][l][b] = 1.0;
950                             treenode[i]->x.s[k + 3][l][b] = 1.0;
951                             treenode[i]->x.s[k + 4][l][b] = 1.0;
952                         }
953                         break;
954                     default:
955                         break;
956 
957                     }
958                 }
959             }
960         }
961     }
962 }
963 
964 
965 void
make_invarsites(world_fmt * world,data_fmt * data,long locus)966 make_invarsites (world_fmt * world, data_fmt * data, long locus)
967 {
968     long i, k, z, ii, pop, l;
969     long endsite = world->data->seq[0]->endsite;
970     node **treenode = world->nodep;
971     i = -1;
972     for (pop = 0; pop < data->numpop; pop++)
973     {
974         for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
975         {
976             i++;
977             z = 0;
978             for (k = endsite; k < endsite + 4; k++)
979             {
980                 for (l = 0; l < world->options->rcategs; l++)
981                 {
982                     memset (treenode[i]->x.s[k][l], 0, sizeof (MYREAL) * 4);
983                     treenode[i]->x.s[k][l][z] = 1.0;
984                 }
985                 z++;
986             }
987         }
988     }
989 }
990 
991 void
make_invarsites_unlinked(world_fmt * world,data_fmt * data,long locus)992 make_invarsites_unlinked (world_fmt * world, data_fmt * data, long locus)
993 {
994     long i, k, z, ii, pop, l;
995     long endsite = world->data->seq[0]->endsite;
996     node **treenode = world->nodep;
997     i = -1;
998     for (pop = 0; pop < data->numpop; pop++)
999     {
1000         for (ii = 0; ii < data->numalleles[pop][locus]; ii++)
1001         {
1002             i++;
1003             z = 0;
1004             if (pop == PANEL)
1005             {
1006                 for (k = endsite * 5; k < endsite * 5 + 4; k++)
1007                 {
1008                     for (l = 0; l < world->options->rcategs; l++)
1009                     {
1010 #ifdef ALTIVEC
1011                         memset (treenode[i]->x.s[k][l].f, 0, sizeof (MYREAL) * 4);
1012                         treenode[i]->x.s[k][l].f[z] = 1.0;
1013 #else
1014                         memset (treenode[i]->x.s[k][l], 0, sizeof (MYREAL) * 4);
1015                         treenode[i]->x.s[k][l][z] = 1.0;
1016 #endif
1017                     }
1018                     z++;
1019                 }
1020             }
1021             else
1022             {
1023                 for (k = endsite * 5; k < endsite * 5 + 4; k++)
1024                 {
1025                     for (l = 0; l < world->options->rcategs; l++)
1026                     {
1027 #ifdef ALTIVEC
1028                         treenode[i]->x.s[k][l].f[0] = 1.0; //putting ? for data
1029                         treenode[i]->x.s[k][l].f[1] = 1.0;
1030                         treenode[i]->x.s[k][l].f[2] = 1.0;
1031                         treenode[i]->x.s[k][l].f[3] = 1.0;
1032 #else
1033                         treenode[i]->x.s[k][l][0] = 1.0; //putting ? for data
1034                         treenode[i]->x.s[k][l][1] = 1.0;
1035                         treenode[i]->x.s[k][l][2] = 1.0;
1036                         treenode[i]->x.s[k][l][3] = 1.0;
1037 #endif
1038                     }
1039                 }
1040             }
1041         }
1042     }
1043 }
1044 
1045 void
sitesort2(world_fmt * world,data_fmt * data,option_fmt * options,long sites,long locus)1046 sitesort2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus)
1047 {
1048   long gap, i, i1, j, jj, jg, k, kk, kkk, itemp, pop, z = 0;
1049   boolean flip, tied, samewt;
1050   seqmodel_fmt *seq;
1051   long *tempsum, *temppop;
1052   long numind;
1053   MYREAL a1,a2;
1054   tempsum = (long *) mycalloc (1, sizeof (long) * data->numpop);
1055   temppop = (long *) mycalloc (1, sizeof (long) * data->numpop);
1056   for (i = 0; i < data->numpop; i++)
1057     {
1058       if(options->randomsubset > 0)
1059 	{
1060 	  numind = (options->randomsubset < data->numind[i][locus] ? options->randomsubset : data->numind[i][locus]);
1061 	}
1062       else
1063 	{
1064 	  numind = data->numind[i][locus] ;
1065 	}
1066       //      if (numind > 0)
1067       //  {
1068 	  temppop[z] = i;
1069 	  if (z == 0)
1070 	    tempsum[z] = numind;
1071 	  else
1072 	    tempsum[z] = tempsum[z - 1] + numind;
1073           z++;
1074 	    //  }
1075     }
1076     seq = world->data->seq[0];
1077     gap = sites / 2;
1078     while (gap > 0)
1079     {
1080         for (i = gap + 1; i <= sites; i++)
1081         {
1082             j = i - gap;
1083             flip = TRUE;
1084             while (j > 0 && flip)
1085             {
1086                 jj = seq->alias[j - 1];
1087                 jg = seq->alias[j + gap - 1];
1088                 samewt = ((seq->weight[jj - 1] != 0)
1089                           && (seq->weight[jg - 1] != 0))
1090                          || ((seq->weight[jj - 1] == 0) && (seq->weight[jg - 1] == 0));
1091                 tied = samewt
1092                        && (seq->category[jj - 1] == seq->category[jg - 1]);
1093                 flip = ((!samewt) && (seq->weight[jj - 1] == 0))
1094                        || (samewt
1095                            && (seq->category[jj - 1] > seq->category[jg - 1]));
1096                 k = 0;
1097                 pop = 0;
1098                 kk = -1;
1099                 while (k < world->sumtips && tied)
1100                 {
1101                     if (k == tempsum[pop])
1102                     {
1103                         kk = 0;
1104                         pop++;
1105                     }
1106                     else
1107                     {
1108                         kk++;
1109                     }
1110 		    i1 = temppop[pop];
1111 		    //debug1
1112 		    if (data->numind[i1][locus]>0)
1113 		      {
1114 			kkk = data->shuffled[i1][locus][kk];
1115 			a1 = data->yy[i1][kkk][locus][0][jj - 1];
1116 			a2 = data->yy[i1][kkk][locus][0][jg - 1];
1117 			flip =
1118 			  (a1 > a2);
1119 			tied = (tied
1120 				&& a1 == a2);
1121 		      }
1122                     k++;
1123                 }
1124                 if (!flip)
1125                     break;
1126                 itemp = seq->alias[j - 1];
1127                 seq->alias[j - 1] = seq->alias[j + gap - 1];
1128                 seq->alias[j + gap - 1] = itemp;
1129                 itemp = seq->aliasweight[j - 1];
1130                 seq->aliasweight[j - 1] = seq->aliasweight[j + gap - 1];
1131                 seq->aliasweight[j + gap - 1] = itemp;
1132                 j -= gap;
1133             }
1134         }
1135         gap /= 2;
1136     }
1137     myfree(tempsum);
1138     myfree(temppop);
1139 }    /* sitesort2 */
1140 
1141 
1142 void
sitecombine2(world_fmt * world,data_fmt * data,option_fmt * options,long sites,long locus)1143 sitecombine2 (world_fmt * world, data_fmt * data, option_fmt *options, long sites, long locus)
1144 {
1145   long i, i1, j, k, kk, pop, z = 0;
1146     boolean tied, samewt;
1147     seqmodel_fmt *seq;
1148     long *tempsum, *temppop;
1149     long numind;
1150     long kkk;
1151     tempsum = (long *) mycalloc (1, sizeof (long) * data->numpop);
1152     temppop = (long *) mycalloc (1, sizeof (long) * data->numpop);
1153     if(options->randomsubset > 0)
1154       {
1155 	tempsum[0] = (options->randomsubset < data->numind[0][locus] ? options->randomsubset : data->numind[0][locus]);
1156       }
1157     else
1158       {
1159 	tempsum[0] = data->numind[0][locus];
1160       }
1161 
1162     for (i = 0; i < data->numpop; i++)
1163     {
1164       numind = ((options->randomsubset > 0) && (options->randomsubset < data->numind[i][locus])) ? options->randomsubset : data->numind[i][locus];
1165       //debug2
1166       //      if (numind > 0)
1167       //  {
1168 	  temppop[z] = i;
1169 	  if (z == 0)
1170 	    tempsum[z] = numind;
1171 	  else
1172                 tempsum[z] = tempsum[z - 1] + numind;
1173             z++;
1174 	    //  }
1175     }
1176 
1177     seq = world->data->seq[0];
1178     i = 1;
1179     while (i < sites)
1180     {
1181         j = i + 1;
1182         tied = TRUE;
1183         while (j <= sites && tied)
1184         {
1185             samewt = ((seq->aliasweight[i - 1] != 0)
1186                       && (seq->aliasweight[j - 1] != 0))
1187                      || ((seq->aliasweight[i - 1] == 0)
1188                          && (seq->aliasweight[j - 1] == 0));
1189             tied = samewt
1190                    && (seq->category[seq->alias[i - 1] - 1] ==
1191                        seq->category[seq->alias[j - 1] - 1]);
1192             k = 0;
1193             pop = 0;
1194             kk = -1;
1195             while (k < world->sumtips && tied)
1196             {
1197                 if (k == tempsum[pop])
1198                 {
1199                     kk = 0;
1200                     pop++;
1201                 }
1202                 else
1203                 {
1204                     kk++;
1205                 }
1206 		i1 = temppop[pop];
1207 		if (data->numind[i1][locus]>0)
1208 		  {
1209 		    kkk = data->shuffled[i1][locus][kk];
1210 		    tied = (tied
1211 			    && data->yy[i1][kkk][locus][0][seq->
1212 							   alias[i - 1] -
1213 							   1] ==
1214 			    data->yy[i1][kkk][locus][0][seq->alias[j - 1] -
1215 							1]);
1216 		  }
1217 		k++;
1218 	    }
1219 	    if (!tied)
1220 	      break;
1221 	    seq->aliasweight[i - 1] += seq->aliasweight[j - 1];
1222 	    seq->aliasweight[j - 1] = 0;
1223 	    seq->ally[seq->alias[j - 1] - 1] = seq->alias[i - 1];
1224 	    j++;
1225 	}
1226 	i = j;
1227     }
1228     myfree(temppop);
1229     myfree(tempsum);
1230 }    /* sitecombine2 */
1231 
1232 
1233 void
sitescrunch2(world_fmt * world,long sites,long i,long j,long locus)1234 sitescrunch2 (world_fmt * world, long sites, long i, long j, long locus)
1235 {
1236     /* move so positively weighted sites come first */
1237     /* used by dnainvar, dnaml, dnamlk, & restml */
1238     long itemp;
1239     boolean done, found;
1240     seqmodel_fmt *seq;
1241     seq = world->data->seq[0];
1242     done = FALSE;
1243     while (!done)
1244     {
1245         //found = FALSE;
1246         if (seq->aliasweight[i - 1] > 0)
1247             i++;
1248         else
1249         {
1250             if (j <= i)
1251                 j = i + 1;
1252             if (j <= sites)
1253             {
1254                 //found = FALSE;
1255                 do
1256                 {
1257                     found = (seq->aliasweight[j - 1] > 0);
1258                     j++;
1259                 }
1260                 while (!(found || j > sites));
1261                 if (found)
1262                 {
1263                     j--;
1264                     itemp = seq->alias[i - 1];
1265                     seq->alias[i - 1] = seq->alias[j - 1];
1266                     seq->alias[j - 1] = itemp;
1267                     itemp = seq->aliasweight[i - 1];
1268                     seq->aliasweight[i - 1] = seq->aliasweight[j - 1];
1269                     seq->aliasweight[j - 1] = itemp;
1270                 }
1271                 else
1272                     done = TRUE;
1273             }
1274             else
1275                 done = TRUE;
1276         }
1277         done = (done || i >= sites);
1278     }
1279 }    /* sitescrunch2 */
1280 
1281 void
inputoptions(world_fmt * earth,world_fmt * world,option_fmt * options,data_fmt * data,long locus)1282 inputoptions (world_fmt *earth, world_fmt * world, option_fmt * options, data_fmt * data,
1283               long locus)
1284 {
1285     long i;
1286     long sites = world->data->seq[0]->sites[locus];
1287     for (i = 0; i < sites; i++)
1288         world->data->seq[0]->category[i] = 1;
1289     for (i = 0; i < sites; i++)
1290         world->data->seq[0]->weight[i] = 1;
1291     if (options->weights)
1292         inputweights (world, data, sites);
1293     world->data->seq[0]->weightsum = 0;
1294     for (i = 0; i < sites; i++)
1295         world->data->seq[0]->weightsum += world->data->seq[0]->weight[i];
1296     if (world->options->categs > 1)
1297     {
1298       inputcategs (0, sites, world, options, data, earth);
1299     }
1300 }    /* inputoptions */
1301 
1302 void
inputweights(world_fmt * world,data_fmt * data,long chars)1303 inputweights (world_fmt * world, data_fmt * data, long chars)
1304 {
1305     /* input the character weights, 0-9 and A-Z for weights 0 - 35 */
1306     char ch;
1307     long i;
1308 	char input[1024];
1309 
1310     ch = getc (data->weightfile);
1311     while (ch == '#')
1312     {
1313         FGETS(input, LINESIZE,data->weightfile);
1314         ch = getc (data->weightfile);
1315     }
1316     ungetc (ch, data->weightfile);
1317     for(i=0;i<chars;i++)
1318       {
1319 	world->data->seq[0]->weight[i] = 1;
1320         ch = getc (data->weightfile);
1321 
1322 	if (isdigit ((int) ch))
1323 	  world->data->seq[0]->weight[i] = ch - '0';
1324 	else
1325 	  {
1326 	    if (isalpha ((int) ch))
1327 	      {
1328 		ch = uppercase (ch);
1329 		world->data->seq[0]->weight[i] = (short) (ch - 'A' + 10);
1330 	      }
1331 	    else
1332 	      {
1333 		if(isspace((int) ch))
1334 		  {
1335 		    i--;
1336 		    continue;
1337 		  }
1338 		else
1339 		  printf ("ERROR: Bad weight character: %c\n", ch);
1340 		exit (EXIT_FAILURE);
1341 	      }
1342 	  }
1343       }
1344 }    /* inputweights */
1345 
1346 void
inputcategs(long a,long b,world_fmt * world,option_fmt * options,data_fmt * data,world_fmt * earth)1347 inputcategs (long a, long b, world_fmt * world, option_fmt * options,
1348              data_fmt * data, world_fmt *earth)
1349 {
1350     /* input the categories, 1-9 */
1351     char ch;
1352     long i;
1353     char input[LINESIZE];
1354     int retval;
1355     if(data->catfile!=NULL)
1356       {
1357 	ch = getc (data->catfile);
1358 	while (ch == '#')
1359 	  {
1360 	    FGETS(input, LINESIZE,data->catfile);
1361 	    ch = getc (data->catfile);
1362 	  }
1363 	ungetc (ch, data->catfile);
1364 	retval = fscanf (data->catfile, "%ld", &options->categs);
1365 	options->rate =
1366 	  (MYREAL *) myrealloc (options->rate, sizeof (MYREAL) * options->categs);
1367 	for (i = 0; i < options->categs; i++)
1368 	  {
1369 #ifdef USE_MYREAL_FLOAT
1370 	    retval = fscanf (data->catfile, "%f", &options->rate[i]);
1371 #else
1372 	    retval = fscanf (data->catfile, "%lf", &options->rate[i]);
1373 #endif
1374 	  }
1375 	for (i = a; i < b; i++)
1376 	  {
1377 	    if ((ch >= '1') && (ch <= ('0' + world->options->categs)))
1378 	      world->data->seq[0]->category[i] = ch - '0';
1379 	    else
1380 	      {
1381 		if(isspace((int) ch))
1382 		  {
1383 		    i--;
1384 		    continue;
1385 		  }
1386 		else
1387 		  {
1388 		    printf
1389 		      ("BAD CATEGORY CHARACTER: %c -- CATEGORIES ARE CURRENTLY 1-%ld\n",
1390 		       ch, world->options->categs);
1391 		    exit (EXIT_FAILURE);
1392 		  }
1393 	      }
1394 	  }
1395 	fclose(data->catfile);
1396 	data->catfile=NULL;
1397       }
1398     else
1399       {
1400 	memcpy(world->data->seq[0]->category,earth->data->seq[0]->category,sizeof(int)*b);
1401       }
1402 }    /* inputcategs */
1403 
1404 
1405 
1406 
1407 void
empiricalfreqs(world_fmt * world,option_fmt * options,seqmodel_fmt * seq,long locus)1408 empiricalfreqs (world_fmt * world, option_fmt * options, seqmodel_fmt * seq,
1409                 long locus)
1410 {
1411     /* Get empirical base frequencies from the data */
1412     long i, j, k;
1413     MYREAL summ, suma, sumc, sumg, sumt, w;
1414     long snps = world->options->datatype == 'u' ? 5 : 1;
1415     options->freqa = 0.25;
1416     options->freqc = 0.25;
1417     options->freqg = 0.25;
1418     options->freqt = 0.25;
1419     for (k = 1; k <= 8; k++)
1420     {
1421         suma = 0.0;
1422         sumc = 0.0;
1423         sumg = 0.0;
1424         sumt = 0.0;
1425         for (i = 0; i < world->sumtips; i++)
1426         {
1427             for (j = 0; j < seq->endsite * snps; j += snps)
1428             {
1429                 w = (MYREAL) seq->aliasweight[j / snps];
1430                 summ = (options->freqa) * world->nodep[i]->x.s[j][0][NUC_A] + (options->freqc) * world->nodep[i]->x.s[j][0][NUC_C]
1431                     + (options->freqg) * world->nodep[i]->x.s[j][0][NUC_G]
1432                 + (options->freqt) * world->nodep[i]->x.s[j][0][NUC_T];
1433                 suma += w * (options->freqa) * world->nodep[i]->x.s[j][0][NUC_A] / summ;
1434                 sumc += w * (options->freqc) * world->nodep[i]->x.s[j][0][NUC_C] / summ;
1435                 sumg += w * (options->freqg) * world->nodep[i]->x.s[j][0][NUC_G] / summ;
1436                 sumt += w * (options->freqt) * world->nodep[i]->x.s[j][0][NUC_T] / summ;
1437 
1438             }
1439         }
1440         summ = suma + sumc + sumg + sumt;
1441         options->freqa = suma / summ;
1442         options->freqc = sumc / summ;
1443         options->freqg = sumg / summ;
1444         options->freqt = sumt / summ;
1445     }
1446 }    /* empiricalfreqs */
1447 
1448 
1449 void
initlambda(option_fmt * options)1450 initlambda (option_fmt * options)
1451 {
1452   int retval;
1453     while (options->lambda <= 1.0)
1454     {
1455         printf
1456         ("Mean block length of sites having the same rate\n (needs to be greater than 1)?\n");
1457 #ifdef USE_MYREAL_FLOAT
1458         retval = scanf ("%f%*[^\n]", &options->lambda);
1459 #else
1460         retval = scanf ("%lf%*[^\n]", &options->lambda);
1461 #endif
1462         getchar ();
1463     }
1464     options->lambda = 1.0 / options->lambda;
1465 }
1466 
1467 
1468 
1469 void
init_tbl(world_fmt * world,long locus)1470 init_tbl (world_fmt * world, long locus)
1471 {
1472     /* Define a lookup table. Precompute values and print them out in tables */
1473     long i, j;
1474     MYREAL sumrates;
1475     long categs = world->options->categs;
1476     long rcategs = world->options->rcategs;
1477     world->tbl = (valrec ***) mymalloc (rcategs * sizeof (valrec **));
1478     for (i = 0; i < rcategs; i++)
1479     {
1480         world->tbl[i] = (valrec **) mymalloc (categs * sizeof (valrec *));
1481         for (j = 0; j < categs; j++)
1482             world->tbl[i][j] = (valrec *) mymalloc (sizeof (valrec));
1483     }
1484     for (i = 0; i < rcategs; i++)
1485     {
1486         for (j = 0; j < categs; j++)
1487         {
1488             world->tbl[i][j]->rat =
1489                 world->options->rrate[i] * world->options->rate[j];
1490             world->tbl[i][j]->ratxi =
1491                 world->tbl[i][j]->rat * world->data->seq[0]->xi;
1492             world->tbl[i][j]->ratxv =
1493                 world->tbl[i][j]->rat * world->data->seq[0]->xv;
1494         }
1495     }
1496     sumrates = 0.0;
1497     for (i = 0; i < world->data->seq[0]->oldsite; i++)
1498     {
1499         for (j = 0; j < rcategs; j++)
1500             sumrates +=
1501                 world->data->seq[0]->aliasweight[i] * world->options->probcat[j] *
1502                 world->tbl[j][world->data->seq[0]->
1503                               category[world->data->seq[0]->alias[i] - 1] - 1]->rat;
1504     }
1505     sumrates /= (MYREAL) world->data->seq[0]->sites[locus];
1506     for (i = 0; i < rcategs; i++)
1507         for (j = 0; j < categs; j++)
1508         {
1509             world->tbl[i][j]->rat /= sumrates;
1510             world->tbl[i][j]->ratxi /= sumrates;
1511             world->tbl[i][j]->ratxv /= sumrates;
1512         }
1513 }    /* inittable */
1514 
1515 void
print_weights(FILE * outfile,world_fmt * world,option_fmt * options,long locus)1516 print_weights (FILE * outfile, world_fmt * world, option_fmt * options,
1517                long locus)
1518 {
1519     if (options->weights)
1520     {
1521         if ((options->printdata) || (options->progress && outfile == stdout))
1522         {
1523             printweights (outfile, world, options, 0,
1524                           world->data->seq[0]->sites[locus],
1525                           world->data->seq[0]->weight, "Sites");
1526         }
1527     }
1528 }
1529 
1530 void
print_tbl(FILE * outfile,world_fmt * world,option_fmt * options,long locus)1531 print_tbl (FILE * outfile, world_fmt * world, option_fmt * options,
1532            long locus)
1533 {
1534     long i;
1535 
1536     option_fmt *opt;
1537 
1538 
1539     opt = options;
1540     if (opt->rcategs > 1)
1541     {
1542         FPRINTF (outfile, "\nRegion type     Rate of change    Probability\n");
1543         FPRINTF (outfile, "---------------------------------------------\n");
1544         for (i = 0; i < opt->rcategs; i++)
1545             FPRINTF (outfile, "%9ld%16.3f%17.3f\n", i + 1, opt->rrate[i],
1546                      opt->probcat[i]);
1547         FPRINTF (outfile,"\n");
1548         if (opt->autocorr)
1549             FPRINTF (outfile,
1550                      "Expected length of a patch of sites having the same rate = %8.3f\n",
1551                      1. / opt->lambda);
1552         FPRINTF (outfile,"\n");
1553     }
1554     if (opt->categs > 1)
1555     {
1556         FPRINTF (outfile, "Site category   Rate of change\n");
1557         FPRINTF (outfile, "------------------------------\n");
1558         for (i = 0; i < opt->categs; i++)
1559             FPRINTF (outfile, "%9ld%16.3f\n", i + 1, opt->rate[i]);
1560     }
1561     if ((opt->rcategs > 1) || (opt->categs > 1))
1562         FPRINTF (outfile, "\n");
1563 }
1564 
1565 
1566 void
printweights(FILE * outfile,world_fmt * world,option_fmt * options,short inc,long chars,short * weight,char * letters)1567 printweights (FILE * outfile, world_fmt * world, option_fmt * options,
1568               short inc, long chars, short *weight, char *letters)
1569 {
1570     /* print out the weights of sites */
1571     long i, j;
1572     FPRINTF (outfile, "\n    %s are weighted as follows:\n", letters);
1573     for (i = 0; i < chars; i++)
1574     {
1575         if (i % 60 == 0)
1576         {
1577             FPRINTF (outfile, "\n");
1578             for (j = 1; j <= options->nmlength + 3; j++)
1579                FPRINTF (outfile, " ");
1580         }
1581         FPRINTF (outfile, "%hd", weight[i + inc]);
1582         if ((i + 1) % 10 == 0 && (i + 1) % 60 != 0)
1583             FPRINTF (outfile, " ");
1584     }
1585     FPRINTF (outfile, "\n\n");
1586 }    /* printweights */
1587 
1588 
1589 
1590 void
print_seqfreqs(FILE * outfile,world_fmt * world,option_fmt * options)1591 print_seqfreqs (FILE * outfile, world_fmt * world, option_fmt * options)
1592 {
1593   if(outfile==NULL)
1594     return;
1595 
1596   if (world->locus == 0 || outfile == stdout)
1597     {
1598         if (options->freqsfrom)
1599             FPRINTF (outfile, "\nEmpirical ");
1600         FPRINTF (outfile, "Base Frequencies\n");
1601         FPRINTF (outfile,
1602                  "------------------------------------------------------------\n");
1603         FPRINTF (outfile,
1604                  "Locus     Nucleotide                        Transition/\n");
1605         FPRINTF (outfile,
1606                  "          ------------------------------  Transversion ratio\n");
1607         FPRINTF (outfile, "          A       C       G       T(U)\n");
1608         FPRINTF (outfile,
1609                  "------------------------------------------------------------\n");
1610     }
1611     FPRINTF (outfile, "%4li      %6.4f  %6.4f  %6.4f  %6.4f    %10.5f\n",
1612              world->locus + 1, options->freqa, options->freqc, options->freqg,
1613              options->freqt, world->data->seq[0]->ttratio);
1614     if (outfile == stdout)
1615         FPRINTF (outfile, "\n");
1616 
1617 }
1618 
1619 MYREAL
treelike_seq(world_fmt * world,long locus)1620 treelike_seq (world_fmt * world, long locus)
1621 {
1622     const seqmodel_fmt *seq = world->data->seq[0];
1623     const MYREAL freqa = seq->basefrequencies[NUC_A];
1624     const MYREAL freqc = seq->basefrequencies[NUC_C];
1625     const MYREAL freqg = seq->basefrequencies[NUC_G];
1626     const MYREAL freqt = seq->basefrequencies[NUC_T];
1627 
1628     contribarr tterm;
1629     contribarr like;
1630     contribarr nulike;
1631     contribarr clai;
1632     //  long size = sizeof(MYREAL) * world->options->rcategs;
1633     MYREAL summ, sum2, sumc, sumterm, lterm;
1634     long i, j, k, lai;
1635     MYREAL scale;
1636     node *p;
1637     sitelike *x1;
1638     worldoption_fmt *opt;
1639     opt = world->options;
1640     p = crawlback (world->root->next);
1641     summ = 0.0;
1642 
1643     if (opt->rcategs == 1)
1644     {
1645         for (i = 0; i < seq->endsite; i++)
1646         {
1647             x1 = &(p->x.s[i][0]);
1648             scale = p->scale[i];
1649             tterm[0] =
1650                 freqa * (*x1)[0] + freqc * (*x1)[1] +
1651                 freqg * (*x1)[2] + freqt * (*x1)[3];
1652             summ += seq->aliasweight[i] * (LOG (tterm[0]) + scale);
1653         }
1654     }
1655     else
1656     {
1657         for (i = 0; i < seq->endsite; i++)
1658         {
1659             scale = p->scale[i];
1660             //k = seq->category[seq->alias[i] - 1] - 1;
1661             for (j = 0; j < opt->rcategs; j++)
1662             {
1663                 x1 = &(p->x.s[i][j]);
1664                 tterm[j] =
1665                     freqa * (*x1)[0] + freqc * (*x1)[1] +
1666                     freqg * (*x1)[2] + freqt * (*x1)[3];
1667             }
1668             sumterm = 0.0;
1669             for (j = 0; j < opt->rcategs; j++)
1670                 sumterm += opt->probcat[j] * tterm[j];
1671             lterm = LOG (sumterm) + scale;
1672             for (j = 0; j < opt->rcategs; j++)
1673                 clai[j] = tterm[j] / sumterm;
1674             swap (clai, world->contribution[i]);
1675             //      memcpy (world->contribution[i], clai, size);
1676             summ += seq->aliasweight[i] * lterm;
1677             if(MYISNAN((float)summ))
1678 	      {
1679 		// error("summ is not a number, should not happen\n");
1680 		summ = -HUGE;
1681 	      }
1682 
1683         }
1684         for (j = 0; j < opt->rcategs; j++)
1685             like[j] = 1.0;
1686         for (i = 0; i < seq->sites[locus]; i++)
1687         {
1688             sumc = 0.0;
1689             for (k = 0; k < opt->rcategs; k++)
1690                 sumc += opt->probcat[k] * like[k];
1691             sumc *= opt->lambda;
1692             if ((seq->ally[i] > 0) && (seq->location[seq->ally[i] - 1] > 0))
1693             {
1694                 lai = seq->location[seq->ally[i] - 1];
1695                 swap (world->contribution[lai - 1], clai);
1696                 //memcpy (clai, world->contribution[lai - 1], size);
1697                 for (j = 0; j < opt->rcategs; j++)
1698                     nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc) * clai[j];
1699             }
1700             else
1701             {
1702                 for (j = 0; j < opt->rcategs; j++)
1703                     nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc);
1704             }
1705             swap (nulike, like);
1706             //memcpy (like, nulike, size);
1707         }
1708         sum2 = 0.0;
1709         for (i = 0; i < opt->rcategs; i++)
1710             sum2 += opt->probcat[i] * like[i];
1711         summ += LOG (sum2);
1712     }
1713     return summ;
1714 }    /* treelikelihood */
1715 
1716 void
snp_invariants_original(contribarr invariants,long endsite,long rcategs,seqmodel_fmt * seq,phenotype x1)1717 snp_invariants_original (contribarr invariants, long endsite, long rcategs,
1718                 seqmodel_fmt * seq, phenotype x1)
1719 {
1720     long i, j;
1721     MYREAL *val;
1722     register MYREAL freqa, freqc, freqg, freqt;
1723     freqa = seq->basefrequencies[NUC_A];
1724     freqc = seq->basefrequencies[NUC_C];
1725     freqg = seq->basefrequencies[NUC_G];
1726     freqt = seq->basefrequencies[NUC_T];
1727 
1728         memset (invariants, 0, sizeof (contribarr));
1729     for (j = 0; j < rcategs; j++)
1730       {
1731 	for (i = endsite; i < endsite + 4; i++)
1732 	  {
1733 	    val = x1[i][j];
1734             invariants[j] +=
1735 	      (freqa * val[0] + freqc * val[1] +
1736 	       freqg * val[2] + freqt * val[3]);
1737 	  }
1738       }
1739     for (j = 0; j < rcategs; j++)
1740       invariants[j] = 1. - invariants[j];
1741 }
1742 
1743 void
snp_invariants(contribarr invariants,world_fmt * world,long locus,phenotype x1,MYREAL * scaler)1744 snp_invariants (contribarr invariants, world_fmt *world, long locus, phenotype x1, MYREAL *scaler)
1745 {
1746     worldoption_fmt *opt;
1747     seqmodel_fmt *seq =  world->data->seq[0];
1748     //MYREAL summ;
1749 
1750     sitelike *val;
1751     long i, j;
1752     register MYREAL freqa, freqc, freqg, freqt;
1753     freqa = seq->basefrequencies[NUC_A];
1754     freqc = seq->basefrequencies[NUC_C];
1755     freqg = seq->basefrequencies[NUC_G];
1756     freqt = seq->basefrequencies[NUC_T];
1757 
1758     opt = world->options;
1759     //summ = 0.0;
1760     /* snp invariants */
1761     for (j = 0; j < opt->rcategs; j++)
1762       {
1763 	invariants[j] = 0.0;
1764 	for (i = seq->endsite - seq->addon ; i < seq->endsite; i++)
1765 	  {
1766 	    val = &(x1[i][j]);
1767             invariants[j] +=
1768 	      exp(scaler[i]) * ((freqa * (*val)[0] + freqc * (*val)[1] +
1769 	       freqg * (*val)[2] + freqt * (*val)[3]));
1770 	  }
1771 	invariants[j] = 1.0 - invariants[j];
1772       }
1773     //    printf("invariant0=%f\n",invariants[0]);
1774 }    /* snp_invariants*/
1775 
1776 
1777 MYREAL
treelike_snp(world_fmt * world,long locus)1778 treelike_snp (world_fmt * world, long locus)
1779 {
1780     worldoption_fmt *opt;
1781     seqmodel_fmt *seq =  world->data->seq[0];
1782     MYREAL scale;
1783     contribarr tterm;
1784     contribarr invariants;
1785     contribarr like;
1786     contribarr nulike;
1787     MYREAL summ, sum2, sumc, sumterm, lterm;
1788     long i, j, k, lai;
1789 
1790     node *p;
1791     sitelike *x1;
1792     register MYREAL freqa, freqc, freqg, freqt;
1793     freqa = seq->basefrequencies[NUC_A];
1794     freqc = seq->basefrequencies[NUC_C];
1795     freqg = seq->basefrequencies[NUC_G];
1796     freqt = seq->basefrequencies[NUC_T];
1797 
1798     opt = world->options;
1799     p = crawlback (world->root->next);
1800     summ = 0.0;
1801     /* snp invariants */
1802     snp_invariants (invariants,world, locus, p->x.s, p->scale);
1803     for (j = 0; j < opt->rcategs; j++)
1804       {
1805 	if(invariants[j] <= 0.0)
1806 	  {
1807 #ifdef DEBUG
1808 	    printf("invariants are funny %f\n", invariants[j]);
1809 #endif
1810 	    return -MYREAL_MAX;
1811 	  }
1812       }
1813     if(opt->rcategs==1)
1814       {
1815 	for (i = 0; i < seq->endsite - seq->addon; i++)
1816 	  {
1817 	    scale = p->scale[i];
1818 	    x1 = &(p->x.s[i][j]);
1819 	    tterm[0] = (freqa * (*x1)[0] + freqc * (*x1)[1] + freqg * (*x1)[2] + freqt * (*x1)[3]);
1820 	    summ += seq->aliasweight[i] * (LOG (tterm[0]) + scale - log(invariants[0]));
1821 	  }
1822 	return summ;
1823       }
1824     else
1825       {
1826 	for (i = 0; i < seq->endsite - seq->addon; i++)
1827 	  {
1828 	    scale = p->scale[i];
1829 	    for (j = 0; j < opt->rcategs; j++)
1830 	      {
1831 		x1 = &(p->x.s[i][j]);
1832 		tterm[j] =
1833 		  (freqa * (*x1)[0] + freqc * (*x1)[1] +
1834 		   freqg * (*x1)[2] + freqt * (*x1)[3]) /invariants[j];
1835 	      }
1836 	    sumterm = 0.0;
1837 	    for (j = 0; j < opt->rcategs; j++)
1838 	      sumterm += opt->probcat[j] * tterm[j];
1839 	    lterm = LOG (sumterm) + scale;
1840 	    for (j = 0; j < opt->rcategs; j++)
1841 	      world->contribution[i][j] = tterm[j] / sumterm;
1842 
1843 	    summ += seq->aliasweight[i] * lterm;
1844 	  }    /* over endsite - 4[snp-invariants] */
1845 	for (j = 0; j < opt->rcategs; j++)
1846 	  like[j] = 1.0;
1847 	for (i = 0; i < seq->sites[locus]; i++)
1848 	  {
1849 	    sumc = 0.0;
1850 	    for (k = 0; k < opt->rcategs; k++)
1851 	      sumc += opt->probcat[k] * like[k];
1852 	    sumc *= opt->lambda;
1853 	    if ((seq->ally[i] > 0) && (seq->location[seq->ally[i] - 1] > 0))
1854 	      {
1855 		lai = seq->location[seq->ally[i] - 1];
1856 		for (j = 0; j < opt->rcategs; j++)
1857 		  nulike[j] =
1858                     ((1.0 - opt->lambda) * like[j] +
1859                      sumc) * world->contribution[lai - 1][j];
1860 	      }
1861 	    else
1862 	      {
1863 		for (j = 0; j < opt->rcategs; j++)
1864 		  nulike[j] = ((1.0 - opt->lambda) * like[j] + sumc);
1865 	      }
1866 	    swap (nulike, like);
1867 	  }
1868 	sum2 = 0.0;
1869 	for (i = 0; i < opt->rcategs; i++)
1870 	  sum2 += opt->probcat[i] * like[i];
1871 	summ += LOG (sum2);
1872 	return summ;
1873       }
1874 }    /* treelike_snp */
1875 
1876 
1877 MYREAL
treelike_snp_unlinked(world_fmt * world,long locus)1878 treelike_snp_unlinked (world_fmt * world, long locus)
1879 {
1880     //worldoption_fmt *opt;
1881     seqmodel_fmt *seq = world->data->seq[0];
1882     MYREAL scale;
1883     contribarr tterm, invariants;
1884     MYREAL summ, datasum = 0, lterm, result = 0;
1885     long i, ii;
1886 
1887     node *p;
1888     sitelike *x1;
1889 
1890     register MYREAL freqa, freqc, freqg, freqt;
1891 
1892     freqa = seq->basefrequencies[NUC_A];
1893     freqc = seq->basefrequencies[NUC_C];
1894     freqg = seq->basefrequencies[NUC_G];
1895     freqt = seq->basefrequencies[NUC_T];
1896 
1897     //opt = world->options;
1898     //seq = world->data->seq[0];
1899     p = crawlback (world->root->next);
1900     summ = 0.0;
1901     /* snp invariants */
1902     snp_invariants (invariants, world, locus, p->x.s, p->scale);
1903     /* no rate categories used */
1904     for (i = 0; i < seq->endsite; i++)
1905     {
1906         ii = i / 5;
1907         scale = p->scale[i];
1908         x1 = &(p->x.s[i][0]);
1909         tterm[0] =
1910             (freqa * (*x1)[0] + freqc * (*x1)[1] +
1911              freqg * (*x1)[2] + freqt * (*x1)[3]);
1912         if (i % 5 == 0)
1913         {
1914             lterm = LOG (tterm[0]) + scale;
1915             summ = 0;
1916             datasum = seq->aliasweight[ii] * lterm;
1917         }
1918         else
1919             summ += pow (tterm[0], (MYREAL) seq->aliasweight[ii]);
1920         if (((i + 1) % 5) == 0 && i != 0)
1921 	  {
1922 	    if(summ > 0.)
1923 	      {
1924 		result +=
1925 		  datasum + LOG ((1 - EXP (LOG (summ) - datasum)) / invariants[0]);
1926 	      }
1927 	  }
1928     }
1929     return result;
1930 }    /* treelike_snp_unlinked */
1931 
1932 void
check_basefreq(option_fmt * options)1933 check_basefreq (option_fmt * options)
1934 {
1935 
1936     if (options->freqa == 0. || options->freqc == 0. || options->freqt == 0.
1937             || options->freqg == 0.)
1938     {
1939         options->freqa = 0.25;
1940         options->freqc = 0.25;
1941         options->freqg = 0.25;
1942         options->freqt = 0.25;
1943     }
1944 }
1945 
1946 
1947 void
copy_seq(world_fmt * original,world_fmt * kopie)1948 copy_seq (world_fmt * original, world_fmt * kopie)
1949 {
1950     long sites;
1951     seqmodel_fmt *kseq;
1952     seqmodel_fmt *oseq;
1953     kseq = kopie->data->seq[0];
1954     oseq = original->data->seq[0];
1955     sites = oseq->sites[original->locus];
1956     memcpy(kseq->basefrequencies,oseq->basefrequencies,sizeof(MYREAL)*BASEFREQLENGTH);
1957     kseq->aa = oseq->aa;
1958     kseq->bb = oseq->bb;
1959     kseq->endsite = oseq->endsite;
1960     kseq->xi = oseq->xi;
1961     kseq->xv = oseq->xv;
1962     kseq->ttratio = oseq->ttratio;
1963     kseq->fracchange = oseq->fracchange;
1964     memcpy (kseq->sites, oseq->sites, sizeof (long) * original->loci);
1965     memcpy (kseq->alias, oseq->alias, sizeof (long) * sites);
1966     memcpy (kseq->ally, oseq->ally, sizeof (long) * sites);
1967     memcpy (kseq->category, oseq->category, sizeof (long) * sites);
1968     memcpy (kseq->weight, oseq->weight, sizeof (short) * sites);
1969     kseq->weightsum = oseq->weightsum;
1970     memcpy (kseq->aliasweight, oseq->aliasweight, sizeof (long) * sites);
1971     memcpy (kseq->location, oseq->location, sizeof (long) * sites);
1972     kseq->addon = oseq->addon;
1973 }
1974 
1975 
find_rates_fromdata(data_fmt * data,option_fmt * options,world_fmt * world)1976 void find_rates_fromdata(data_fmt * data, option_fmt * options, world_fmt *world)
1977 {
1978   long locus;
1979   long pop;
1980   long ind;
1981   long sites;
1982   long i;
1983   long n;
1984   long s;
1985   long **v;
1986   long maxsites = 1;
1987   long *numind;
1988   char *reference;
1989   char *indseq;
1990   long loci = data->loci;
1991   long numpop = data->numpop;
1992   long segreg;
1993   MYREAL mean=0.;
1994   MYREAL delta = 0.;
1995 
1996   numind  = (long *) mycalloc (data->loci, sizeof (long));
1997   options->segregs = (long *) mycalloc (data->loci, sizeof (long));
1998   options->wattersons = (MYREAL *) mycalloc (data->loci, sizeof (MYREAL));
1999   for (locus=0; locus < loci; locus++)
2000     {
2001       maxsites = (data->seq[0]->sites[locus] > maxsites ? data->seq[0]->sites[locus] : maxsites);
2002       for(pop=0;pop < data->numpop; pop++)
2003 	{
2004 	  numind[locus] += data->numind[pop][locus];
2005 	}
2006     }
2007 
2008   v = (long **) mycalloc (loci, sizeof (long *));
2009   v[0] = (long *) mycalloc (loci * maxsites, sizeof (long));
2010   for (i = 1; i < loci; i++)
2011     {
2012       v[i] = v[0] + maxsites * i;
2013     }
2014 
2015   for (locus=0; locus< loci; locus++)
2016     {
2017       reference = data->yy[0][0][locus][0];
2018       for(pop=0;pop < numpop; pop++)
2019 	{
2020 	  for(ind=0; ind < data->numind[pop][locus];ind++)
2021 	    {
2022 	      indseq=data->yy[pop][ind][locus][0];
2023 	      if(strcmp(indseq,reference)!=0)
2024 		{
2025 		  for(sites=0; sites < data->seq[0]->sites[locus]; sites++)
2026 		    {
2027 		      if(v[locus][sites] == 0)
2028 			{
2029 			  s = (long)(indseq[sites] != reference[sites]);
2030 			  v[locus][sites] += s;
2031 			}
2032 		    }
2033 		}
2034 	    }
2035 	}
2036     }
2037   //n = 0;
2038   //mean = 0.;
2039   if(options->mu_rates == NULL)
2040     {
2041       options->mu_rates = (MYREAL *) mycalloc( loci, sizeof(MYREAL));
2042       //      printf("%i> opt murate size %li\n",myID,loci * sizeof (MYREAL));
2043     }
2044   n = 0;
2045   mean = 0. ;
2046   options->muloci = loci;
2047   for (locus=0; locus < loci; locus++)
2048     {
2049       segreg = 0;
2050       for(sites=0; sites < data->seq[0]->sites[locus]; sites++)
2051 	{
2052 	  segreg += (long) (v[locus][sites] > 0);
2053 	}
2054       options->segregs[locus] = segreg;
2055       if(segreg==0)
2056 	{
2057 	  if (options->onlyvariable)
2058 	    {
2059 	      data->skiploci[locus] = TRUE;
2060 	      data->locusweight[locus] = 0.0;
2061 	    }
2062 	  else
2063 	    {
2064 	      if (options->has_variableandone)
2065 		{
2066 		  if (options->firstinvariant == -1)
2067 		    {
2068 		      options->firstinvariant=locus;
2069 		      data->locusweight[locus] = 1.0;
2070 		    }
2071 		  else
2072 		    {
2073 		      data->skiploci[locus] = TRUE;
2074 		      data->locusweight[locus] = 0.0;
2075 		      data->locusweight[options->firstinvariant] +=1;
2076 		    }
2077 		}
2078 	    }
2079 	}
2080       // originally I used just the plain watterson estimates but
2081       // this leads to strange results with highly unequal sequences
2082       // now the watterson's theta is adjusted per site
2083       // and that is used to generate the relative rate from the data
2084       // PB April 3 2011
2085       options->wattersons[locus] = watterson(segreg,numind[locus]);
2086       options->mu_rates[locus] = (options->wattersons[locus] + 0.0000001)/data->seq[0]->sites[locus];
2087       if (data->seq[0]->sites[locus]>0)
2088 	options->wattersons[locus] /= data->seq[0]->sites[locus];
2089       n = n + 1;
2090       delta = options->mu_rates[locus] - mean;
2091       mean += delta/n;
2092     }
2093   if(mean > 0.0)
2094     {
2095       for (locus=0; locus < loci; locus++)
2096 	{
2097 	  options->mu_rates[locus] /= mean;
2098 	}
2099     }
2100   else
2101     {
2102       options->murates = FALSE;
2103       options->murates_fromdata = FALSE;
2104       warning("Relative mutation rates estimation from data was requested but attempt failed --> reset to all-equal rates");
2105     }
2106   myfree(v[0]);
2107   myfree(v);
2108   myfree(numind);
2109 }
2110 
find_rates_fromdata_alleles(data_fmt * data,option_fmt * options,world_fmt * world,MYREAL mean)2111 void find_rates_fromdata_alleles(data_fmt * data, option_fmt * options, world_fmt *world, MYREAL mean)
2112 {
2113   long locus;
2114   options->segregs = (long *) mycalloc(data->loci, sizeof(long));
2115   for (locus=0;locus < data->loci; locus++)
2116     {
2117       options->segregs[locus] = (long) (options->mu_rates[locus] *  mean + 0.5);
2118     }
2119 }
2120 
2121 
free_seq(seqmodel_fmt ** seq,long seqnum)2122 void free_seq(seqmodel_fmt **seq, long seqnum)
2123 {
2124   long i;
2125   for(i=0;i<seqnum;i++)
2126     {
2127       if(seq[i]->links!=NULL)
2128 	{
2129 	  myfree(seq[i]->links);
2130 	}
2131       myfree(seq[i]->sites);
2132     }
2133   myfree(seq[0]);
2134   myfree(seq);
2135 }
2136