1 /*------------------------------------------------------
2  Bayesian Maximum likelihood estimation
3  of migration rate  and effectice population size
4  using a Metropolis-Hastings Monte Carlo algorithm
5  -------------------------------------------------------
6  D A T A   R O U T I N E S
7 
8  creates data structures,
9  read data (Electrophoretic loci, sequences, microsats),
10  prints data,
11  destroys data.
12 
13 Copyright 1997-2002 Peter Beerli and Joseph Felsenstein
14 Copyright 2003-2008 Peter Beerli
15 Copyright 2009-2012 Peter Beerli and Michal Palczewski
16 
17 Permission is hereby granted, free of charge, to any person obtaining a copy
18 of this software and associated documentation files (the "Software"), to deal
19 in the Software without restriction, including without limitation the rights
20 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
21 of the Software, and to permit persons to whom the Software is furnished to do
22 so, subject to the following conditions:
23 
24 The above copyright notice and this permission notice shall be included in all copies
25 or substantial portions of the Software.
26 
27 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
28 INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
29 PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
30 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
31 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
32 OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33 
34 $Id: data.c 2172 2013-10-30 17:50:08Z beerli $
35 
36 -----------------------------------
37 add this to the microsatellite data, this will allow to use
38 a known repeat lengths to read number of sites instead of
39 number of repeats, the datafile will need an additional line
40 perhaps use something like #@ 2 2 2 3  .... on the second line
41 so that other non microsatellite data models
42 can ignore it
43 y = Map[If[(a = Mod[#, rlen]) != 0,
44       If[Random[] < 0.5, # - (rlen + a), # + rlen - a], #] &, x]/
45    rlen  - Min[y] + 1
46 -------------------------------------------------------*/
47 /*! \file data.c
48 
49 Data manipulation routines
50 
51 */
52 
53 
54 #include <string.h>
55 #include <wchar.h>
56 #ifndef WIN32
57 #include <xlocale.h>
58 #endif
59 #include "migration.h"
60 #include "sighandler.h"
61 #include "tools.h"
62 #include "migrate_mpi.h"
63 #include "data.h"
64 #include "sequence.h"
65 #include "random.h"
66 #ifdef PRETTY
67 #include "pretty.h"
68 #endif
69 extern long number_genomes (int type);
70 extern void jumble (long *s, long n);
71 extern void jumble_ownseed (long *s, long n);
72 
73 #ifdef DMALLOC_FUNC_CHECK
74 #include <dmalloc.h>
75 #endif
76 /* prototypes ----------------------------------------- */
77 //void create_data (data_fmt ** data);
78 //void get_data (FILE * infile, data_fmt * data, option_fmt * options);
79 //void print_data (world_fmt * world, option_fmt * options, data_fmt * data);
80 //long find_missing(data_fmt *data, long pop, long locus);
81 //void print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
82 //                         data_fmt * data);
83 //short findAllele (data_fmt * data, char s[], long locus);
84 //void free_datapart (data_fmt * data, option_fmt * options, long locus);
85 /*private functions */
86 //void init_data_structure1 (data_fmt ** data, option_fmt * options);
87 void read_header (FILE * infile, data_fmt * data, option_fmt * options);
88 void read_sites (data_fmt * data, world_fmt *world, option_fmt *options);
89 //void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop);
90 //void init_data_structure3 (data_fmt * data);
91 void read_popheader (FILE * infile, data_fmt * data, option_fmt * options, long pop, long genomes);
92 void read_indname (FILE * file, data_fmt * data, long pop, long ind,
93                    long locus, long nmlength);
94 void read_popdata (FILE * file, data_fmt * data, long pop,
95                    option_fmt * options);
96 void read_microalleles (FILE * infile, data_fmt * data, option_fmt *options, long pop, long ind);
97 void read_alleles (FILE * infile, data_fmt * data, long pop, long ind);
98 long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
99                    long locus, long pop, long ind, long baseread);
100 long read_ind_seq_oneliner (FILE * infile, data_fmt * data, option_fmt * options,
101 		       long pop, long ind, long baseread);
102 
103 void read_hapmap (FILE * infile, data_fmt * data, option_fmt * options, long pop);
104 void
105 read_hapmap_genotypes (FILE * infile, data_fmt * data, option_fmt * options, long pop);
106 void read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m);
107 void finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
108                       long pop, long baseread);
109 void print_alleledata (world_fmt * world, data_fmt * data,
110                        option_fmt * options);
111 void print_microdata (world_fmt * world, data_fmt * data,
112                        option_fmt * options);
113 void print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data);
114 
115 void print_header (FILE * outfile, long pop, world_fmt * world,
116                    option_fmt * options, data_fmt * data);
117 MYREAL create_alleles (data_fmt * data, option_fmt *options);
118 void addAllele (data_fmt * data, char s[], long locus, long *z);
119 void set_numind (data_fmt * data);
120 void print_seq_pop (long locus, long pop, world_fmt * world,
121                     option_fmt * options, data_fmt * data);
122 void print_seq_ind (long locus, long pop, long ind, world_fmt * world,
123                     option_fmt * options, data_fmt * data);
124 void print_locus_head (long locus, world_fmt * world, option_fmt * options,
125                        data_fmt * data);
126 void find_delimiter (char *title, char *dlm);
127 void read_geofile (data_fmt * data, option_fmt * options, long numpop);
128 void read_datefile (data_fmt * data, option_fmt * options, long numpop);
129 void read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
130                         long *uepsites, long datatype);
131 void read_uepfile (data_fmt * data, option_fmt * options, long numpop);
132 
133 
134 /*=====================================================*/
135 /// creates the data structure
136 void
create_data(data_fmt ** data)137 create_data (data_fmt ** data)
138 {
139     (*data) = (data_fmt *) mycalloc (1, sizeof (data_fmt));
140 }
141 
142 /*
143 void
144 init_data (data_fmt * data)
145 {
146 
147 }
148 */
149 
150 ///
151 /// free the data module
152 void
destroy_data(data_fmt * data)153 destroy_data (data_fmt * data)
154 {
155   long ind;
156   long indalloc;
157   long locus;
158   long pop;
159   long loci = data->loci;
160   long numpop = data->numpop;
161 
162   // free data from init_data_structure3
163   for (locus = 0; locus < loci; locus++)
164     {
165       myfree(data->allele[locus]);
166     }
167   myfree(data->allele);
168   myfree(data->subloci);
169   myfree(data->maxalleles);
170   myfree(data->skiploci);
171   myfree(data->locusweight);
172 
173   // free data from init_data_structure2
174   for(pop=0; pop < numpop ; pop++)
175     {
176       indalloc = -1;
177       for(locus=0; locus < loci; locus++)
178 	{
179 	  if(indalloc < data->numind[pop][locus])
180 	    indalloc = data->numind[pop][locus];
181 	}
182       for (ind = 0; ind < indalloc; ind++)
183 	{
184 	  for(locus=0; locus < loci; locus++)
185 	    {
186 	      myfree(data->indnames[pop][ind][locus]);
187 	      myfree(data->yy[pop][ind][locus]);
188 	    }
189 	  myfree(data->indnames[pop][ind]);
190 	  myfree(data->yy[pop][ind]);
191 	}
192       myfree(data->indnames[pop]);
193       myfree(data->yy[pop]);
194     }
195   myfree(data->indnames);
196   myfree(data->yy);
197 
198   // data->yy were already freed in free_datapart()
199 
200   // free data from init_structure_1
201   myfree(data->popnames[0]);
202   myfree(data->numind[0]);
203   myfree(data->numalleles[0]);
204   myfree(data->popnames);
205   myfree(data->numind);
206   myfree(data->numalleles);
207 
208   if(data->position!=NULL)
209     myfree(data->position);
210   myfree(data->geo);
211   myfree(data->lgeo);
212   if(data->ogeo != NULL)
213     {
214       myfree(data->ogeo[0]);
215       myfree(data->ogeo);
216     }
217   // free sampledates
218   for(locus=0;locus<data->loci;locus++)
219     {
220       for(pop=0;pop < data->numpop; pop++)
221 	{
222 	  myfree(data->sampledates[pop][locus]);
223 	}
224     }
225   myfree(data->sampledates[0]);
226   myfree(data->sampledates);
227   myfree(data);
228 }
229 
230 ///
231 /// shuffles (jumbles) the individuals so that we can take the first y to subsample the dataset
shuffle(long ** shuffled,long n,int shuffle_ON)232 void shuffle(long **shuffled, long n, int shuffle_ON)
233 {
234   long i;
235   for(i=0;i<n;i++)
236     {
237       (*shuffled)[i] = i;
238     }
239   switch(shuffle_ON)
240     {
241     case 0:
242       break;
243     case -1:
244       jumble(*shuffled, n);
245       break;
246     default:
247       jumble_ownseed(*shuffled, n);
248     }
249 }
250 
shuffle_data(data_fmt * data,option_fmt * options)251 void shuffle_data(data_fmt *data, option_fmt *options)
252 {
253   long pop;
254   long locus;
255   data->shuffled = (long ***) mycalloc(data->numpop,sizeof(long **));
256   for(pop=0; pop < data->numpop; pop++)
257     {
258       data->shuffled[pop] = (long **) mycalloc(data->loci, sizeof(long *));
259       for(locus=0; locus < data->loci; locus++)
260 	{
261 	  data->shuffled[pop][locus] = (long *) mycalloc(data->numind[pop][locus], sizeof(long));
262 	  if(options->randomsubset>0 && options->randomsubset <  data->numind[pop][locus])
263 	    {
264 	      shuffle(&data->shuffled[pop][locus],data->numind[pop][locus],options->randomsubsetseed);
265 	    }
266 	  else
267 	    {
268 	      shuffle(&data->shuffled[pop][locus],data->numind[pop][locus],0);
269 	    }
270 	}
271     }
272 }
273 
274 
max_shuffled_individuals(option_fmt * options,data_fmt * data,long pop,long locus)275 long max_shuffled_individuals(option_fmt *options, data_fmt *data, long pop, long locus)
276 {
277   long top;
278   if(options->randomsubset>0 && options->randomsubset <  data->numind[pop][locus])
279     {
280       top = options->randomsubset;
281     }
282   else
283     {
284       top = data->numind[pop][locus];
285     }
286   return top;
287 }
288 
289 
290 //---------------------------------------------------------------
291 // ..number_pop.number_loci.<delimiter>.<title [no numbers to start]
292 // locu_specification {sNumber_of_sites}{n{number of sites}{m1}{a1}
293 // [parentheses () mark link loci
294 // example:
295 // (n3 s120 m1)(m1 m1 m1 a1 s100)(n1)(n2)(m1)
296 // contains 5 linked loci, the first two are compounds
297 // the system is to have for each individual haplotype a set of
298 // loci so that it may look like this
299 // ind11......ACA ATTAGA 13 15 5 3 2 A ATACG A AT 13
300 // ind12......ACA ATTCGA 12 15 6 3 2 A ATACG T CT 5
301 void
get_new_data(FILE * infile,data_fmt * data,option_fmt * options,world_fmt * world)302 get_new_data (FILE * infile, data_fmt * data, option_fmt * options, world_fmt *world)
303 {
304   //long locus;
305   MYREAL mean;
306   long pop;
307   long locus;
308   long genomes=1;
309   boolean saved_option_murate=FALSE;
310   data->hasghost = FALSE;
311   // read how many populations and loci and delimiter and title
312   // if we have the new haplotype input then there should be no delimiter
313   // needed
314   read_header (infile, data, options);
315   init_data_structure1 (&data, options);
316   //
317   // here I need to add the msat_lengths support
318   //
319   // this will set up all the locus structure, but may need some additional
320   // init_data-structure1a()
321   read_sites(data, world, options);
322   // snps etc need something like
323   //        data->seq[0]->addon = 4;
324   // hapmap data is peculiar check that
325   // if not sequence the fracchange [for F84] needs to be 1.0
326   //        data->seq[0]->fracchange = 1.0;
327 
328   if (options->progress)
329     fprintf (stdout, "\n\n");
330   if (options->writelog)
331     fprintf (options->logfile, "\n\n");
332 
333   for (pop = 0; pop < data->numpop; pop++)
334     {
335       read_popheader (infile, data, options, pop, genomes);
336       if (options->progress)
337 	fprintf (stdout, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
338       if (options->writelog)
339 	fprintf (options->logfile, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
340       init_data_structure2 (&data, options, pop);
341       read_popdata (infile, data, pop, options);
342     }
343   read_geofile (data, options, data->numpop);
344 #ifdef UEP
345 
346     read_uepfile (data, options, data->numpop);
347 #endif
348     read_datefile(data, options, data->numpop);
349     if (options->progress)
350         fprintf (stdout, "\n\n");
351     init_data_structure3 (data,options);
352     if ((options->onlyvariable || options->has_variableandone) && !options->murates_fromdata)
353       {
354 	saved_option_murate = TRUE;
355 	options->murates_fromdata = TRUE;
356       }
357     switch (options->datatype)
358     {
359     case 'a':
360         mean = create_alleles (data, options);
361         break;
362     case 'b':
363         mean = create_alleles (data, options);
364         for (pop = 0; pop < data->loci; pop++)
365             data->maxalleles[pop] = XBROWN_SIZE;
366         break;
367     case 'm':
368         mean = create_alleles (data, options);
369         for (pop = 0; pop < data->loci; pop++)
370             data->maxalleles[pop] = options->micro_stepnum;
371         break;
372     case 'h':
373     case 'n':
374     case 'u':
375       mean = create_alleles (data, options);
376       break;
377     default: /*DNA types*/
378       for (pop = 0; pop < data->loci; pop++)
379             data->maxalleles[pop] = 4;
380     }
381 
382     shuffle_data(data, options);
383 
384     if(options->murates_fromdata)
385       {
386 	if (strchr (SEQUENCETYPES, options->datatype))
387 	  {
388 	    find_rates_fromdata(data, options, world);
389 	  }
390 	else
391 	  {
392 	    find_rates_fromdata_alleles(data, options, world, mean);
393 	  }
394       }
395     if(saved_option_murate)
396       {
397 	for (locus=0; locus < data->loci; locus++)
398 	  {
399 	    options->mu_rates[locus] = 1.0;
400 	  }
401 	options->murates_fromdata=FALSE;
402 	saved_option_murate = FALSE;
403       }
404 }
405 
406 void
get_data(FILE * infile,data_fmt * data,option_fmt * options,world_fmt * world)407 get_data (FILE * infile, data_fmt * data, option_fmt * options, world_fmt *world)
408 {
409   long locus;
410   long pop;
411   long genomes=1;
412   MYREAL mean;
413   boolean saved_option_murate=FALSE;
414     data->hasghost = FALSE;
415     read_header (infile, data, options);
416     genomes =  number_genomes (options->datatype);
417     init_data_structure1 (&data, options);
418     data->datatype[0] = options->datatype;
419     switch (options->datatype)
420     {
421     case 's':
422     case 'f':// read standard sequence data
423       read_sites (data,world, options);
424         break;
425     case 'n': // read snp data
426       read_sites (data,world, options);
427         data->seq[0]->addon = 4;
428         break;
429     case 'h': //read data from hapmap allele frequency files
430       // there is no sites line in these files!
431       for(locus=0;locus<data->loci;locus++)
432 	data->seq[0]->sites[locus] = 1;
433       data->seq[0]->addon = 4;
434         break;
435     case 'u': // read unlinked snp
436       read_sites (data,world, options);
437         data->seq[0]->addon = 4;
438         break;
439     default:
440       read_sites(data,world, options); // checks whether there is a line for microsat repeat numbers
441       data->seq[0]->fracchange = 1.0;
442       break;
443     }
444     if (options->progress)
445         fprintf (stdout, "\n\n");
446     if (options->writelog)
447         fprintf (options->logfile, "\n\n");
448     for (pop = 0; pop < data->numpop; pop++)
449     {
450       read_popheader (infile, data, options, pop, genomes);
451         if (options->progress)
452             fprintf (stdout, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
453         if (options->writelog)
454             fprintf (options->logfile, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
455         init_data_structure2 (&data, options, pop);
456         read_popdata (infile, data, pop, options);
457     }
458     read_geofile (data, options, data->numpop);
459 #ifdef UEP
460 
461     read_uepfile (data, options, data->numpop);
462 #endif
463     read_datefile(data, options, data->numpop);
464     if (options->progress)
465         fprintf (stdout, "\n\n");
466     init_data_structure3 (data,options);
467     if ((options->onlyvariable || options->has_variableandone) && !options->murates_fromdata)
468       {
469 	saved_option_murate = TRUE;
470 	options->murates_fromdata = TRUE;
471       }
472     switch (options->datatype)
473     {
474     case 'a':
475         mean=create_alleles (data, options);
476         break;
477     case 'b':
478         mean=create_alleles (data, options);
479         for (pop = 0; pop < data->loci; pop++)
480             data->maxalleles[pop] = XBROWN_SIZE;
481         break;
482     case 'm':
483         mean=create_alleles (data, options);
484         for (pop = 0; pop < data->loci; pop++)
485             data->maxalleles[pop] = options->micro_stepnum;
486         break;
487     case 'h':
488     case 'n':
489     case 'u':
490       mean=create_alleles (data, options);
491       break;
492     default: /*DNA types*/
493       for (pop = 0; pop < data->loci; pop++)
494             data->maxalleles[pop] = 4;
495     }
496 
497     shuffle_data(data, options);
498 
499     if(options->murates_fromdata)
500       {
501 	if (strchr (SEQUENCETYPES, options->datatype))
502 	  {
503 	    find_rates_fromdata(data, options, world);
504 	  }
505 	else
506 	  {
507 	    find_rates_fromdata_alleles(data, options, world, mean);
508 	  }
509       }
510     if(saved_option_murate)
511       {
512 	for (locus=0; locus < data->loci; locus++)
513 	  {
514 	    options->mu_rates[locus] = 1.0;
515 	  }
516 	options->murates_fromdata=FALSE;
517 	saved_option_murate = FALSE;
518       }
519 }
520 
521 /* private functions ========================================== */
522 
523 void
init_data_structure1(data_fmt ** data,option_fmt * options)524 init_data_structure1 (data_fmt ** data, option_fmt * options)
525 {
526     long pop;
527     long locus;
528     long numpop = (*data)->numpop;
529     long loci   = (*data)->loci;
530 
531     (*data)->ogeo = NULL;
532     (*data)->geo = NULL;
533     if ((*data)->yy == NULL)
534     {
535         (*data)->yy = (char *****) mymalloc (sizeof (char ****) * numpop);
536         (*data)->seq = (seqmodel_fmt **) mycalloc (1, sizeof (seqmodel_fmt *));
537         (*data)->seq[0] = (seqmodel_fmt *) mycalloc (1, sizeof (seqmodel_fmt));
538         (*data)->popnames =(char **) mymalloc (sizeof (char *) * numpop);
539         (*data)->popnames[0] =(char *) mycalloc (numpop * STRSIZE,sizeof(char));
540         (*data)->indnames = (char ****) mymalloc (sizeof (char ***) * numpop);
541         (*data)->numind = (long **) mymalloc (sizeof (long *) * numpop);
542         (*data)->numind[0] = (long *) mymalloc (sizeof (long) * numpop * loci);
543         (*data)->numalleles = (long **) mymalloc (sizeof (long *) * numpop);
544         (*data)->numalleles[0] = (long *) mymalloc (sizeof (long) * numpop * loci);
545 	(*data)->locitypes = (char *) mycalloc(loci, sizeof(char));
546 	(*data)->locusname = (char **) mycalloc(loci, sizeof(char*));
547         for (pop = 1; pop < numpop; pop++)
548         {
549 	  (*data)->popnames[pop] = (*data)->popnames[0] + pop * STRSIZE;
550 	  (*data)->numind[pop] = (*data)->numind[0] + pop * loci;
551 	  (*data)->numalleles[pop] =  (*data)->numalleles[0] + pop * loci;
552         }
553         (*data)->seq[0]->sites = (long *) mycalloc (loci, sizeof (long));
554         (*data)->seq[0]->links = (boolean *) mycalloc (loci, sizeof (boolean));
555         (*data)->position = (long *) mycalloc (loci, sizeof (long));
556 	(*data)->datatype = (char *) mycalloc(loci, sizeof(char));
557 	(*data)->numdatatypealloc = loci;
558 	(*data)->numsublocialloc = loci;
559 	(*data)->subloci = (long *) mycalloc(loci,sizeof(long));
560 	(*data)->numrepeatnumbers = (long *) mycalloc(loci, sizeof(long));
561 	(*data)->repeatnumbers = (long **) mycalloc(loci, sizeof(long *));
562 	(*data)->repeatlength = (long *) mycalloc(loci, sizeof(long));
563 	for(locus=0;locus<loci;locus++)
564 	  {
565 	    (*data)->numrepeatnumbers[locus] = 1;
566 	    (*data)->repeatnumbers[locus] = (long *) mycalloc(1, sizeof(long));
567 	  }
568     }
569     else
570     {
571         error ("Problem with initialization of data matrix yy\n");
572     }
573 }
574 
575 
576 void
init_data_structure2(data_fmt ** data,option_fmt * options,long pop)577 init_data_structure2 (data_fmt ** data, option_fmt * options, long pop)
578 {
579     long ind, locus;
580     long indalloc = -1;
581     for(locus=0;locus<(*data)->loci;locus++)
582     {
583         if(indalloc < (*data)->numind[pop][locus])
584             indalloc = (*data)->numind[pop][locus];
585     }
586     if (indalloc == 0)
587         indalloc = 2;
588 
589     (*data)->yy[pop] = (char ****) mymalloc (sizeof (char ***) * indalloc);
590     (*data)->indnames[pop] = (char ***) mycalloc (1, sizeof (char **) * indalloc);
591 #ifdef DEBUG
592     printf("init_data_structure2: indalloc=%li\n",indalloc);
593 #endif
594     for (ind = 0; ind < indalloc; ind++)
595     {
596         (*data)->indnames[pop][ind] =
597             (char **) mymalloc (sizeof (char *) * (*data)->loci);
598         (*data)->yy[pop][ind] =
599             (char ***) mymalloc (sizeof (char **) * (*data)->loci);
600         for (locus = 0; locus < (*data)->loci; locus++)
601         {
602 	  (*data)->indnames[pop][ind][locus] =
603             (char *) mycalloc (1, sizeof (char) * (1 + options->nmlength));
604             if (!strchr (SEQUENCETYPES, options->datatype))
605             {
606                 (*data)->yy[pop][ind][locus] =
607                     (char **) mycalloc (1, sizeof (char *) * 2);
608                 (*data)->yy[pop][ind][locus][0] =
609                     (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
610                 (*data)->yy[pop][ind][locus][1] =
611                     (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
612             }
613             else
614             {
615                 (*data)->yy[pop][ind][locus] =
616                     (char **) mycalloc (1, sizeof (char *));
617                 (*data)->yy[pop][ind][locus][0] =
618                     (char *) mycalloc (1,
619                                      sizeof (char) * ((*data)->seq[0]->sites[locus]+1));
620 #ifdef DEBUG
621 		printf("%i> sites=%li+1, pop=%li ind=%li locus=%li\n",myID, ((*data)->seq[0]->sites[locus]+1),pop, ind, locus);
622 #endif
623             }
624         }
625     }
626 }
627 
628 
629 short
findAllele(data_fmt * data,char s[],long locus)630 findAllele (data_fmt * data, char s[], long locus)
631 {
632     short found = 0;
633     while (data->allele[locus][found]!=NULL && (data->allele[locus][found][0] != '\0') &&(strcmp (s, data->allele[locus][found])))
634         found++;
635     return found;
636 }
637 
638 void
free_datapart(data_fmt * data,option_fmt * options,long locus)639 free_datapart (data_fmt * data, option_fmt * options, long locus)
640 {
641     long ind, pop;
642     //  long genomes = number_genomes (options->datatype);
643     for (pop = 0; pop < data->numpop; pop++)
644     {
645         for (ind = 0; ind < data->numind[pop][locus]; ind++)
646         {
647             if (!strchr (SEQUENCETYPES, options->datatype))
648             {
649                 myfree(data->yy[pop][ind][locus][0]);
650                 myfree(data->yy[pop][ind][locus][1]);
651                 myfree(data->yy[pop][ind][locus]);
652             }
653             else
654             {
655                 myfree(data->yy[pop][ind][locus][0]);
656                 myfree(data->yy[pop][ind][locus]);
657             }
658         }
659     }
660 }
661 
662 
663 void
init_data_structure3(data_fmt * data,option_fmt * options)664 init_data_structure3 (data_fmt * data, option_fmt *options)
665 {
666     long locus, pop, maxi;
667     data->allele =
668         (char ***) mycalloc (1, sizeof (char **) * data->loci);
669     data->subloci = (long *) mycalloc(data->loci,sizeof(long));
670     for (locus = 0; locus < data->loci; locus++)
671     {
672       data->subloci[locus] = 1;
673         maxi = 0;
674         for (pop = 0; pop < data->numpop; pop++)
675             maxi += data->numalleles[pop][locus];
676         data->allele[locus] =
677             (char **) mycalloc (maxi+1, sizeof (char*));
678         for (pop = 0; pop < maxi; pop++)
679 	  data->allele[locus][pop] =
680             (char *) mycalloc (options->allelenmlength+1, sizeof (char));
681     }
682     data->maxalleles = (long *) mycalloc (1, sizeof (long) * data->loci);
683     data->skiploci =
684         (boolean *) mycalloc (1, sizeof (boolean) * (data->loci + 1));
685     data->locusweight =
686         (MYREAL *) mycalloc (1, sizeof (MYREAL) * (data->loci + 1));
687     for (locus=0;locus < data->loci; locus++)
688       {
689 	data->locusweight[locus]=1.0;
690       }
691 }
692 
print_utf_warning(void)693 void print_utf_warning(void)
694 {
695   warning("Tried and failed to read a file that is not in ASCII TEXT format\n");
696   warning("This is most commonly UTF-8 or UTF-16 formatted\n");
697   warning("Try to convert the file into plain ASCII, enclodings like\n");
698   warning("WESTERN (MAC ROMAN) or WESTERN (WINDOWS LATIN 1) or WESTERN ISO LATIN\n");
699   usererror("Program aborts now\n");
700 }
701 
check_ascii(FILE * infile)702 void check_ascii(FILE *infile)
703 {
704   wint_t wch, wch2;
705   wch = fgetwc(infile);
706   if (wch > 127)
707     {
708       print_utf_warning();
709     }
710   else
711     {
712       wch2 = fgetwc(infile);
713       if(wch==0 && wch2>0)
714 	{
715 	  print_utf_warning();
716 	}
717       else
718 	{
719 	  if (wch>0 && wch2==0)
720 	    {
721 	      print_utf_warning();
722 	    }
723 	}
724       rewind(infile);
725     }
726 }
727 ///
728 /// read the first line of the data file
729 /// \param infile datafilename
730 /// \param data   data structure that holds all the data
731 /// \param options structure that contain all option information
732 /// \reval none
read_header(FILE * infile,data_fmt * data,option_fmt * options)733 void read_header (FILE * infile, data_fmt * data, option_fmt * options)
734 {
735     char input[LINESIZE], *p;
736     char title[LINESIZE];
737     strcpy(title,"\0");
738     //disabled for now: check_ascii(infile);
739     input[0] = '#';
740     while(input[0]=='#')
741       {
742 	FGETS (input, sizeof (input), infile);
743 #ifdef DEBUG
744 	printf("@@>%s<@@\n",input);
745 #endif
746 	if ((p = (char *) strpbrk (input, CRLF)) != NULL)
747 	  *p = '\0';
748       }
749     switch (lowercase (input[0]))
750     {
751     case 'a':
752         sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
753                 &(data->loci), title);
754         find_delimiter (title, &data->dlm);
755 	if(!(title[0] == '\0'))
756 	  strcpy(options->title,title);
757         break;
758     case 'b':
759     case 'm':
760         sscanf (input, "%1s%ld%ld%1s%[^\n]", &options->datatype,
761                 &(data->numpop), &(data->loci), &data->dlm, title);
762 	if(!(title[0] == '\0'))
763 	  strcpy(options->title,title);
764         break;
765     case 's':
766     case 'n':
767     case 'h': //hapmap data
768     case 'u':
769     case 'f':
770         sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
771                 &(data->loci), title);
772 	if(!(title[0] == '\0'))
773 	  strcpy(options->title,title);
774         break;
775     case 'g':   /* fall through if a menu change forces to analyze data
776                                instead of using the already sampled genealogies */
777         if (options->datatype == 'g')
778             break;
779         else
780             memmove (input, input + 1, (strlen (input) - 1) * sizeof (char));
781     default:
782       if(input[0]== '<')
783 	{
784 	  usererror ("This data file may contain an XML or HTML tag,\nand cannot be read properly, check the data formatting section in the manual!");
785 	  exit(-1);
786 	}
787         switch (options->datatype)
788         {
789         case 'a':
790             sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
791                     title);
792             find_delimiter (title, &data->dlm);
793 
794 	if(!(title[0] == '\0'))
795 	  strcpy(options->title,title);
796             break;
797         case 'b':
798         case 'm':
799             sscanf (input, "%ld%ld%1s%[^\n]", &(data->numpop), &(data->loci),
800                     &(data->dlm), title);
801 	    if(!(title[0] == '\0'))
802 	      strcpy(options->title,title);
803             break;
804         case 's':
805         case 'n':
806 	case 'h': // hapmap data
807         case 'u':
808         case 'f':
809             sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
810                     title);
811 	    if(!(title[0] == '\0'))
812 	      strcpy(options->title,title);
813             break;
814         default:
815             usererror ("Datatype is wrong, please use a valid data type!");
816         }
817     }
818     options->datatype = lowercase (options->datatype);
819 }
820 
821 void
find_delimiter(char * title,char * dlm)822 find_delimiter (char *title, char *dlm)
823 {
824     char *p = title;
825     long z = 0;
826     while (*p == ' ')
827     {
828         p++;
829         z++;
830     }
831     if (isalnum (*p))
832         memmove (title, p, sizeof (char) * (strlen (title) - z));
833     else
834     {
835         *dlm = *p;
836         p++;
837         while (*p == ' ')
838         {
839             p++;
840             z++;
841         }
842         memmove (title, p, sizeof (char) * (strlen (title) - z));
843     }
844 }
845 
set_datatype(char input,long locus,char ** locitypes,long * alloc)846 void set_datatype(char input, long locus, char ** locitypes, long *alloc)
847 {
848   if(locus >= *alloc)
849     {
850       *alloc += SUBLOCICHUNKS;
851       *locitypes = (char *) myrealloc(*locitypes, sizeof(char) * (*alloc));
852     }
853   (*locitypes)[locus] = input;
854 }
855 
856 //==================================================================
857 /// read the number of sites for each locus in the dataset,
858 /// does not assume a fixed line length, but assumes that at the end of the line is either
859 /// a \n or \r or \r\l (similar to the sequence reader) to accommodate windows, mac and
860 /// unix line ends.
861 /// The generalized sites reader can also read link status using a parenthesis notation,
862 /// and allows to give short names to the
863 /// loci, example: (ldh=s234 agdh=n3) gpi=s100  str1=b1 (str2=m1 a1)
864 /// the labels a, s, n etc are the same used for the datatype and will mark each locus
865 /// for the datatype=h it is assumed that all loci are the same and of type n
866 /// plain numbers are considered to be sequence loci with no name
867 void
read_sites(data_fmt * data,world_fmt * world,option_fmt * options)868 read_sites (data_fmt * data, world_fmt *world, option_fmt *options)
869 {
870   char * val=NULL;
871   boolean linked=TRUE;
872   long allocbufsize=LINESIZE;
873     long locus;
874     char *input;
875     char * word;
876     long oldi;
877     long i;
878     long len;
879     long z=0; //z is adding subloci and is reset with a closing ')'
880     long zz=0; //zz is used for mutationmodels and is not reset at closing ')'
881     input = (char *) mycalloc(allocbufsize ,sizeof(char));
882     word = (char *) mycalloc(allocbufsize ,sizeof(char));
883     // check whether the microsats are not repeats but fragmentlength
884     data->has_repeats = TRUE;
885     for (locus = 0; locus < data->loci; locus++)
886     {
887       read_word(data->infile, input);
888       switch(input[0])
889 	{
890 	case  '#':
891 	  if(input[1]=='@' || input[1]=='M')
892 	    {
893 	      if (input[1]=='M')
894 		{
895 		  warning("You should use #@M if you want to treat the input as fragment lengths in your infile\n");
896 		}
897 	      // line is microsatellite instruction
898 	      if(input[2]=='M' || input[1]=='M')
899 		{
900 		  FGETS2(&input,&allocbufsize,data->infile); // read the line
901 		  locus--;
902 		  oldi=0;
903 		  // long i;
904 		  len=0;
905 		  data->has_repeats = FALSE;
906 		  for(i = 0 ; i < data->loci; i++)
907 		    {
908 		      len += read_word_delim(input+len,word," ;,\t");
909 		      if(word[0]!='\0')
910 			{
911 			  data->repeatlength[i] = atol(word);
912 			  oldi=i;
913 			}
914 		      else
915 			{
916 			  data->repeatlength[i] = data->repeatlength[oldi];
917 			}
918 		    }
919 		}
920 	      else
921 		{
922 		  FGETS2(&input,&allocbufsize,data->infile); // read the line
923 		  locus--;
924 		}
925 	      myfree(input);
926 	      myfree(word);
927 	      return;
928 	    }
929 	  else
930 	    {
931 	      // line is a comment
932 	      FGETS2(&input,&allocbufsize,data->infile); // read the line
933 	      locus--;
934 	      continue;
935 	    }
936 	  break;
937 
938 	  case '(':
939 	    // unlinked loci are all on one line!
940 	    // this is a change from the old format, dna is now treated the same way as other loci
941 	    // I am not sure whether this really works, but would make things simpler,
942 	    // still unclear how to deal with haplotype versus diplotype information
943 	    data->oneliner=TRUE;
944 	    linked=TRUE;
945 	    if(z>=data->numsublocialloc)
946 	      {
947 		data->numsublocialloc += SUBLOCICHUNKS;
948 		data->subloci = (long *) myrealloc(data->subloci, data->numsublocialloc * sizeof(long));
949 	      }
950 	    data->subloci[z] += 1; //we opened a ( and therefore start a linkage group
951 	    if(input[1]!='\0' && isalpha(input[1]))
952 	      {
953 		read_word_delim(input,word,"=");
954 		if(input[0]!='\0')
955 		  {
956 		    data->locusname[z] = (char *) mycalloc(strlen(word),sizeof(char));
957 		    strcpy(data->locusname[locus],word+1);
958 		    // what is this good for?
959 		    if(!strcmp(word,input))
960 		      val=strchr((char *) "bmsnuh", input[0]);
961 		    else
962 		      val=strchr((char *) "bmsnuh", input[1]);
963 
964 		    if(val==0)
965 		      {
966                         set_datatype((input[0]=='(' ? input[1] : input[0]),locus, &data->locitypes, &data->numlocitypesalloc);
967 			zz++;
968 		      }
969 		    else
970 		      {
971 			// no datatype specified in the model therewe assume there is a unique data type specified in the parmfile or menu
972                         set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
973 		      }
974 		  }
975 		else
976 		  {
977 		    // no datatype specified in the model therefore we assume there is a unique data type specified in the parmfile or menu
978                     set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
979 		  }
980 	      }
981 	    else
982 	      {
983 		if(strlen(input)==1 || input[1]==' ')
984 		  {
985 		    locus--;
986 		    continue;
987 		  }
988 	      }
989 	    break;
990 	  case ')':
991 	    z=0; // reset subloci for the next locus
992 	    break;
993 	  default:
994 	    if (!strchr (SEQUENCETYPES, options->datatype))
995 	      {
996 		unread_word(data->infile, input);
997 		myfree(input);
998 		myfree(word);
999 		return;
1000 	      }
1001 	    if(isalpha(input[0]))
1002 	      {
1003 		read_word_delim(input,word,"=");
1004 		data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1005 		strcpy(data->locusname[locus],word);
1006 	      }
1007 	  }
1008 	if(linked)
1009 	  data->seq[0]->links[locus] = TRUE;
1010 	else
1011 	  data->seq[0]->links[locus] = FALSE;
1012 	if(input[0]=='(')
1013 	  {
1014 	    if(strchr("bmsnuh",input[1]))
1015 	      {
1016 		val = input+2;
1017 	      }
1018 	    else
1019 	      {
1020 		val = input+1;
1021 	      }
1022 	  }
1023 	else
1024 	  {
1025 	    if(strchr("bmsnuh",input[0]))
1026 	      {
1027 		val = input+1;
1028 	      }
1029 	    else
1030 	      {
1031 		val = input;
1032 	      }
1033 	  }
1034         data->seq[0]->sites[locus] = atoi (val);
1035 	val = NULL;
1036 	if(strchr("n",options->datatype))
1037 	  {
1038 	    if (options->allelenmlength < data->seq[0]->sites[locus])
1039 	      options->allelenmlength =  data->seq[0]->sites[locus];
1040 	  }
1041         if (data->seq[0]->sites[locus] == 0)
1042         {
1043 	  warning ("This does look like sequence data\n");
1044 	  warning ("I just read a number of sites=0\n");
1045 	  warning ("If you use the wrong data type, the program\n");
1046 	  usererror ("will crash anyway, so I stop now\n");
1047         }
1048     }
1049     FGETS2(&input,&allocbufsize,data->infile);
1050     while(input[0]=='#')
1051       FGETS2(&input,&allocbufsize,data->infile);
1052     myfree(input);
1053     myfree(word);
1054 }
1055 
1056 //old version
1057 void
read_old_sites(data_fmt * data)1058 read_old_sites (data_fmt * data)
1059 {
1060   char * val=NULL;
1061   boolean linked=TRUE;
1062     long locus;
1063     char *input;
1064     char * word;
1065 
1066     input = (char *) mycalloc(LONGLINESIZE ,sizeof(char));
1067     word = (char *) mycalloc(LONGLINESIZE ,sizeof(char));
1068 
1069     for (locus = 0; locus < data->loci-1; locus++)
1070     {
1071         read_word(data->infile, input);
1072 	switch(input[0])
1073 	  {
1074 	  case  '#':
1075 	    FGETS(input,LINESIZE,data->infile);
1076 	    locus--;
1077 	    continue;
1078 	    break;
1079 
1080 	  case '(':
1081 	    // unlinked loci are all on one line!
1082 	    // this is a change from the old format, dna is now treated the same way as other loci
1083 	    // I am not sure whether this really works, but would make things simpler,
1084 	    // still unclear how to deal with haplotype versus diplotype information
1085 	    data->oneliner=TRUE;
1086 	    if(input[1]!='\0' && isalpha(input[1]))
1087 	      {
1088 		read_word_delim(input,word,"=");
1089 		if(input[0]!='\0')
1090 		  {
1091 		    data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1092 		    strcpy(data->locusname[locus],word+1);
1093 		    if(!strcmp(word,input))
1094 		      val=strchr((char *) "bmsnuh", input[0]);
1095 		    else
1096 		      val=strchr((char *) "bmsnuh", input[1]);
1097 
1098 
1099                     if(val==0)
1100                       {
1101                         set_datatype((input[0]=='(' ? input[1] : input[0]),locus, &data->locitypes, &data->numlocitypesalloc);
1102                         //zz++;
1103                       }
1104                     else
1105                       {
1106                         // no datatype specified in the model therefore we assume there is a
1107                         // unique data type specified in the parmfile or menu
1108                         set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
1109                       }
1110 		  }
1111 		else
1112 		  {
1113                     // no datatype specified in the model therefore we assume there is a unique data type specified in the parmfile or menu
1114                     set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
1115 
1116 		  }
1117 	      }
1118 	    else
1119 	      {
1120 		if(strlen(input)==1 || input[1]==' ')
1121 		  {
1122 		    locus--;
1123 		    continue;
1124 		  }
1125 	      }
1126 	    break;
1127 	  case '|':
1128 	    if(linked==FALSE)
1129 	      linked=TRUE;
1130 	    else
1131 	      linked=FALSE;
1132 	    break;
1133 	  default:
1134 	    if(isalpha(input[0]))
1135 	      {
1136 		read_word_delim(input,word,"=");
1137 		data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1138 		strcpy(data->locusname[locus],word);
1139 	      }
1140 	  }
1141 	if(linked)
1142 	  data->seq[0]->links[locus] = TRUE;
1143 	else
1144 	  data->seq[0]->links[locus] = FALSE;
1145 	if(input[0]=='(')
1146 	  {
1147 	    if(strchr("bmsnuh",input[1]))
1148 	      {
1149 		val = input+2;
1150 	      }
1151 	    else
1152 	      {
1153 		val = input+1;
1154 	      }
1155 	  }
1156 	else
1157 	  {
1158 	    if(strchr("bmsnuh",input[0]))
1159 	      {
1160 		val = input+1;
1161 	      }
1162 	    else
1163 	      {
1164 		val = input;
1165 	      }
1166 	  }
1167         data->seq[0]->sites[locus] = atoi (val);
1168 	val = NULL;
1169         if (data->seq[0]->sites[locus] == 0)
1170         {
1171 	  warning ("This does look like sequence data\n");
1172 	  warning ("I just read a number of sites=0\n");
1173 	  warning ("If you use the wrong data type, the program\n");
1174 	  usererror ("will crash anyway, so I stop now\n");
1175         }
1176     }
1177     FGETS(input,LINESIZE,data->infile);
1178     while(input[0]=='#')
1179       FGETS(input,LINESIZE,data->infile);
1180     if(linked)
1181       data->seq[0]->links[locus] = TRUE;
1182     else
1183       data->seq[0]->links[locus] = FALSE;
1184     if(strchr("bmsnuh",input[0]))
1185       {
1186 	val = input+1;
1187       }
1188     else
1189       {
1190 	val = input;
1191 	while(!strchr("123456789",*val))
1192 	      val++;
1193       }
1194     data->seq[0]->sites[locus] = atoi (val);
1195 
1196     myfree(input);
1197     myfree(word);
1198 }
1199 
1200 
1201 void
read_popheader(FILE * infile,data_fmt * data,option_fmt * options,long pop,long genomes)1202 read_popheader (FILE * infile, data_fmt * data, option_fmt * options, long pop, long genomes)
1203 {
1204     boolean havepopname = FALSE;
1205     long    minlength   = 0;
1206     long    lo;
1207     long    locus;
1208     char   *input;
1209 
1210     input = (char *) mycalloc(LINESIZE,sizeof(char));
1211 
1212     // allows that sequence data can have different numbers of individuals for different loci
1213     // data syntax changes: #ind1 #ind2 #IND3 .... pop_name
1214     havepopname=FALSE;
1215     // with multiple loci we need to separate the nubers of individuals per locus
1216     // from the population title, and also take care in case the number of
1217     // individuals is not specified for all loci.
1218     if(data->loci>1)
1219     {
1220         read_word(data->infile, input);
1221 	while(input[0]=='#')
1222 	  {
1223 	    FGETS(input,LINESIZE,data->infile);//read rest of line and discard
1224 	    read_word(data->infile, input);    //read first word on next line
1225 	  }
1226         data->numind[pop][0] = atol(input);//set first numind value, this must be always present
1227 	data->numalleles[pop][0] = data->numind[pop][0] * genomes;
1228         for(locus=1; locus < data->loci; locus++)
1229         {
1230 	  read_word(infile, input);// if we encounter a # we treat the rest of the line as comment
1231 	    while(input[0]=='#')
1232 	      {
1233 		FGETS(input,LINESIZE,data->infile);
1234 		read_word(data->infile, input);
1235 	      }
1236             if(isdigit(input[0]) && havepopname == FALSE )
1237 	      {
1238                 data->numind[pop][locus] = atol(input);
1239 		data->numalleles[pop][locus] = data->numind[pop][locus] * genomes;
1240 	      }
1241             else
1242 	      {
1243 		// encountered a letter and assume this is the population name
1244                 unread_word(infile, input);
1245                 FGETS(input,LINESIZE,infile);
1246                 havepopname=TRUE;
1247 		if(input!=NULL)
1248 		  {
1249 		    minlength = strlen(input);
1250 		    minlength = MIN(minlength,80);
1251 		    sprintf(data->popnames[pop],"%-*.*s",(int) minlength,(int) minlength,input);
1252 		  }
1253                 break;//this leaves the numind empty of not specified, the must be filled in later
1254 	      }
1255         }
1256         if(!havepopname)
1257 	  {
1258             read_word(infile, input);
1259 	    if(input!=NULL)
1260 	      {
1261                 unread_word(infile, input);
1262                 FGETS(input,LINESIZE,infile);
1263 		minlength = strlen(input);
1264 		minlength = MIN(minlength,80);
1265 		sprintf(data->popnames[pop],"%-*s", (int) minlength, input);
1266 	      }
1267 	  }
1268 
1269         // fills numind for additional locus in case the numind was not specified
1270         for(lo=locus; lo < data->loci; lo++)
1271 	  {
1272             data->numind[pop][lo] = data->numind[pop][locus-1];
1273 	    data->numalleles[pop][lo] = data->numind[pop][lo] * genomes;
1274 	  }
1275     }
1276     else
1277       {
1278         // only one locus so we can use old scheme [see below]
1279         FGETS(input,LINESIZE,infile);
1280 	while(input[0]=='#')
1281 	  {
1282 	    FGETS(input,LINESIZE,data->infile);
1283 	  }
1284         sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]), data->popnames[pop]);
1285 	data->numalleles[pop][0] = data->numind[pop][0] * genomes;
1286       }
1287 
1288     translate (data->popnames[pop], ' ', '_');
1289     translate (data->popnames[pop], '\t', '_');
1290     unpad(data->popnames[pop],"_");
1291     myfree(input);
1292 }
1293 
1294 
1295 void
read_indname(FILE * file,data_fmt * data,long pop,long ind,long locus,long nmlength)1296 read_indname (FILE * file, data_fmt * data, long pop, long ind, long locus, long nmlength)
1297 {
1298     long i = 0;
1299     char ch;
1300     char input[LINESIZE];
1301     while (i < nmlength)
1302     {
1303         ch = getc (file);
1304 	while(ch =='#')
1305 	  {
1306 	    FGETS(input,LINESIZE,data->infile);
1307 	    ch = getc (file);
1308 	  }
1309         if(!strchr("\r\n",ch))
1310             data->indnames[pop][ind][locus][i++] = ch;
1311         if(strchr("\t",ch))
1312             break;
1313     }
1314     data->indnames[pop][ind][locus][nmlength] = '\0';
1315 }
1316 
1317 void
read_popdata(FILE * infile,data_fmt * data,long pop,option_fmt * options)1318 read_popdata (FILE * infile, data_fmt * data, long pop, option_fmt * options)
1319 {
1320     long ind, baseread = 0;
1321     long locus = 0;
1322     char c;
1323     if(options->datatype=='h')
1324       {
1325 	//check whether this is genotypeformat or not
1326 	if ((c=getc(infile))!='r')
1327 	  {
1328 	    ungetc(c,infile);
1329 	    read_hapmap(infile, data, options, pop);
1330 	  }
1331 	else
1332 	  read_hapmap_genotypes(infile, data, options, pop);
1333       }
1334     else
1335       {
1336 	for (ind = 0; ind < data->numind[pop][0]; ind++)
1337 	  {
1338 	    read_indname (infile, data, pop, ind, locus, options->nmlength);
1339 	    switch (options->datatype)
1340 	      {
1341 	      case 'a':
1342 	      case 'b':
1343 	      case 'm':
1344 		if (data->dlm == '\0')
1345 		  read_alleles (infile, data, pop, ind);
1346 		else
1347 		  read_microalleles (infile, data, options, pop, ind);
1348 		break;
1349 	      case 's':
1350 	      case 'n':
1351 	      case 'u':
1352 	      case 'f':
1353 		if(data->oneliner)
1354 		  {
1355 		    baseread = read_ind_seq_oneliner (infile, data, options, pop, ind, 0);
1356 		  }
1357 		else
1358 		  {
1359 		    baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
1360 		  }
1361 		break;
1362 	      default:
1363 		usererror
1364 		  ("Wrong datatype, only the types a, m, s, n\n       (electrophoretic alleles, \n       microsatellite data,\n       sequence data,\n       SNP polymorphism)\n        are allowed.\n");
1365 		break;
1366 	      }
1367 	  }
1368 	if (!strchr (SEQUENCETYPES, options->datatype))
1369 	  return;
1370 	else
1371 	  {
1372 	    if(!data->oneliner)
1373 	      finish_read_seq (infile, data, options, pop, baseread);
1374 	  }
1375       }
1376 }
1377 
check_list(long la1,long nla1,long locus,data_fmt * data)1378 long check_list(long la1, long nla1, long locus, data_fmt *data)
1379 {
1380   if(la1 > data->numrepeatnumbers[locus])
1381     {
1382       data->repeatnumbers[locus] = (long *) realloc(data->repeatnumbers[locus], sizeof(long) * (la1+1));
1383       memset(data->repeatnumbers[locus]+data->numrepeatnumbers[locus], 0, sizeof(long)*(la1+1-data->numrepeatnumbers[locus]));
1384       data->numrepeatnumbers[locus]= la1+1;
1385       data->repeatnumbers[locus][la1]=nla1;
1386     }
1387   else
1388     {
1389       if(data->repeatnumbers[locus][la1]!=0)
1390 	return data->repeatnumbers[locus][la1];
1391     }
1392   return la1;
1393 }
1394 
len2repeat(char * a1,long rlen,long locus,data_fmt * data)1395 void len2repeat(char *a1, long rlen, long locus, data_fmt *data)
1396 {
1397   long la1 = (long) atol(a1);
1398   long a=0;
1399   if((a=la1 % rlen) != 0)
1400     {
1401       //      long correction = (((((float) rlen) / 2.) < a) ? (-a) : (rlen/2 == a ? (UNIF_RANDUM()<0.5 ? -a : a) : (rlen - a)));
1402       long correction = (((((float) rlen) / 2.) < a) ? (-a) : (rlen/2 == a ? (UNIF_RANDUM()<0.5 ? -a : a) : (rlen - a)));
1403       //long nla = check_list(la1,la1+correction, locus, data);
1404       long nla = (la1+correction)/rlen;
1405       sprintf(a1,"%li",nla);
1406     }
1407   else
1408     {
1409       sprintf(a1,"%li",la1/rlen);
1410     }
1411 }
1412 
1413 
1414 void
read_microalleles(FILE * infile,data_fmt * data,option_fmt * options,long pop,long ind)1415 read_microalleles (FILE * infile, data_fmt * data, option_fmt *options, long pop, long ind)
1416 {
1417     char *input, *isave, dlm[2], ddlm[2], *p, *a, *a1, *a2;
1418     long locus, i;
1419     input = (char *) mycalloc (1, sizeof (char) * (SUPERLINESIZE + 1));
1420     isave = input;
1421     a = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1422     a1 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1423     a2 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1424     dlm[0] = data->dlm, dlm[1] = '\0';
1425     ddlm[0] = ' ', ddlm[1] = '\0';
1426     FGETS (input, SUPERLINESIZE, infile);
1427     if ((p = (char *) strpbrk (input, CRLF)) != NULL)
1428         *p = '\0';
1429     for (locus = 0; locus < data->loci; locus++)
1430     {
1431         while (isspace ((int) *input))
1432             input++;
1433         if (input[0] == '\0')
1434             FGETS (input, SUPERLINESIZE, infile);
1435         i = 0;
1436         while (strchr(" \t",input[i])==NULL && input[i] != dlm[0])
1437         {
1438             a1[i] = input[i];
1439             i++;
1440         }
1441         a1[i] = '\0';
1442         input += i;
1443         i = 0;
1444         if (input[i] == dlm[0])
1445         {
1446             input++;
1447             while (strchr(" \t",input[i])==NULL && input[i] != '\0')
1448             {
1449                 a2[i] = input[i];
1450                 i++;
1451             }
1452             a2[i] = '\0';
1453             if (a2[0] == '\0')
1454             {
1455                 strcpy (a2, a1);
1456             }
1457             input += i;
1458         }
1459         else
1460         {
1461             strcpy (a2, a1);
1462         }
1463         sprintf (data->yy[pop][ind][locus][0], "%-.*s",(int) options->allelenmlength,a1);
1464         sprintf (data->yy[pop][ind][locus][1], "%-.*s",(int) options->allelenmlength,a2);
1465     }
1466     myfree(a);
1467     myfree(a1);
1468     myfree(a2);
1469     myfree(isave);
1470 }
1471 
1472 void
read_alleles(FILE * infile,data_fmt * data,long pop,long ind)1473 read_alleles (FILE * infile, data_fmt * data, long pop, long ind)
1474 {
1475     char *input, *isave, *p, *a;
1476     long locus;
1477     a = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1478 
1479     input = (char *) mycalloc (1, sizeof (char) * SUPERLINESIZE);
1480     isave = input;
1481     FGETS (input, SUPERLINESIZE, infile);
1482     if ((p = (char *) strpbrk (input, CRLF)) != NULL)
1483         *p = '\0';
1484     for (locus = 0; locus < data->loci; locus++)
1485     {
1486         while (isspace ((int) *input))
1487         {
1488             input++;
1489         }
1490         if (sscanf (input, "%s", a) == 1)
1491         {
1492             input += (long) strlen (a);
1493         }
1494 
1495         data->yy[pop][ind][locus][0][0] = a[0];
1496         data->yy[pop][ind][locus][0][1] = '\0';
1497         if (a[1] == '\0')
1498         {
1499             data->yy[pop][ind][locus][1][0] = a[0];
1500             data->yy[pop][ind][locus][1][1] = '\0';
1501         }
1502         else
1503         {
1504             data->yy[pop][ind][locus][1][0] = a[1];
1505             data->yy[pop][ind][locus][1][1] = '\0';
1506         }
1507     }
1508     myfree(a);
1509     myfree(isave);
1510 }
1511 
1512 ///
1513 /// reads the standard hapmap genotype data file format using this header
1514 /// rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode individual# ....
1515 void
read_hapmap_genotypes(FILE * infile,data_fmt * data,option_fmt * options,long pop)1516 read_hapmap_genotypes (FILE * infile, data_fmt * data, option_fmt * options, long pop)
1517 {
1518   int error;
1519   long ind;
1520     char label1;
1521     char label2;
1522     long label1count;
1523     long label2count;
1524     long label12count;
1525     char *input;
1526     long locus;
1527     long genomes = number_genomes(options->datatype);
1528     input = (char *) mycalloc(LONGLINESIZE,sizeof(char));
1529     // read header
1530     FGETS(input,LONGLINESIZE,infile);
1531     for(locus=0;locus<data->loci;locus++)
1532       {
1533 	//rs# or comment
1534 	read_word(infile,input);
1535 	if(input[0]=='#')
1536 	  {
1537 	    fprintf(stdout,"%s\n",input);
1538 	    FGETS(input,LONGLINESIZE,infile);
1539 	    locus--;
1540 	    continue;
1541 	  }
1542 	// discard rs#
1543 	// read allele and record
1544 	read_word(infile,input);
1545 	label1 = input[0];// format is like this G/C
1546 	label2 = input[2];
1547 	// read chromosome and discard
1548 	read_word(infile,input);
1549 	// read position
1550 	read_word(infile,input);
1551 	data->position[locus] = atol(input);
1552 	if(options->printdata)
1553 	  {
1554 	    fprintf(stdout,"%20li|",data->position[locus]);
1555 	  }
1556         // read strand must be positive but not checked
1557 	read_word(infile,input);
1558 	// read assembly# and discard
1559 	read_word(infile,input);
1560 	// read center and discard
1561 	read_word(infile,input);
1562 	// read protLSID and discard
1563 	read_word(infile,input);
1564 	// read assyLSID and discard
1565 	read_word(infile,input);
1566 	// read panelLSID and discard
1567 	read_word(infile,input);
1568 	// read QCODE and use as dummy
1569 	read_word(infile,input);
1570  	data->seq[0]->sites[locus]=1;
1571 	label1count=0;
1572 	label2count=0;
1573 	read_word(infile,input);
1574 	while(strstr(input,"rs")==NULL && !isdigit(input[0]))
1575 	  {
1576 	    if (label1 == input[0])
1577 	      label1count++;
1578 	    else
1579 	      label2count++;
1580 	    if (label2 == input[1])
1581 	      label2count++;
1582 	    else
1583 	      label1count++;
1584 	    error = read_word(infile,input);
1585 	    if (error==EOF)
1586 	      break;
1587 	  }
1588 	unread_word(infile,input);
1589 	label12count = label1count + label2count;
1590 	if(options->printdata)
1591 	  {
1592 	    fprintf(stdout, " freq[%c]=%f freq[%c]=%f\n",label1,
1593 		    (float) label1count/label12count,label2, (float) label2count/label12count);
1594 	  }
1595 	//was messing with the reading, use fraction data->numind[pop][locus] = label12count;
1596 	if(options->randomsubset > 0 && options->randomsubset < data->numind[pop][locus])
1597 	  {
1598 	    data->numalleles[pop][locus] = options->randomsubset * genomes;
1599 	    data->numind[pop][locus] = options->randomsubset / genomes;
1600 	  }
1601 	else
1602 	  {
1603 	    data->numalleles[pop][locus] = data->numind[pop][locus] = label12count;
1604 	  //	  data->numalleles[pop][locus] = data->numind[pop][locus] * genomes;
1605 	  }
1606 	label1count = (long) floor( (((double) label1count/label12count * data->numalleles[pop][locus]))+0.5 );
1607 	for(ind=0;ind < label1count; ind++)
1608 	  {
1609 	    data->yy[pop][ind][locus][0][0] = label1;
1610 	  }
1611 	for(ind=label1count;ind < data->numalleles[pop][locus]; ind++)
1612 	  {
1613 	    data->yy[pop][ind][locus][0][0] = label2;
1614 	  }
1615       }
1616     myfree(input);
1617 }
1618 
1619 ///
1620 /// my own hapmap format derived from the frequency data
1621 void
read_hapmap(FILE * infile,data_fmt * data,option_fmt * options,long pop)1622 read_hapmap (FILE * infile, data_fmt * data, option_fmt * options,  long pop)
1623 {
1624   long ind;
1625     char label1;
1626     char label2;
1627     long label1count;
1628     long label2count;
1629     long label12count;
1630     char *input;
1631     long locus;
1632     long genomes = number_genomes(options->datatype);
1633     input = (char *) mycalloc(LINESIZE,sizeof(char));
1634     for(locus=0;locus<data->loci;locus++)
1635       {
1636 	read_word(infile,input);
1637 	if(input[0]=='#')
1638 	  {
1639 	    fprintf(stdout,"%s\n",input);
1640 	    FGETS(input,LINESIZE,infile);
1641 	    locus--;
1642 	    continue;
1643 	  }
1644 	data->position[locus] = atol(input);
1645 	if(options->printdata)
1646 	  {
1647 	    fprintf(stdout,"%20li|",data->position[locus]);
1648 	  }
1649 	data->seq[0]->sites[locus]=1;
1650 	read_word(infile,input);
1651 	label1 = input[0];;
1652 	read_word(infile,input);
1653 	label1count = atol(input);
1654 
1655 	read_word(infile,input);
1656 	label2 = input[0];
1657 	read_word(infile,input);
1658 	label2count = atol(input);
1659 	read_word(infile,input);
1660 	label12count = atol(input);
1661 	if(options->printdata)
1662 	  {
1663 	    fprintf(stdout, " freq[%c]=%f freq[%c]=%f\n",label1,
1664 		    (float) label1count/label12count,label2, (float) label2count/label12count);
1665 	  }
1666 	//was messing with the reading, use fraction data->numind[pop][locus] = label12count;
1667 	if(options->randomsubset > 0 && options->randomsubset < data->numind[pop][locus])
1668 	  {
1669 	    data->numalleles[pop][locus] = options->randomsubset * genomes;
1670 	    data->numind[pop][locus] = options->randomsubset / genomes;
1671 	  }
1672 	else
1673 	  {
1674 	    data->numalleles[pop][locus] = label12count;
1675 	    data->numind[pop][locus] = label12count / genomes;
1676 	  }
1677 	// data->numind[pop][locus] * genomes;
1678 	label1count = (long) floor( (((double) label1count/label12count * data->numalleles[pop][locus]))+0.5 );
1679 	for(ind=0;ind < label1count; ind++)
1680 	  {
1681 	    data->yy[pop][ind][locus][0][0] = label1;
1682 	  }
1683 	for(ind=label1count;ind < data->numalleles[pop][locus]; ind++)
1684 	  {
1685 	    data->yy[pop][ind][locus][0][0] = label2;
1686 	  }
1687       }
1688     myfree(input);
1689 }
1690 
1691 long
read_ind_seq(FILE * infile,data_fmt * data,option_fmt * options,long locus,long pop,long ind,long baseread)1692 read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
1693               long locus, long pop, long ind, long baseread)
1694 {
1695     long j;
1696     char charstate;
1697     j = (options->interleaved) ? baseread : 0;
1698     charstate = getc (infile);
1699     ungetc ((int) charstate, infile);
1700     if(options->printdata)
1701       {
1702     	fprintf(stdout,"%s|",data->indnames[pop][ind][locus]);
1703       }
1704     while (j < data->seq[0]->sites[locus]
1705             && !(options->interleaved && strchr(CRLF,charstate)))
1706     {
1707         charstate = getc (infile);
1708         if (strchr(CRLF,charstate))
1709         {
1710 #ifdef INTERLEAVED
1711             if (options->interleaved)
1712             {
1713                 while(strchr(CRLF,charstate=getc(infile)))
1714                     ;
1715                 ungetc ((int) charstate, infile);
1716                 return j;
1717             }
1718             else
1719 #endif
1720                 charstate = ' ';
1721         }
1722         if (charstate == ' '
1723                 || (charstate >= '0' && charstate <= '9') || charstate == '\\')
1724             continue;
1725         charstate = uppercase (charstate);
1726 	if(options->printdata)
1727 	{
1728 	  fprintf(stdout, "%c",charstate);
1729 	}
1730         if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
1731         {
1732             printf
1733             ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
1734              charstate, j, ind+1, pop+1);
1735             printf
1736             ("Last complete sequences was in population %li, individual %li and locus %li:\n%s",
1737              pop + 1, ind, locus+1, data->indnames[pop][ind - 1][locus]);
1738             for (j = 0; j < data->seq[0]->sites[locus]; j++)
1739                 printf ("%c", data->yy[pop][ind - 1][locus][0][j]);
1740             exit (EXIT_FAILURE);
1741         }
1742         data->yy[pop][ind][locus][0][j++] = charstate;
1743     }
1744     charstate = getc (infile); /* swallow the \n or \r */
1745     while(charstate == '\r' || charstate == '\n' || charstate == '\t' || charstate == ' ' || charstate == ';')
1746       {
1747 	charstate = getc(infile);
1748       }
1749     if(charstate!='\n')
1750       ungetc((int) charstate, infile);
1751 
1752     if(options->printdata)
1753      {
1754        fprintf(stdout,"\n");
1755      }
1756     return j;
1757 }
1758 
1759 ///
1760 /// reads sequence data that comes where all loci are (1) on one line and (2) are not interleaved
1761 long
read_ind_seq_oneliner(FILE * infile,data_fmt * data,option_fmt * options,long pop,long ind,long baseread)1762 read_ind_seq_oneliner (FILE * infile, data_fmt * data, option_fmt * options,
1763               long pop, long ind, long baseread)
1764 {
1765     long j = 0;
1766     long locus=0;
1767     char charstate;
1768     charstate = getc (infile);
1769     ungetc ((int) charstate, infile);
1770     if(options->printdata)
1771       {
1772     	fprintf(stdout,"%s",data->indnames[pop][ind][locus]);
1773       }
1774     for(locus = 0; locus < data->loci; locus++)
1775       {
1776 	j=0;
1777 	if(options->printdata)
1778 	  {
1779 	    fprintf(stdout,"|");
1780 	  }
1781 	while (j < data->seq[0]->sites[locus])
1782 	  {
1783 	    charstate = getc (infile);
1784 	    if (strchr(CRLF,charstate))
1785 	      {
1786 		charstate = ' ';
1787 	      }
1788 	    if (charstate == ' '
1789                 || (charstate >= '0' && charstate <= '9') || charstate == '\\')
1790             continue;
1791 	    charstate = uppercase (charstate);
1792 	    if(options->printdata)
1793 	      {
1794 		fprintf(stdout, "%c",charstate);
1795 	      }
1796 	    if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
1797 	      {
1798 		printf
1799 		  ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
1800 		   charstate, j, ind+1, pop+1);
1801 		printf
1802 		  ("Last complete sequences was in population %li, individual %li and locus %li:\n%s",
1803 		   pop + 1, ind, locus+1, data->indnames[pop][ind - 1][locus]);
1804 		for (j = 0; j < data->seq[0]->sites[locus]; j++)
1805 		  printf ("%c", data->yy[pop][ind - 1][locus][0][j]);
1806 		exit (EXIT_FAILURE);
1807 	      }
1808 	    data->yy[pop][ind][locus][0][j++] = charstate;
1809 	  }
1810       }
1811     charstate = getc (infile); /* swallow the \n or \r */
1812     while(charstate == '\n' || charstate == '\t' || charstate == ' ' || charstate == ';')
1813       {
1814 	charstate = getc(infile);
1815       }
1816     if(charstate!='\n')
1817       ungetc((int) charstate, infile);
1818     if (charstate == '\r')
1819     {
1820         charstate = getc (infile); /* swallow the \n */
1821         if(charstate!='\n')
1822             ungetc((int) charstate, infile);
1823     }
1824     if(options->printdata)
1825      {
1826        fprintf(stdout,"\n");
1827      }
1828     return j;
1829 }
1830 
1831 void
read_distance_fromfile(FILE * dfile,long tips,long nmlength,MYREAL ** m)1832 read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m)
1833 {
1834     char input[SUPERLINESIZE];
1835     long i, j;
1836     int retval;
1837     if (dfile != NULL)
1838     {
1839         // assumes that the dfile is in PHYLIP format
1840         // and that all n x n cells are filled.
1841         FGETS (input, LONGLINESIZE, dfile); //reads header line with
1842         for (i = 0; i < tips; i++) // of individuals
1843         {
1844             //reads the population label
1845             FGETS (input, nmlength + 1, dfile);
1846             for (j = 0; j < tips; j++)
1847             {
1848 #ifdef USE_MYREAL_FLOAT
1849                 retval = fscanf (dfile, "%f", &m[i][j]);
1850 #else
1851                 retval = fscanf (dfile, "%lf", &m[i][j]);
1852 #endif
1853 		if((i!=j) && (m[i][j] < EPSILON))
1854 		  {
1855 		    warning("Reading geofile: adjusting dist[%li][%li]=%f to %f\n",i,j,m[i][j],EPSILON);
1856 		    m[i][j] = EPSILON;
1857 		  }
1858             }
1859             // reads the last \n from the
1860             // data matrix
1861             FGETS (input, LONGLINESIZE, dfile);
1862         }
1863     }
1864 }
1865 
1866 #ifdef UEP
1867 // uep function
1868 
1869 void
read_uep_fromfile(FILE * uepfile,long tips,long nmlength,int ** uep,long * uepsites,long datatype)1870 read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
1871                    long *uepsites, long datatype)
1872 {
1873     char input[LINESIZE];
1874     long i, j;
1875     long thistips;
1876     if (uepfile != NULL)
1877     {
1878         // assumes that the uepfile is in PHYLIP format
1879         // and that all n cells are filled.
1880         FGETS (input, LINESIZE, uepfile); //reads header line with
1881         // of individuals and uep sites
1882         sscanf (input, "%li%li", &thistips, uepsites);
1883         if (thistips != tips)
1884             error ("UEP datafile and infile are inconsistent!");
1885         if (strchr (SEQUENCETYPES, datatype))
1886         {
1887             for (i = 0; i < tips; i++)
1888             {
1889                 uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
1890                 FGETS (input, nmlength + 1, uepfile); //reads each line
1891                 for (j = 0; j < *uepsites; ++j)
1892                     retval = fscanf (uepfile, "%i", &uep[i][j]);
1893                 // reads the last \n from the data matrix
1894                 FGETS (input, LINESIZE, uepfile);
1895             }
1896         }
1897         else
1898         {
1899             for (i = 0; i < tips; i++)
1900             {
1901                 uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
1902                 uep[i + tips] = (int *) mycalloc (*uepsites, sizeof (int));
1903                 FGETS (input, nmlength + 1, uepfile); //reads each line
1904                 for (j = 0; j < *uepsites; ++j)
1905                     retval = fscanf (uepfile, "%i", &uep[i][j]);
1906                 // finished reading first allele, no onto the second
1907                 for (j = 0; j < *uepsites; ++j)
1908                     retval = fscanf (uepfile, "%i", &uep[i + tips][j]);
1909                 // reads the last \n from the data matrix
1910                 FGETS (input, LINESIZE, uepfile);
1911             }
1912         }
1913     }
1914 }
1915 #endif
1916 
1917 void
finish_read_seq(FILE * infile,data_fmt * data,option_fmt * options,long pop,long baseread)1918 finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
1919                  long pop, long baseread)
1920 {
1921 
1922     long ind, baseread2 = 0, locus = 0;
1923     if (options->interleaved)
1924     {
1925         while (baseread < data->seq[0]->sites[0])
1926         {
1927             for (ind = 0; ind < data->numind[pop][0]; ind++)
1928             {
1929                 baseread2 =
1930                     read_ind_seq (infile, data, options, locus, pop, ind,
1931                                   baseread);
1932             }
1933             baseread = baseread2;
1934         }
1935     }
1936     for (locus = 1; locus < data->loci; locus++)
1937     {
1938         baseread = 0;
1939         for (ind = 0; ind < data->numind[pop][locus]; ind++)
1940         {
1941 	  read_indname (infile, data, pop, ind, locus, options->nmlength);
1942             baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
1943         }
1944         if (options->interleaved)
1945         {
1946             while (baseread < data->seq[0]->sites[locus])
1947             {
1948                 for (ind = 0; ind < data->numind[pop][locus]; ind++)
1949                 {
1950                     baseread2 =
1951                         read_ind_seq (infile, data, options, locus, pop, ind,
1952                                       baseread);
1953                 }
1954                 baseread = baseread2;
1955             }
1956         }
1957     }
1958 }
1959 
find_missing(data_fmt * data,long pop,long locus)1960 long find_missing(data_fmt *data, long pop, long locus)
1961 {
1962     long missing = 0;
1963     long ind;
1964     for(ind=0; ind < data->numind[pop][locus]; ind++)
1965     {
1966         if(data->yy[pop][ind][locus][0][0]=='?')
1967             missing++;
1968         if(data->yy[pop][ind][locus][1][0]=='?')
1969             missing++;
1970     }
1971     return missing;
1972 }
1973 
1974 
1975 ///
1976 /// Data set was subsampled and used a random sample of size
print_random_subset(FILE * file,data_fmt * data,option_fmt * options)1977 void print_random_subset(FILE * file, data_fmt * data, option_fmt *options)
1978 {
1979   long locus;
1980   long pop;
1981   char *name;
1982   long maxnum;
1983   long count;
1984   long ind;
1985   long length;
1986   long index;
1987   if(options->randomsubset > 0)
1988     {
1989       fprintf (file, "\nData set was subsampled and used a random sample of size: %li\n\n", options->randomsubset);
1990       fprintf (file, "Locus Population Individuals\n");
1991       fprintf (file, "----- ---------- ---------------------------------------------------------------------\n");
1992       name = (char*) mycalloc(options->nmlength+1,sizeof(char));
1993       for (locus=0; locus < data->loci; locus++)
1994 	{
1995 	  fprintf(file,"%5li ",locus+1);
1996 	  for (pop=0; pop < data->numpop; pop++)
1997 	    {
1998 	      fprintf(file, "%-10.10s ", data->popnames[pop]);
1999 	      maxnum = options->randomsubset < data->numind[pop][locus] ? options->randomsubset : data->numind[pop][locus];
2000 	      count = 18;
2001 	      for(ind=0;ind<maxnum;ind++)
2002 		{
2003 		  index = data->shuffled[pop][locus][ind];
2004 
2005 		  if(data->indnames[pop][index][locus][0]=='\0')
2006 		    memcpy(name,data->indnames[pop][index][0],sizeof(char)*options->nmlength);
2007 		  else
2008 		    memcpy(name,data->indnames[pop][index][locus],sizeof(char)*options->nmlength);
2009 		  remove_trailing_blanks(&name);
2010 		  length = strlen(name);
2011 		  if (count+length < LINELENGTH)
2012 		    {
2013 		      fprintf(file,"%s ",name);
2014 		      count += length+1;
2015 		    }
2016 		  else
2017 		    {
2018 		      fprintf(file,"\n");
2019 		      fprintf(file,"                 ");
2020 		      fprintf(file,"%s ",name);
2021 		      count=17+length+1;
2022 		    }
2023 		}
2024 	      if(pop<data->numpop-1)
2025 		fprintf(file,"\n      ");
2026 	      else
2027 		fprintf(file,"\n");
2028 	    }
2029 	}
2030       myfree(name);
2031     }
2032 }
2033 
2034 void
print_data_summary(FILE * file,world_fmt * world,option_fmt * options,data_fmt * data)2035 print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
2036                     data_fmt * data)
2037 {
2038   int maxlength = 24; // length of the the total string , for popnames
2039   int length;
2040   long locus;
2041   long pop;
2042   long numind;
2043   long nummiss;
2044   char dstring[LINESIZE];
2045   long *total;
2046   long *totalmiss;
2047   total = (long *) mycalloc(data->loci,sizeof(long));
2048   totalmiss = (long *) mycalloc(data->loci,sizeof(long));
2049   fprintf (file, "\nSummary of data:\n");
2050   fprintf(file, "Title:%54.54s\n",options->title);
2051 
2052   switch (options->datatype)
2053     {
2054     case 'a':
2055       strcpy (dstring, "Allelic data");
2056       break;
2057     case 'f':
2058     case 's':
2059       strcpy (dstring, "Sequence data");
2060       break;
2061     case 'b':
2062     case 'm':
2063       strcpy (dstring, "Microsatellite data");
2064       break;
2065     case 'n':
2066     case 'u':
2067       strcpy (dstring, "SNP data");
2068       break;
2069     case 'h':
2070       strcpy (dstring, "SNP data (Hapmap data)");
2071       break;
2072     default:
2073       strcpy (dstring, "Unknown data [ERROR]");
2074       break;
2075     }
2076   fprintf (file, "Data file:  %48.48s\n", options->infilename);
2077   fprintf (file, "Datatype:   %48.48s\n", dstring);
2078   if(!data->has_repeats)
2079     {
2080       fprintf (file, "[Fragment length is translated to repeats]\n");
2081     }
2082   else
2083     {
2084       if (dstring[0]=='M')
2085 	fprintf (file, "[Data was used as repeat-length information]\n");
2086     }
2087 
2088   fprintf (file, "Number of loci:                         %20li\n\n",
2089 	   data->loci);
2090   if(options->has_datefile)
2091     {
2092       fprintf (file, "Title:%54.54s\n",options->title);
2093       fprintf (file, "Sample dates:%46.46s\n", options->datefilename);
2094       fprintf (file, "Generations per year:                        %5.5f\n", options->generation_year);
2095       fprintf (file, "Mutationrate per year:  %.10g", options->mutationrate_year[0]);
2096       for(locus=1; locus < options->mutationrate_year_numalloc; locus++)
2097 	{
2098 	  if(locus % 4 == 0)
2099 	    fprintf(file,"                      ");
2100 	  fprintf(file,", %f", options->mutationrate_year[locus]);
2101 	}
2102       fprintf(file,"\n");
2103     }
2104   //
2105   print_random_subset(file,data,options);
2106   if(file!=stdout)
2107     {
2108       pdf_print_random_subset(data,options);
2109     }
2110   for (pop = 0; pop < data->numpop; pop++)
2111     {
2112       length = strlen(data->popnames[pop]);
2113       if(maxlength < length)
2114 	 maxlength = length;
2115     }
2116     if(maxlength > 40)
2117       maxlength = 40;
2118 
2119   if (!strchr (SEQUENCETYPES, options->datatype))
2120     {
2121       fprintf (file,"%-*.*s     Locus   Gene copies    \n",maxlength,maxlength,"Population");
2122       fprintf (file,"%-*.*s             ---------------\n",maxlength, maxlength, " ");
2123       fprintf (file,"%-*.*s             data  (missing)\n",maxlength, maxlength, " ");
2124     }
2125   else
2126     {
2127       fprintf (file,"%-*.*s     Locus   Gene copies    \n",maxlength,maxlength,"Population");
2128     }
2129   fprintf (file,"----%-*.*s------------------------\n",maxlength,maxlength,"------------------------------------------------------------------");
2130   for (pop = 0; pop < data->numpop; pop++)
2131     {
2132         for(locus=0; locus< data->loci; locus++)
2133         {
2134 	  if (!strchr (SEQUENCETYPES, options->datatype))
2135 	    {
2136 	      nummiss = find_missing(data,pop,locus);
2137 	      numind = data->numalleles[pop][locus] - nummiss;
2138 	      fprintf (file, "%3li %-*.*s %5li %6li (%li)\n", options->newpops[pop], maxlength, maxlength,
2139 		       (locus==0 ? data->popnames[pop] : " " ), locus+1 , numind, nummiss);
2140 	    }
2141 	  else
2142 	    {
2143 	      nummiss = 0;
2144 	      numind = data->numind[pop][locus];
2145 	      fprintf (file, "%3li %-*.*s %5li    %6li\n", options->newpops[pop], maxlength, maxlength,
2146 		       (locus==0 ? data->popnames[pop] : " "), locus+1 , numind);
2147 	    }
2148 	  total[locus] += numind;
2149 	  totalmiss[locus] += nummiss;
2150         }
2151     }
2152     if (!strchr (SEQUENCETYPES, options->datatype))
2153       {
2154         for(locus=0; locus< data->loci; locus++)
2155 	  {
2156 	    fprintf (file,"    %-*.*s %5li %6li (%li)\n",maxlength,maxlength,
2157 		     (locus == 0 ? "Total of all populations" : " "), locus+1, total[locus], totalmiss[locus]);
2158 	  }
2159       }
2160     else
2161       {
2162         for(locus=0; locus< data->loci; locus++)
2163 	  {
2164 	    fprintf (file,"    %-*.*s %5li    %6li\n",maxlength,maxlength,
2165 		     (locus == 0 ? "Total of all populations" : " "), locus+1, total[locus]);
2166 	  }
2167       }
2168     fprintf(file,"\n");
2169     myfree(total);
2170     myfree(totalmiss);
2171     fflush (file);
2172 }
2173 
2174 void
print_data(world_fmt * world,option_fmt * options,data_fmt * data)2175 print_data (world_fmt * world, option_fmt * options, data_fmt * data)
2176 {
2177     if (options->printdata)
2178     {
2179         switch (options->datatype)
2180         {
2181         case 'a':
2182 	  //	  if(options->dlm=='\0')
2183           //  print_alleledata (world, data, options);
2184 	  //else
2185 	  print_microdata (world, data, options);
2186 	  break;
2187         case 'b':
2188         case 'm':
2189             print_microdata (world, data, options);
2190             break;
2191         case 's':
2192         case 'n':
2193 	case 'h':
2194         case 'u':
2195         case 'f':
2196             print_seqdata (world, options, data);
2197             break;
2198         }
2199     }
2200 }
2201 
2202 ///
2203 /// calculate allele frequency spectrum and print
2204 /// allele population1 .. populationN All
2205 /// taking into account population labeling and random_subsets
print_spectra(world_fmt * world,option_fmt * options,data_fmt * data)2206 void print_spectra(world_fmt * world, option_fmt * options,data_fmt * data)
2207 {
2208   const double one = 1.0;
2209   const double two = 2.0;
2210   long found;
2211   long locus;
2212   long a;
2213   long pop;
2214   long pop1;
2215   long ind;
2216   MYREAL **total;
2217   MYREAL *grandtotal;
2218   MYREAL allfreq;
2219   MYREAL f;
2220   MYREAL homo = 0.0;
2221   MYREAL *avghet;
2222   MYREAL v;
2223   MYREAL avghet1;
2224   //MYREAL avghetall = 0.0;
2225   long *maxalleles;
2226   long *maxallelepop;
2227   MYREAL ***freq;
2228   char *thisallele;
2229   char *thatallele;
2230   MYREAL fx;
2231   MYREAL general_homo;
2232   FILE *outfile = world->outfile;
2233   long loctotal;
2234   //long t;
2235   boolean second = (strchr(ALLELETYPES,options->datatype)!=NULL);
2236   if (strchr (DNASEQUENCETYPES, options->datatype))
2237     return; /*we do not calculate allele frequencies for DNA sequences*/
2238   // find total number of alleles
2239   maxalleles = (long *) mycalloc(data->loci, sizeof(long));
2240   maxallelepop = (long *) mycalloc(data->numpop, sizeof(long));
2241   avghet = (MYREAL *) mycalloc(data->numpop, sizeof(MYREAL));
2242   for (locus = 0; locus < data->loci; locus++)
2243     {
2244       maxalleles[locus] = findAllele(data,"\0",locus);
2245 #ifdef BEAGLE
2246       world->mutationmodels[world->sublocistarts[locus]].numstates = maxalleles[locus];
2247       world->mutationmodels[world->sublocistarts[locus]].basefreqs = (double *) calloc(world->mutationmodels[world->sublocistarts[locus]].numstates,sizeof(double));
2248 #endif
2249     }
2250 
2251   // create bins for each population and all
2252   grandtotal = (MYREAL *) mycalloc(data->loci, sizeof(MYREAL));
2253   freq = (MYREAL ***) mycalloc(data->numpop, sizeof(MYREAL **));
2254   doublevec2d(&total,data->numpop,data->loci);
2255   for (pop = 0; pop < data->numpop; pop++)
2256     {
2257       freq[pop] = (MYREAL **) mycalloc(data->loci, sizeof(MYREAL *));
2258       for (locus = 0; locus < data->loci; locus++)
2259 	{
2260 	  freq[pop][locus] = (MYREAL *) mycalloc(1+maxalleles[locus], sizeof(MYREAL));
2261 	}
2262     }
2263 
2264   // calculate spectrum
2265   for (pop1 = 0; pop1 < data->numpop; pop1++)
2266     {
2267       //pop = options->newpops[pop1]-1;
2268       for (ind = 0; ind < data->numind[pop1][0]; ind++)
2269         {
2270 	  for (locus = 0; locus < data->loci; locus++)
2271 	    {
2272 	      thisallele = data->yy[pop1][ind][locus][0];
2273 	      found = findAllele(data, thisallele, locus);
2274 	      if(second)
2275 		{
2276 		  thatallele = data->yy[pop1][ind][locus][1];
2277 		  if(!strcmp(thisallele,thatallele))
2278 		    {
2279 		      if(!strchr(thisallele,'?'))
2280 			{
2281 			  freq[pop1][locus][found] += two ;
2282 			  //total[pop][locus] += two ;
2283 			}
2284 		    }
2285 		  else
2286 		    {
2287 		      if(!strchr(thisallele,'?'))
2288 			{
2289 			  freq[pop1][locus][found] += one ;
2290 			  //total[pop][locus] += one ;
2291 			}
2292 		      found = findAllele(data, thatallele, locus);
2293 		      if(!strchr(thatallele,'?'))
2294 			{
2295 			  freq[pop1][locus][found] += one ;
2296 			  //total[pop][locus] += one ;
2297 			}
2298 		    }
2299 		}
2300 	      else
2301 		{
2302 		  if(!strchr(thisallele,'?'))
2303 		    {
2304 		      freq[pop1][locus][found] += one ;
2305 		      //total[pop][locus] += one ;
2306 		    }
2307 		}
2308 	    }
2309 	}
2310     }
2311   // now we calculate the poptotals and popfreqs if present
2312   // this will change freq and total therefore transform the
2313   // location score to population scores that are repeated
2314   for (pop1 = 0; pop1 < data->numpop; pop1++)
2315     {
2316       pop = options->newpops[pop1]-1;
2317       if (pop == pop1)
2318 	continue;
2319       for (locus = 0; locus < data->loci; locus++)
2320 	{
2321 	  //accumulate the pooled populations frequencies
2322 	  for(a=0; a < maxalleles[locus]; a++)
2323 	    {
2324 	      freq[pop][locus][a] += freq[pop1][locus][a];
2325 	    }
2326 	  //replace all locations frequencies with the population freq
2327 	  for(a=0; a < maxalleles[locus]; a++)
2328 	    {
2329 	      freq[pop1][locus][a] = freq[pop][locus][a];
2330 	    }
2331 	}
2332     }
2333   // now we have collected all numbers of alleles for each locus and each location
2334   // we calculate now all the totals for each locality,
2335   for (pop = 0; pop < data->numpop; pop++)
2336     {
2337       for (locus = 0; locus < data->loci; locus++)
2338 	{
2339 	  loctotal = 0;
2340 	  for(a=0; a < maxalleles[locus]; a++)
2341 	    {
2342 	      loctotal += (long) freq[pop][locus][a];
2343 	    }
2344 	  total[pop][locus] = loctotal;
2345 	  grandtotal[locus] += loctotal;
2346 	}
2347     }
2348 
2349   // print in ascii
2350   //fprintf(stdout,"Allele frequency spectra\n");
2351   fprintf(outfile,"Allele frequency spectra\n");
2352   fprintf(outfile,"========================\n\n");
2353   avghet1=0.0;
2354   for (locus = 0; locus < data->loci; locus++)
2355     {
2356       fprintf(outfile,"Locus %li\n", locus + 1);
2357       fprintf(outfile,"Allele      ");
2358       general_homo = 0.0;
2359       for (pop1 = 0; pop1 < data->numpop; pop1++)
2360 	{
2361 	  maxallelepop[pop1]=0;
2362 	  pop = options->newpops[pop1]-1;
2363 	  fprintf(outfile,"Pop%-2li  ",pop+1);
2364 	}
2365       fprintf(outfile,"All\n-----------");
2366       for (pop = 0; pop < data->numpop+1; pop++)
2367 	fprintf(outfile,"-------");
2368       fprintf(outfile, "\n");
2369       for(a=0; a < maxalleles[locus]; a++)
2370 	{
2371 	  allfreq = 0.0;
2372 	  if(strlen(data->allele[locus][a])>8)
2373 	    fprintf(outfile,"%-8.8s...",data->allele[locus][a]);
2374 	  else
2375 	    fprintf(outfile,"%-8.8s   ",data->allele[locus][a]);
2376 	  for (pop1 = 0; pop1 < data->numpop; pop1++)
2377 	    {
2378 	      pop = options->newpops[pop1]-1;
2379 	      if(freq[pop][locus][a]>0.0)
2380 		{
2381 		  maxallelepop[pop1] += 1;
2382 		  //		  maxallelepop[pop] += 1.0/data->numpop;
2383 		  fprintf(outfile," %1.3f ",freq[pop][locus][a]/total[pop][locus]);
2384 		  allfreq += freq[pop][locus][a];
2385 		  //		  allfreq += freq[pop][locus][a];
2386 		}
2387 	      else
2388 		{
2389 		  fprintf(outfile,"   -   ");
2390 		}
2391 	    }
2392 	  fprintf(outfile," %1.3f\n", (MYREAL) allfreq/grandtotal[locus]);
2393 	  fx  =  allfreq/grandtotal[locus];
2394 	  general_homo += fx * fx;
2395 	  //	  fprintf(outfile," %1.3f %f\n", fx*fx,general_homo);
2396 #ifdef BEAGLE
2397 	  // debug not ready yet for subloci
2398 	  world->mutationmodels[world->sublocistarts[locus]].basefreqs[a] = allfreq/data->numpop;
2399 #endif
2400 	}
2401       fprintf(outfile,"Alleles    ");
2402       for (pop1 = 0; pop1 < data->numpop; pop1++)
2403 	{
2404 	  // pop = options->newpops[pop1]-1;
2405 	  fprintf(outfile," %5li ",  maxallelepop[pop1]);
2406 	}
2407       fprintf(outfile," %5li\n", maxalleles[locus]);
2408       fprintf(outfile,"Samplesize ");
2409       for (pop1 = 0; pop1 < data->numpop; pop1++)
2410 	{
2411 	  fprintf(outfile," %5li ",  (long) total[pop1][locus]);
2412 	}
2413       fprintf(outfile," %5li\n", (long) grandtotal[locus]);
2414 
2415       //if(maxalleles[locus]<=1 && strchr(SNPTYPES,options->datatype))
2416       //{
2417       //  data->skiploci[locus] = TRUE;
2418       //}
2419 
2420       fprintf(outfile,"H_exp      ");
2421       for (pop1 = 0; pop1 < data->numpop; pop1++)
2422 	{
2423 	  pop = options->newpops[pop1]-1;
2424 	  homo = 0.0;
2425 	  for(a=0;a<maxalleles[locus];a++)
2426 	    {
2427 	      f = freq[pop][locus][a]/total[pop][locus];
2428 	      homo += f*f;
2429 	    }
2430 	  v = 1.0 - homo;
2431 	  avghet[pop1] += v;
2432 	  fprintf(outfile," %5.3f ",v);
2433 	}
2434       fprintf(outfile," %5.3f\n\n",1.0-general_homo);
2435       avghet1 += 1.0 - general_homo;
2436       //      fprintf(outfile," %5.3f\n\n",avghet1/data->numpop);
2437     }
2438   fprintf(outfile,"Average expected heterozygosity\n");
2439   for (pop1 = 0; pop1 < data->numpop; pop1++)
2440     {
2441       pop = options->newpops[pop1]-1;
2442       fprintf(outfile,"Pop%-2li  ",pop+1);
2443     }
2444   fprintf(outfile,"All\n");
2445   for (pop = 0; pop < data->numpop+1; pop++)
2446     fprintf(outfile,"-------");
2447   fprintf(outfile, "\n");
2448   for (pop1 = 0; pop1 < data->numpop; pop1++)
2449     {
2450       pop = options->newpops[pop1]-1;
2451       fprintf(outfile,"%5.3f  ",avghet[pop1] / data->loci);
2452     }
2453   fprintf(outfile,"%5.3f\n\n",avghet1/data->loci);
2454   // printd in PDF
2455 #ifdef PRETTY
2456   //pdf_print_spectra(data, options, freq, total, grandtotal, avghet, avghet1, maxalleles);
2457 #endif
2458   fflush(outfile);
2459   // cleanup
2460   for (pop = 0; pop < data->numpop; pop++)
2461     {
2462       for (locus = 0; locus < data->loci; locus++)
2463 	{
2464 	  myfree(freq[pop][locus]);
2465 	}
2466       myfree(freq[pop]);
2467     }
2468   //printf("finished with printspectra");
2469   myfree(freq);
2470   myfree(maxalleles);
2471   myfree(maxallelepop);
2472   myfree(avghet);
2473   free_doublevec2d(total);
2474   myfree(grandtotal);
2475 }
2476 
2477 void
print_alleledata(world_fmt * world,data_fmt * data,option_fmt * options)2478 print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options)
2479 {
2480     long i, pop, ind, locus, mult80;
2481     for (pop = 0; pop < data->numpop; pop++)
2482     {
2483         print_header (world->outfile, pop, world, options, data);
2484         for (ind = 0; ind < data->numind[pop][0]; ind++)
2485         {
2486             fprintf (world->outfile, "%-*.*s ", (int) options->nmlength,
2487                      (int) options->nmlength, data->indnames[pop][ind][0]);
2488             mult80 = options->nmlength;
2489             for (locus = 0; locus < data->loci; locus++)
2490             {
2491                 mult80 +=
2492                     1 + (long) (strlen (data->yy[pop][ind][locus][0]));
2493                 if (mult80 >= 80)
2494                 {
2495                     mult80 = 0;
2496                     fprintf (world->outfile, "\n");
2497                     for (i = 0; i < options->nmlength; i++)
2498                         FPRINTF(world->outfile, " ");
2499                 }
2500                 fprintf (world->outfile, " %c.%c",
2501                          data->yy[pop][ind][locus][0][0],
2502                          data->yy[pop][ind][locus][0][1]);
2503             }
2504             fprintf (world->outfile, "\n");
2505         }
2506         fprintf (world->outfile, "\n");
2507     }
2508     fprintf (world->outfile, "\n\n");
2509     fflush (world->outfile);
2510 }
2511 
2512 void
print_microdata(world_fmt * world,data_fmt * data,option_fmt * options)2513 print_microdata (world_fmt * world, data_fmt * data, option_fmt * options)
2514 {
2515     long i, pop, ind, locus, mult80;
2516     for (pop = 0; pop < data->numpop; pop++)
2517     {
2518         print_header (world->outfile, pop, world, options, data);
2519         for (ind = 0; ind < data->numind[pop][0]; ind++)
2520         {
2521             fprintf (world->outfile, "%-*.*s ", (int) options->nmlength,
2522                      (int) options->nmlength, data->indnames[pop][ind][0]);
2523             mult80 = options->nmlength;
2524             for (locus = 0; locus < data->loci; locus++)
2525             {
2526                 mult80 +=
2527                     1 + (long) (strlen (data->yy[pop][ind][locus][0]) +
2528                     strlen (data->yy[pop][ind][locus][1]));
2529                 if (mult80 >= 80)
2530                 {
2531                     mult80 = 0;
2532                     fprintf (world->outfile, "\n");
2533                     for (i = 0; i < options->nmlength; i++)
2534                         FPRINTF(world->outfile, " ");
2535                 }
2536                 fprintf (world->outfile, " %s.%-s",
2537                          data->yy[pop][ind][locus][0],
2538                          data->yy[pop][ind][locus][1]);
2539             }
2540             fprintf (world->outfile, "\n");
2541         }
2542         fprintf (world->outfile, "\n");
2543     }
2544     fprintf (world->outfile, "\n\n");
2545     fflush (world->outfile);
2546 }
2547 
2548 void
print_seqdata(world_fmt * world,option_fmt * options,data_fmt * data)2549 print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data)
2550 {
2551     long pop, locus;
2552     for (pop = 0; pop < data->numpop; pop++)
2553     {
2554         print_header (world->outfile, pop, world, options, data);
2555         for (locus = 0; locus < data->loci; locus++)
2556         {
2557             print_locus_head (locus, world, options, data);
2558             print_seq_pop (locus, pop, world, options, data);
2559         }
2560     }
2561     fflush (world->outfile);
2562 }
2563 
2564 void
print_header(FILE * outfile,long pop,world_fmt * world,option_fmt * options,data_fmt * data)2565 print_header (FILE * outfile, long pop, world_fmt * world,
2566               option_fmt * options, data_fmt * data)
2567 {
2568     long i;
2569     long locus, mult80 = 80;
2570     fprintf (outfile, "\n%-s", data->popnames[pop]);
2571     for (i = 0; i < (long) (80 - (long) strlen (data->popnames[pop])); i++)
2572          fprintf(world->outfile, "-");
2573     fprintf (outfile, "\n\n");
2574     if (!strchr (SEQUENCETYPES, options->datatype))
2575     {
2576         fprintf (outfile, "%-s  ", (data->loci == 1 ? "locus" : "loci "));
2577         for (i = 0; i < (options->nmlength - 6); i++)
2578             fprintf(world->outfile, " ");
2579         for (locus = 0; locus < data->loci; locus++)
2580         {
2581             if (locus * 4 + options->nmlength > mult80)
2582             {
2583                 mult80 += 80;
2584                 fprintf (outfile, "\n");
2585                 for (i = 0; i < options->nmlength; i++)
2586                     fprintf (outfile, " ");
2587             }
2588             fprintf (outfile, "  %2li", locus + 1);
2589         }
2590         fprintf (outfile, "\n%-*.*s\n", (int) options->nmlength, (int) options->nmlength, "indiv.");
2591     }
2592 }
2593 
2594 
findleastsquare(MYREAL * rawdata,long total,long repeatlength,long shift,MYREAL * startvalue)2595 MYREAL findleastsquare(MYREAL *rawdata, long total, long repeatlength, long shift, MYREAL *startvalue)
2596 {
2597   long i;
2598   long j;
2599   long high;
2600   long low;
2601   MYREAL lowvalue=MYREAL_MAX;
2602   MYREAL highvalue=0.;
2603   MYREAL sum = 0.0;
2604   MYREAL value;
2605   MYREAL oldvalue;
2606   for(i=0;i<total;i++)
2607     {
2608       if(rawdata[i] < lowvalue)
2609 	lowvalue = rawdata[i];
2610       if(rawdata[i]> highvalue)
2611 	highvalue = rawdata[i];
2612     }
2613   high = (long) (highvalue+repeatlength+shift);
2614   low  = (long) (lowvalue -repeatlength+shift);
2615   if(low<0){
2616     low = 0;
2617   }
2618   *startvalue = low;
2619   for(i=0;i<total;i++)
2620     {
2621       oldvalue = MYREAL_MAX;
2622       for(j=low ; j < high; j += repeatlength)
2623 	{
2624 	  value = rawdata[i] - j;
2625 	  value *= value;
2626 	  if(value < oldvalue)
2627 	    {
2628 	      oldvalue = value;
2629 	    }
2630 	}
2631       sum += oldvalue;
2632     }
2633   return sum;
2634 }
2635 
find_allele_repeatlength(data_fmt * data,option_fmt * options,long locus)2636 void find_allele_repeatlength(data_fmt *data, option_fmt *options, long locus)
2637 {
2638   long z=0;
2639   long zz=0;
2640   long pop;
2641   long ind;
2642   long r;
2643   MYREAL *rawdata;
2644   char *a1;
2645   char *a2;
2646   long total=z;
2647   MYREAL minLS=MYREAL_MAX;
2648   MYREAL value;
2649   long intval;
2650   MYREAL startvalue= 0.0;
2651   MYREAL keepstartvalue= 0.0;
2652   MYREAL leastsquare;
2653   MYREAL nonfloored;
2654   MYREAL floored;
2655   MYREAL diff;
2656   for (pop = 0; pop < data->numpop; pop++)
2657     {
2658       zz += data->numind[pop][locus];
2659     }
2660   rawdata = (MYREAL *) calloc(2 * zz, sizeof(MYREAL));
2661 
2662   for (pop = 0; pop < data->numpop; pop++)
2663     {
2664       for (ind = 0; ind < data->numind[pop][locus]; ind++)
2665 	{
2666 	  a1 = data->yy[pop][ind][locus][0];
2667 	  a2 = data->yy[pop][ind][locus][1];
2668 	  if (a1[0]!='?')
2669 	    {
2670 	      rawdata[z++] = atof(a1);
2671 	    }
2672 	  if (a2[0]!='?')
2673 	    {
2674 	      rawdata[z++] = atof(a2);
2675 	    }
2676 	}
2677     }
2678 
2679   total=z;
2680   minLS=MYREAL_MAX;
2681   startvalue= 0.0;
2682   keepstartvalue= 0.0;
2683 
2684   for(r=0;r<data->repeatlength[locus];r++)
2685     {
2686       //MYREAL endvalue  = 0.0;
2687       leastsquare=findleastsquare(rawdata,total,data->repeatlength[locus],r,&startvalue);
2688       if(leastsquare < minLS)
2689 	{
2690 	  minLS = leastsquare;
2691 	 // keepr = r;
2692 	  keepstartvalue = startvalue;
2693 	}
2694     }
2695   for (pop = 0; pop < data->numpop; pop++)
2696     {
2697       for (ind = 0; ind < data->numind[pop][locus]; ind++)
2698 	{
2699 	  a1 = data->yy[pop][ind][locus][0];
2700 	  a2 = data->yy[pop][ind][locus][1];
2701 	  if (a1[0]!='?')
2702 	    {
2703 	      value = atof(a1);
2704 	      //Floor[(279.47 - 259)/4 + If[(Random[]) < 1 - FractionalPart[(279.47 - 259)/4 ], 0, 1]], {1000}] // Tally
2705 	      nonfloored = (value - keepstartvalue)/data->repeatlength[locus];
2706 	      floored = floor(nonfloored);
2707 	      diff = nonfloored - floored;
2708 	      intval = MSAT_OFFSET + (long) floored + (RANDUM() < (1.0-diff) ? 0 : 1);
2709 	      sprintf(data->yy[pop][ind][locus][0],"%li",intval);
2710 	    }
2711 	  if (a2[0]!='?')
2712 	    {
2713 	      value = atof(a2);
2714 	      nonfloored = (value - keepstartvalue)/data->repeatlength[locus];
2715 	      floored = floor(nonfloored);
2716 	      diff = nonfloored - floored;
2717 	      intval =  MSAT_OFFSET + (long) floored + (RANDUM() < (1.0-diff) ? 0 : 1);
2718 	      sprintf(data->yy[pop][ind][locus][1],"%li",intval);
2719 	    }
2720 	}
2721     }
2722 }
2723 
2724 
2725 MYREAL
create_alleles(data_fmt * data,option_fmt * options)2726 create_alleles (data_fmt * data, option_fmt *options)
2727 {
2728   long n=0;
2729   MYREAL  mean=0.;
2730   MYREAL  delta=0.;
2731   long locus, pop, ind;
2732   long z;
2733   char *a1;//DEFAULT_ALLELENMLENGTH];
2734   char *a2;//DEFAULT_ALLELENMLENGTH];
2735   boolean second = (strchr(ALLELETYPES,options->datatype)!=NULL);
2736   a1 = (char *) malloc(sizeof(char)*LINESIZE);
2737   a2 = (char *) malloc(sizeof(char)*LINESIZE);
2738   long lena1 = LINESIZE;
2739   long lena2 = LINESIZE;
2740   for (locus = 0; locus < data->loci; locus++)
2741     {
2742       if(data->repeatlength[locus]!=0)
2743 	find_allele_repeatlength(data, options,locus);
2744       z = 0;
2745       for (pop = 0; pop < data->numpop; pop++)
2746         {
2747             for (ind = 0; ind < data->numind[pop][locus]; ind++)
2748             {
2749 
2750 	      char *data1 = data->yy[pop][ind][locus][0];
2751 	      long lendata1 = strlen(data1);
2752 	      if (lena1 < lendata1)
2753 		{
2754 		  a1 = (char *) realloc(a1,sizeof(char)*(lendata1+1));
2755 		  lena1 = lendata1 + 1;
2756 		}
2757 	      strcpy (a1, data1);
2758 	      if(second)
2759 		{
2760 		  char *data2 = data->yy[pop][ind][locus][1];
2761 		  long lendata2 = strlen(data2);
2762 		  if (lena2 < lendata2)
2763 		    {
2764 		      a2 = (char *) realloc(a2,sizeof(char)*(lendata2+1));
2765 		      lena2 = lendata2 + 1;
2766 		    }
2767 		  strcpy (a2, data2);
2768 		  if (!strcmp (a1, a2))
2769 		    {
2770 		      addAllele (data, a1, locus, &z);
2771 		    }
2772 		  else
2773 		    {
2774 		      addAllele (data, a1, locus, &z);
2775 		      addAllele (data, a2, locus, &z);
2776 		    }
2777 		}
2778 	      else
2779 		{
2780 		  addAllele (data, a1, locus, &z);
2781 		}
2782 	    }
2783 	}
2784       if(z<=1)
2785 	{
2786 	  data->skiploci[locus] = TRUE;// this skips loci with _no_ data
2787 	  continue;
2788 	}
2789 
2790       data->maxalleles[locus] = z + 1;
2791       /* + 1: for all the unencountered alleles */
2792       if(options->murates_fromdata)
2793 	{
2794 	  if(options->mu_rates==NULL)
2795 	    {
2796 	      options->mu_rates = (MYREAL * ) mycalloc(data->loci,sizeof(MYREAL));
2797 	      //		printf("%i> data.c: 1557 murate size %li\n",myID,data->loci * sizeof (MYREAL));
2798 	    }
2799 	  options->mu_rates[locus] = z+1;
2800 	  n = n + 1;
2801 	  delta = options->mu_rates[locus] - mean;
2802 	  mean += delta/n;
2803 	}
2804     }
2805   if(options->murates_fromdata)
2806     {
2807       options->muloci = data->loci;
2808       for (locus=0; locus < data->loci; locus++)
2809 	{
2810 	  if(!data->skiploci[locus])
2811 	    options->mu_rates[locus] /= mean;
2812 	}
2813     }
2814   myfree(a1);
2815   myfree(a2);
2816   return mean;
2817 }
2818 
2819 void
addAllele(data_fmt * data,char s[],long locus,long * z)2820 addAllele (data_fmt * data, char s[], long locus, long *z)
2821 {
2822     long found = 0;
2823     if(!strcmp("?",s))
2824       return;
2825     while ((data->allele[locus][found++][0] != '\0')
2826             && (strcmp (s, data->allele[locus][found - 1])))
2827         ;
2828     if (found > (*z))
2829     {
2830         strcpy (data->allele[locus][*z], s);
2831         (*z)++;
2832     }
2833 }
2834 
2835 void
set_numind(data_fmt * data)2836 set_numind (data_fmt * data)
2837 {
2838     long locus, pop;
2839     for (locus = 1; locus < data->loci; locus++)
2840     {
2841         for (pop = 0; pop < data->numpop; pop++)
2842         {
2843             data->numind[pop][locus] = data->numind[pop][0];
2844             data->numalleles[pop][locus] = data->numalleles[pop][0];
2845         }
2846     }
2847 }
2848 
2849 
2850 void
print_seq_pop(long locus,long pop,world_fmt * world,option_fmt * options,data_fmt * data)2851 print_seq_pop (long locus, long pop, world_fmt * world, option_fmt * options,
2852                data_fmt * data)
2853 {
2854     long ind;
2855     for (ind = 0; ind < data->numalleles[pop][locus]; ind++)
2856     {
2857         print_seq_ind (locus, pop, ind, world, options, data);
2858     }
2859 }
2860 
2861 void
print_seq_ind(long locus,long pop,long ind,world_fmt * world,option_fmt * options,data_fmt * data)2862 print_seq_ind (long locus, long pop, long ind, world_fmt * world,
2863                option_fmt * options, data_fmt * data)
2864 {
2865     long site;
2866     char blank[2] = " ";
2867     fprintf (world->outfile, "%-*.*s", (int) options->nmlength,
2868              (int) options->nmlength, data->indnames[pop][ind][0]);
2869     fprintf (world->outfile, " %c", data->yy[pop][ind][locus][0][0]);
2870     for (site = 1; site < data->seq[0]->sites[locus]; site++)
2871     {
2872         if ((site) % 60 == 0)
2873         {
2874             fprintf (world->outfile, "\n%-*.*s %c", (int) options->nmlength,
2875                      (int) options->nmlength, blank,
2876                      data->yy[pop][ind][locus][0][site]);
2877         }
2878         else
2879         {
2880             if ((site) % 10 == 0)
2881             {
2882                 fprintf (world->outfile, " ");
2883             }
2884             fprintf (world->outfile, "%c", data->yy[pop][ind][locus][0][site]);
2885         }
2886     }
2887     fprintf (world->outfile, "\n");
2888 }
2889 
2890 
2891 void
print_locus_head(long locus,world_fmt * world,option_fmt * options,data_fmt * data)2892 print_locus_head (long locus, world_fmt * world, option_fmt * options,
2893                   data_fmt * data)
2894 {
2895     char *head;
2896     head = (char *) mycalloc (1, sizeof (char) * MAX (10, options->nmlength));
2897     sprintf (head, "Locus %li", locus);
2898     fprintf (world->outfile, "%-*.*s --------10 --------20 --------30",
2899              (int) options->nmlength, (int) options->nmlength, head);
2900     fprintf (world->outfile, " --------40 --------50 --------60\n");
2901 
2902     myfree(head);
2903 }
2904 
2905 void
read_geofile(data_fmt * data,option_fmt * options,long numpop)2906 read_geofile (data_fmt * data, option_fmt * options, long numpop)
2907 {
2908     long i, j, pop;
2909     long numpop2 = numpop * numpop;
2910     data->geo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2911     data->lgeo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2912     if (!options->geo)
2913     {
2914         for (i = 0; i < numpop2; i++)
2915             data->geo[i] = 1.0;
2916     }
2917     else
2918     {
2919         data->ogeo = (MYREAL **) mycalloc (1, sizeof (MYREAL *) * numpop);
2920         data->ogeo[0] = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2921         for (pop = 1; pop < numpop; pop++)
2922             data->ogeo[pop] = data->ogeo[0] + numpop * pop;
2923         read_distance_fromfile (data->geofile, numpop, options->nmlength,
2924                                 data->ogeo);
2925         for (i = 0; i < numpop; i++)
2926         {
2927             for (j = 0; j < numpop; j++)
2928             {
2929                 if(i!=j)
2930                 {
2931                     data->geo[mm2m (i, j, numpop)] =   1. / data->ogeo[i][j];
2932                     data->lgeo[mm2m (i, j, numpop)] =  data->ogeo[i][j] > 0.0 ?
2933                                                        LOG (1. / data->ogeo[i][j]) : -MYREAL_MAX;
2934                 }
2935             }
2936         }
2937     }
2938 }
2939 
2940 ///
2941 /// read the file with the tip dates and returns the oldest date
2942 MYREAL
read_date_fromfile(FILE * datefile,data_fmt * data,option_fmt * options,long nmlength)2943 read_date_fromfile (FILE * datefile, data_fmt *data, option_fmt *options, long nmlength)
2944 {
2945     char input[LINESIZE];
2946     long pop;
2947     long locus;
2948     long l;
2949     long ind;
2950     MYREAL oldest = 0. ;
2951     MYREAL youngest = DBL_MAX;
2952     MYREAL temp;
2953     char *name;
2954     boolean backward = TRUE;
2955     name = (char *) mycalloc(LINESIZE,sizeof(char));
2956     if (datefile != NULL)
2957     {
2958         FGETS (input, LINESIZE, datefile); //title line
2959 	while(input[0]=='#')
2960 	  FGETS(input,LINESIZE,datefile);
2961 	if(input[0]=='F') //this checks the direction of time
2962 	  backward=FALSE;
2963 	fprintf(stdout,"\n%i> Tip dates from file %s\n----------------------------------------\n", myID, options->datefilename);
2964 	fprintf(stdout,"Generations per year:            %f\n", options->generation_year);
2965 	for(pop=0;pop<data->numpop;pop++)
2966 	  {
2967 	    FGETS (input, LINESIZE, datefile); //first populations
2968 	    while(input[0]=='#')
2969 	      FGETS(input,LINESIZE,datefile);
2970 	    for (locus=0; locus < data->loci; locus++)
2971 	      {
2972 		l = (locus >= options->mutationrate_year_numalloc ? options->mutationrate_year_numalloc -1 : locus);
2973 		fprintf(stdout,"Mutation rate of Locus %li: %g\n", locus, options->mutationrate_year[l]);
2974 		for(ind=0; ind < data->numind[pop][locus]; ind++)
2975 		  {
2976 		    FGETS (input, LINESIZE, datefile);
2977 		    while(input[0]=='#')
2978 		      FGETS(input,LINESIZE,datefile);
2979 #ifdef USE_MYREAL_FLOAT
2980 		    sscanf (input, "%s",name);
2981 		    sscanf (input+nmlength, "%f",&temp);
2982 #else
2983 		    sscanf (input, "%s",name);
2984 		    sscanf (input+nmlength, "%lf",&temp);
2985 #endif
2986 		    fprintf(stdout,"Locus %li: Tipdate %*.*s %f %f\n", locus, (int) nmlength, (int) nmlength, input, temp, temp * options->mutationrate_year[l] / options->generation_year);
2987 		    data->sampledates[pop][locus][ind].date = temp;
2988 		    data->sampledates[pop][locus][ind].name = (char *) mycalloc(strlen(name)+1,sizeof(char));
2989 		    strcpy(data->sampledates[pop][locus][ind].name, name);
2990 		    unpad(data->sampledates[pop][locus][ind].name," ");
2991 		    translate(data->sampledates[pop][locus][ind].name,' ', '_');
2992 		    if(oldest < temp)
2993 		      oldest = temp;
2994 		    if(youngest > temp)
2995 		      youngest = temp;
2996 		  }
2997 	      }
2998 	  }
2999 	// are the times forward or backward?
3000 	for(pop=0;pop<data->numpop;pop++)
3001 	  {
3002 	    for (locus=0; locus < data->loci; locus++)
3003 	      {
3004 		for(ind=0; ind < data->numind[pop][locus]; ind++)
3005 		  {
3006 		    if(backward)
3007 		      {
3008 			data->sampledates[pop][locus][ind].date =
3009 			  data->sampledates[pop][locus][ind].date - youngest;
3010 		      }
3011 		    else
3012 		      {
3013 			data->sampledates[pop][locus][ind].date =
3014 			  (oldest - data->sampledates[pop][locus][ind].date)
3015 			  + (youngest - oldest);
3016 		      }
3017 		  }
3018 	      }
3019 	  }
3020     }
3021     free(name);
3022     return oldest;
3023 }
3024 
3025 ///
3026 /// read the file with the tip dates
3027 void
read_datefile(data_fmt * data,option_fmt * options,long numpop)3028 read_datefile (data_fmt * data, option_fmt * options, long numpop)
3029 {
3030   long locus, pop;
3031   data->sampledates = (tipdate_fmt ***) mycalloc (data->numpop, sizeof (tipdate_fmt **));
3032   data->sampledates[0] = (tipdate_fmt **) mycalloc (data->numpop * data->loci, sizeof (tipdate_fmt *));
3033   for (pop = 1; pop < data->numpop; pop++)
3034     {
3035       data->sampledates[pop] = data->sampledates[0] + data->loci * pop;
3036     }
3037   for(locus=0;locus<data->loci;locus++)
3038     {
3039       for(pop=0;pop < data->numpop; pop++)
3040 	{
3041 	  data->sampledates[pop][locus] = (tipdate_fmt*) mycalloc(data->numind[pop][locus],sizeof(tipdate_fmt));
3042 	}
3043     }
3044 
3045   if (options->has_datefile)
3046     {
3047       data->maxsampledate = read_date_fromfile (data->datefile, data, options, options->nmlength);
3048       //      printf("%i> in data section maxsampledate=%f\n",myID, data->maxsampledate);
3049     }
3050   else
3051     {
3052       data->maxsampledate=0.0;
3053     }
3054 }
3055 
3056 #ifdef UEP
3057 void
read_uepfile(data_fmt * data,option_fmt * options,long numpop)3058 read_uepfile (data_fmt * data, option_fmt * options, long numpop)
3059 {
3060     long i;
3061     long sumtips = 0;
3062 
3063     if (!options->uep)
3064         return;
3065 
3066     for (i = 0; i < numpop; ++i)
3067         sumtips += data->numind[i][0];   //Assumes that UEP has the same number of individuals as
3068     // locus 1 (Is this OK? most dataset with UEP will have 1 locus?????)
3069     data->uep = (int **) mycalloc (number_genomes (options->datatype) * sumtips,
3070                                  sizeof (int *));
3071     read_uep_fromfile (data->uepfile, sumtips, options->nmlength, data->uep,
3072                        &data->uepsites, options->datatype);
3073 }
3074 
3075 #endif
3076