1 /*------------------------------------------------------
2 Bayesian Maximum likelihood estimation
3 of migration rate and effectice population size
4 using a Metropolis-Hastings Monte Carlo algorithm
5 -------------------------------------------------------
6 D A T A R O U T I N E S
7
8 creates data structures,
9 read data (Electrophoretic loci, sequences, microsats),
10 prints data,
11 destroys data.
12
13 Copyright 1997-2002 Peter Beerli and Joseph Felsenstein
14 Copyright 2003-2008 Peter Beerli
15 Copyright 2009-2012 Peter Beerli and Michal Palczewski
16
17 Permission is hereby granted, free of charge, to any person obtaining a copy
18 of this software and associated documentation files (the "Software"), to deal
19 in the Software without restriction, including without limitation the rights
20 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
21 of the Software, and to permit persons to whom the Software is furnished to do
22 so, subject to the following conditions:
23
24 The above copyright notice and this permission notice shall be included in all copies
25 or substantial portions of the Software.
26
27 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
28 INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
29 PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
30 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
31 CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
32 OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33
34 $Id: data.c 2172 2013-10-30 17:50:08Z beerli $
35
36 -----------------------------------
37 add this to the microsatellite data, this will allow to use
38 a known repeat lengths to read number of sites instead of
39 number of repeats, the datafile will need an additional line
40 perhaps use something like #@ 2 2 2 3 .... on the second line
41 so that other non microsatellite data models
42 can ignore it
43 y = Map[If[(a = Mod[#, rlen]) != 0,
44 If[Random[] < 0.5, # - (rlen + a), # + rlen - a], #] &, x]/
45 rlen - Min[y] + 1
46 -------------------------------------------------------*/
47 /*! \file data.c
48
49 Data manipulation routines
50
51 */
52
53
54 #include <string.h>
55 #include <wchar.h>
56 #ifndef WIN32
57 #include <xlocale.h>
58 #endif
59 #include "migration.h"
60 #include "sighandler.h"
61 #include "tools.h"
62 #include "migrate_mpi.h"
63 #include "data.h"
64 #include "sequence.h"
65 #include "random.h"
66 #ifdef PRETTY
67 #include "pretty.h"
68 #endif
69 extern long number_genomes (int type);
70 extern void jumble (long *s, long n);
71 extern void jumble_ownseed (long *s, long n);
72
73 #ifdef DMALLOC_FUNC_CHECK
74 #include <dmalloc.h>
75 #endif
76 /* prototypes ----------------------------------------- */
77 //void create_data (data_fmt ** data);
78 //void get_data (FILE * infile, data_fmt * data, option_fmt * options);
79 //void print_data (world_fmt * world, option_fmt * options, data_fmt * data);
80 //long find_missing(data_fmt *data, long pop, long locus);
81 //void print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
82 // data_fmt * data);
83 //short findAllele (data_fmt * data, char s[], long locus);
84 //void free_datapart (data_fmt * data, option_fmt * options, long locus);
85 /*private functions */
86 //void init_data_structure1 (data_fmt ** data, option_fmt * options);
87 void read_header (FILE * infile, data_fmt * data, option_fmt * options);
88 void read_sites (data_fmt * data, world_fmt *world, option_fmt *options);
89 //void init_data_structure2 (data_fmt ** data, option_fmt * options, long pop);
90 //void init_data_structure3 (data_fmt * data);
91 void read_popheader (FILE * infile, data_fmt * data, option_fmt * options, long pop, long genomes);
92 void read_indname (FILE * file, data_fmt * data, long pop, long ind,
93 long locus, long nmlength);
94 void read_popdata (FILE * file, data_fmt * data, long pop,
95 option_fmt * options);
96 void read_microalleles (FILE * infile, data_fmt * data, option_fmt *options, long pop, long ind);
97 void read_alleles (FILE * infile, data_fmt * data, long pop, long ind);
98 long read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
99 long locus, long pop, long ind, long baseread);
100 long read_ind_seq_oneliner (FILE * infile, data_fmt * data, option_fmt * options,
101 long pop, long ind, long baseread);
102
103 void read_hapmap (FILE * infile, data_fmt * data, option_fmt * options, long pop);
104 void
105 read_hapmap_genotypes (FILE * infile, data_fmt * data, option_fmt * options, long pop);
106 void read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m);
107 void finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
108 long pop, long baseread);
109 void print_alleledata (world_fmt * world, data_fmt * data,
110 option_fmt * options);
111 void print_microdata (world_fmt * world, data_fmt * data,
112 option_fmt * options);
113 void print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data);
114
115 void print_header (FILE * outfile, long pop, world_fmt * world,
116 option_fmt * options, data_fmt * data);
117 MYREAL create_alleles (data_fmt * data, option_fmt *options);
118 void addAllele (data_fmt * data, char s[], long locus, long *z);
119 void set_numind (data_fmt * data);
120 void print_seq_pop (long locus, long pop, world_fmt * world,
121 option_fmt * options, data_fmt * data);
122 void print_seq_ind (long locus, long pop, long ind, world_fmt * world,
123 option_fmt * options, data_fmt * data);
124 void print_locus_head (long locus, world_fmt * world, option_fmt * options,
125 data_fmt * data);
126 void find_delimiter (char *title, char *dlm);
127 void read_geofile (data_fmt * data, option_fmt * options, long numpop);
128 void read_datefile (data_fmt * data, option_fmt * options, long numpop);
129 void read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
130 long *uepsites, long datatype);
131 void read_uepfile (data_fmt * data, option_fmt * options, long numpop);
132
133
134 /*=====================================================*/
135 /// creates the data structure
136 void
create_data(data_fmt ** data)137 create_data (data_fmt ** data)
138 {
139 (*data) = (data_fmt *) mycalloc (1, sizeof (data_fmt));
140 }
141
142 /*
143 void
144 init_data (data_fmt * data)
145 {
146
147 }
148 */
149
150 ///
151 /// free the data module
152 void
destroy_data(data_fmt * data)153 destroy_data (data_fmt * data)
154 {
155 long ind;
156 long indalloc;
157 long locus;
158 long pop;
159 long loci = data->loci;
160 long numpop = data->numpop;
161
162 // free data from init_data_structure3
163 for (locus = 0; locus < loci; locus++)
164 {
165 myfree(data->allele[locus]);
166 }
167 myfree(data->allele);
168 myfree(data->subloci);
169 myfree(data->maxalleles);
170 myfree(data->skiploci);
171 myfree(data->locusweight);
172
173 // free data from init_data_structure2
174 for(pop=0; pop < numpop ; pop++)
175 {
176 indalloc = -1;
177 for(locus=0; locus < loci; locus++)
178 {
179 if(indalloc < data->numind[pop][locus])
180 indalloc = data->numind[pop][locus];
181 }
182 for (ind = 0; ind < indalloc; ind++)
183 {
184 for(locus=0; locus < loci; locus++)
185 {
186 myfree(data->indnames[pop][ind][locus]);
187 myfree(data->yy[pop][ind][locus]);
188 }
189 myfree(data->indnames[pop][ind]);
190 myfree(data->yy[pop][ind]);
191 }
192 myfree(data->indnames[pop]);
193 myfree(data->yy[pop]);
194 }
195 myfree(data->indnames);
196 myfree(data->yy);
197
198 // data->yy were already freed in free_datapart()
199
200 // free data from init_structure_1
201 myfree(data->popnames[0]);
202 myfree(data->numind[0]);
203 myfree(data->numalleles[0]);
204 myfree(data->popnames);
205 myfree(data->numind);
206 myfree(data->numalleles);
207
208 if(data->position!=NULL)
209 myfree(data->position);
210 myfree(data->geo);
211 myfree(data->lgeo);
212 if(data->ogeo != NULL)
213 {
214 myfree(data->ogeo[0]);
215 myfree(data->ogeo);
216 }
217 // free sampledates
218 for(locus=0;locus<data->loci;locus++)
219 {
220 for(pop=0;pop < data->numpop; pop++)
221 {
222 myfree(data->sampledates[pop][locus]);
223 }
224 }
225 myfree(data->sampledates[0]);
226 myfree(data->sampledates);
227 myfree(data);
228 }
229
230 ///
231 /// shuffles (jumbles) the individuals so that we can take the first y to subsample the dataset
shuffle(long ** shuffled,long n,int shuffle_ON)232 void shuffle(long **shuffled, long n, int shuffle_ON)
233 {
234 long i;
235 for(i=0;i<n;i++)
236 {
237 (*shuffled)[i] = i;
238 }
239 switch(shuffle_ON)
240 {
241 case 0:
242 break;
243 case -1:
244 jumble(*shuffled, n);
245 break;
246 default:
247 jumble_ownseed(*shuffled, n);
248 }
249 }
250
shuffle_data(data_fmt * data,option_fmt * options)251 void shuffle_data(data_fmt *data, option_fmt *options)
252 {
253 long pop;
254 long locus;
255 data->shuffled = (long ***) mycalloc(data->numpop,sizeof(long **));
256 for(pop=0; pop < data->numpop; pop++)
257 {
258 data->shuffled[pop] = (long **) mycalloc(data->loci, sizeof(long *));
259 for(locus=0; locus < data->loci; locus++)
260 {
261 data->shuffled[pop][locus] = (long *) mycalloc(data->numind[pop][locus], sizeof(long));
262 if(options->randomsubset>0 && options->randomsubset < data->numind[pop][locus])
263 {
264 shuffle(&data->shuffled[pop][locus],data->numind[pop][locus],options->randomsubsetseed);
265 }
266 else
267 {
268 shuffle(&data->shuffled[pop][locus],data->numind[pop][locus],0);
269 }
270 }
271 }
272 }
273
274
max_shuffled_individuals(option_fmt * options,data_fmt * data,long pop,long locus)275 long max_shuffled_individuals(option_fmt *options, data_fmt *data, long pop, long locus)
276 {
277 long top;
278 if(options->randomsubset>0 && options->randomsubset < data->numind[pop][locus])
279 {
280 top = options->randomsubset;
281 }
282 else
283 {
284 top = data->numind[pop][locus];
285 }
286 return top;
287 }
288
289
290 //---------------------------------------------------------------
291 // ..number_pop.number_loci.<delimiter>.<title [no numbers to start]
292 // locu_specification {sNumber_of_sites}{n{number of sites}{m1}{a1}
293 // [parentheses () mark link loci
294 // example:
295 // (n3 s120 m1)(m1 m1 m1 a1 s100)(n1)(n2)(m1)
296 // contains 5 linked loci, the first two are compounds
297 // the system is to have for each individual haplotype a set of
298 // loci so that it may look like this
299 // ind11......ACA ATTAGA 13 15 5 3 2 A ATACG A AT 13
300 // ind12......ACA ATTCGA 12 15 6 3 2 A ATACG T CT 5
301 void
get_new_data(FILE * infile,data_fmt * data,option_fmt * options,world_fmt * world)302 get_new_data (FILE * infile, data_fmt * data, option_fmt * options, world_fmt *world)
303 {
304 //long locus;
305 MYREAL mean;
306 long pop;
307 long locus;
308 long genomes=1;
309 boolean saved_option_murate=FALSE;
310 data->hasghost = FALSE;
311 // read how many populations and loci and delimiter and title
312 // if we have the new haplotype input then there should be no delimiter
313 // needed
314 read_header (infile, data, options);
315 init_data_structure1 (&data, options);
316 //
317 // here I need to add the msat_lengths support
318 //
319 // this will set up all the locus structure, but may need some additional
320 // init_data-structure1a()
321 read_sites(data, world, options);
322 // snps etc need something like
323 // data->seq[0]->addon = 4;
324 // hapmap data is peculiar check that
325 // if not sequence the fracchange [for F84] needs to be 1.0
326 // data->seq[0]->fracchange = 1.0;
327
328 if (options->progress)
329 fprintf (stdout, "\n\n");
330 if (options->writelog)
331 fprintf (options->logfile, "\n\n");
332
333 for (pop = 0; pop < data->numpop; pop++)
334 {
335 read_popheader (infile, data, options, pop, genomes);
336 if (options->progress)
337 fprintf (stdout, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
338 if (options->writelog)
339 fprintf (options->logfile, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
340 init_data_structure2 (&data, options, pop);
341 read_popdata (infile, data, pop, options);
342 }
343 read_geofile (data, options, data->numpop);
344 #ifdef UEP
345
346 read_uepfile (data, options, data->numpop);
347 #endif
348 read_datefile(data, options, data->numpop);
349 if (options->progress)
350 fprintf (stdout, "\n\n");
351 init_data_structure3 (data,options);
352 if ((options->onlyvariable || options->has_variableandone) && !options->murates_fromdata)
353 {
354 saved_option_murate = TRUE;
355 options->murates_fromdata = TRUE;
356 }
357 switch (options->datatype)
358 {
359 case 'a':
360 mean = create_alleles (data, options);
361 break;
362 case 'b':
363 mean = create_alleles (data, options);
364 for (pop = 0; pop < data->loci; pop++)
365 data->maxalleles[pop] = XBROWN_SIZE;
366 break;
367 case 'm':
368 mean = create_alleles (data, options);
369 for (pop = 0; pop < data->loci; pop++)
370 data->maxalleles[pop] = options->micro_stepnum;
371 break;
372 case 'h':
373 case 'n':
374 case 'u':
375 mean = create_alleles (data, options);
376 break;
377 default: /*DNA types*/
378 for (pop = 0; pop < data->loci; pop++)
379 data->maxalleles[pop] = 4;
380 }
381
382 shuffle_data(data, options);
383
384 if(options->murates_fromdata)
385 {
386 if (strchr (SEQUENCETYPES, options->datatype))
387 {
388 find_rates_fromdata(data, options, world);
389 }
390 else
391 {
392 find_rates_fromdata_alleles(data, options, world, mean);
393 }
394 }
395 if(saved_option_murate)
396 {
397 for (locus=0; locus < data->loci; locus++)
398 {
399 options->mu_rates[locus] = 1.0;
400 }
401 options->murates_fromdata=FALSE;
402 saved_option_murate = FALSE;
403 }
404 }
405
406 void
get_data(FILE * infile,data_fmt * data,option_fmt * options,world_fmt * world)407 get_data (FILE * infile, data_fmt * data, option_fmt * options, world_fmt *world)
408 {
409 long locus;
410 long pop;
411 long genomes=1;
412 MYREAL mean;
413 boolean saved_option_murate=FALSE;
414 data->hasghost = FALSE;
415 read_header (infile, data, options);
416 genomes = number_genomes (options->datatype);
417 init_data_structure1 (&data, options);
418 data->datatype[0] = options->datatype;
419 switch (options->datatype)
420 {
421 case 's':
422 case 'f':// read standard sequence data
423 read_sites (data,world, options);
424 break;
425 case 'n': // read snp data
426 read_sites (data,world, options);
427 data->seq[0]->addon = 4;
428 break;
429 case 'h': //read data from hapmap allele frequency files
430 // there is no sites line in these files!
431 for(locus=0;locus<data->loci;locus++)
432 data->seq[0]->sites[locus] = 1;
433 data->seq[0]->addon = 4;
434 break;
435 case 'u': // read unlinked snp
436 read_sites (data,world, options);
437 data->seq[0]->addon = 4;
438 break;
439 default:
440 read_sites(data,world, options); // checks whether there is a line for microsat repeat numbers
441 data->seq[0]->fracchange = 1.0;
442 break;
443 }
444 if (options->progress)
445 fprintf (stdout, "\n\n");
446 if (options->writelog)
447 fprintf (options->logfile, "\n\n");
448 for (pop = 0; pop < data->numpop; pop++)
449 {
450 read_popheader (infile, data, options, pop, genomes);
451 if (options->progress)
452 fprintf (stdout, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
453 if (options->writelog)
454 fprintf (options->logfile, "Reading (%li) %s ...\n", pop, data->popnames[pop]);
455 init_data_structure2 (&data, options, pop);
456 read_popdata (infile, data, pop, options);
457 }
458 read_geofile (data, options, data->numpop);
459 #ifdef UEP
460
461 read_uepfile (data, options, data->numpop);
462 #endif
463 read_datefile(data, options, data->numpop);
464 if (options->progress)
465 fprintf (stdout, "\n\n");
466 init_data_structure3 (data,options);
467 if ((options->onlyvariable || options->has_variableandone) && !options->murates_fromdata)
468 {
469 saved_option_murate = TRUE;
470 options->murates_fromdata = TRUE;
471 }
472 switch (options->datatype)
473 {
474 case 'a':
475 mean=create_alleles (data, options);
476 break;
477 case 'b':
478 mean=create_alleles (data, options);
479 for (pop = 0; pop < data->loci; pop++)
480 data->maxalleles[pop] = XBROWN_SIZE;
481 break;
482 case 'm':
483 mean=create_alleles (data, options);
484 for (pop = 0; pop < data->loci; pop++)
485 data->maxalleles[pop] = options->micro_stepnum;
486 break;
487 case 'h':
488 case 'n':
489 case 'u':
490 mean=create_alleles (data, options);
491 break;
492 default: /*DNA types*/
493 for (pop = 0; pop < data->loci; pop++)
494 data->maxalleles[pop] = 4;
495 }
496
497 shuffle_data(data, options);
498
499 if(options->murates_fromdata)
500 {
501 if (strchr (SEQUENCETYPES, options->datatype))
502 {
503 find_rates_fromdata(data, options, world);
504 }
505 else
506 {
507 find_rates_fromdata_alleles(data, options, world, mean);
508 }
509 }
510 if(saved_option_murate)
511 {
512 for (locus=0; locus < data->loci; locus++)
513 {
514 options->mu_rates[locus] = 1.0;
515 }
516 options->murates_fromdata=FALSE;
517 saved_option_murate = FALSE;
518 }
519 }
520
521 /* private functions ========================================== */
522
523 void
init_data_structure1(data_fmt ** data,option_fmt * options)524 init_data_structure1 (data_fmt ** data, option_fmt * options)
525 {
526 long pop;
527 long locus;
528 long numpop = (*data)->numpop;
529 long loci = (*data)->loci;
530
531 (*data)->ogeo = NULL;
532 (*data)->geo = NULL;
533 if ((*data)->yy == NULL)
534 {
535 (*data)->yy = (char *****) mymalloc (sizeof (char ****) * numpop);
536 (*data)->seq = (seqmodel_fmt **) mycalloc (1, sizeof (seqmodel_fmt *));
537 (*data)->seq[0] = (seqmodel_fmt *) mycalloc (1, sizeof (seqmodel_fmt));
538 (*data)->popnames =(char **) mymalloc (sizeof (char *) * numpop);
539 (*data)->popnames[0] =(char *) mycalloc (numpop * STRSIZE,sizeof(char));
540 (*data)->indnames = (char ****) mymalloc (sizeof (char ***) * numpop);
541 (*data)->numind = (long **) mymalloc (sizeof (long *) * numpop);
542 (*data)->numind[0] = (long *) mymalloc (sizeof (long) * numpop * loci);
543 (*data)->numalleles = (long **) mymalloc (sizeof (long *) * numpop);
544 (*data)->numalleles[0] = (long *) mymalloc (sizeof (long) * numpop * loci);
545 (*data)->locitypes = (char *) mycalloc(loci, sizeof(char));
546 (*data)->locusname = (char **) mycalloc(loci, sizeof(char*));
547 for (pop = 1; pop < numpop; pop++)
548 {
549 (*data)->popnames[pop] = (*data)->popnames[0] + pop * STRSIZE;
550 (*data)->numind[pop] = (*data)->numind[0] + pop * loci;
551 (*data)->numalleles[pop] = (*data)->numalleles[0] + pop * loci;
552 }
553 (*data)->seq[0]->sites = (long *) mycalloc (loci, sizeof (long));
554 (*data)->seq[0]->links = (boolean *) mycalloc (loci, sizeof (boolean));
555 (*data)->position = (long *) mycalloc (loci, sizeof (long));
556 (*data)->datatype = (char *) mycalloc(loci, sizeof(char));
557 (*data)->numdatatypealloc = loci;
558 (*data)->numsublocialloc = loci;
559 (*data)->subloci = (long *) mycalloc(loci,sizeof(long));
560 (*data)->numrepeatnumbers = (long *) mycalloc(loci, sizeof(long));
561 (*data)->repeatnumbers = (long **) mycalloc(loci, sizeof(long *));
562 (*data)->repeatlength = (long *) mycalloc(loci, sizeof(long));
563 for(locus=0;locus<loci;locus++)
564 {
565 (*data)->numrepeatnumbers[locus] = 1;
566 (*data)->repeatnumbers[locus] = (long *) mycalloc(1, sizeof(long));
567 }
568 }
569 else
570 {
571 error ("Problem with initialization of data matrix yy\n");
572 }
573 }
574
575
576 void
init_data_structure2(data_fmt ** data,option_fmt * options,long pop)577 init_data_structure2 (data_fmt ** data, option_fmt * options, long pop)
578 {
579 long ind, locus;
580 long indalloc = -1;
581 for(locus=0;locus<(*data)->loci;locus++)
582 {
583 if(indalloc < (*data)->numind[pop][locus])
584 indalloc = (*data)->numind[pop][locus];
585 }
586 if (indalloc == 0)
587 indalloc = 2;
588
589 (*data)->yy[pop] = (char ****) mymalloc (sizeof (char ***) * indalloc);
590 (*data)->indnames[pop] = (char ***) mycalloc (1, sizeof (char **) * indalloc);
591 #ifdef DEBUG
592 printf("init_data_structure2: indalloc=%li\n",indalloc);
593 #endif
594 for (ind = 0; ind < indalloc; ind++)
595 {
596 (*data)->indnames[pop][ind] =
597 (char **) mymalloc (sizeof (char *) * (*data)->loci);
598 (*data)->yy[pop][ind] =
599 (char ***) mymalloc (sizeof (char **) * (*data)->loci);
600 for (locus = 0; locus < (*data)->loci; locus++)
601 {
602 (*data)->indnames[pop][ind][locus] =
603 (char *) mycalloc (1, sizeof (char) * (1 + options->nmlength));
604 if (!strchr (SEQUENCETYPES, options->datatype))
605 {
606 (*data)->yy[pop][ind][locus] =
607 (char **) mycalloc (1, sizeof (char *) * 2);
608 (*data)->yy[pop][ind][locus][0] =
609 (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
610 (*data)->yy[pop][ind][locus][1] =
611 (char *) mycalloc (1, sizeof (char) * (options->allelenmlength+1));
612 }
613 else
614 {
615 (*data)->yy[pop][ind][locus] =
616 (char **) mycalloc (1, sizeof (char *));
617 (*data)->yy[pop][ind][locus][0] =
618 (char *) mycalloc (1,
619 sizeof (char) * ((*data)->seq[0]->sites[locus]+1));
620 #ifdef DEBUG
621 printf("%i> sites=%li+1, pop=%li ind=%li locus=%li\n",myID, ((*data)->seq[0]->sites[locus]+1),pop, ind, locus);
622 #endif
623 }
624 }
625 }
626 }
627
628
629 short
findAllele(data_fmt * data,char s[],long locus)630 findAllele (data_fmt * data, char s[], long locus)
631 {
632 short found = 0;
633 while (data->allele[locus][found]!=NULL && (data->allele[locus][found][0] != '\0') &&(strcmp (s, data->allele[locus][found])))
634 found++;
635 return found;
636 }
637
638 void
free_datapart(data_fmt * data,option_fmt * options,long locus)639 free_datapart (data_fmt * data, option_fmt * options, long locus)
640 {
641 long ind, pop;
642 // long genomes = number_genomes (options->datatype);
643 for (pop = 0; pop < data->numpop; pop++)
644 {
645 for (ind = 0; ind < data->numind[pop][locus]; ind++)
646 {
647 if (!strchr (SEQUENCETYPES, options->datatype))
648 {
649 myfree(data->yy[pop][ind][locus][0]);
650 myfree(data->yy[pop][ind][locus][1]);
651 myfree(data->yy[pop][ind][locus]);
652 }
653 else
654 {
655 myfree(data->yy[pop][ind][locus][0]);
656 myfree(data->yy[pop][ind][locus]);
657 }
658 }
659 }
660 }
661
662
663 void
init_data_structure3(data_fmt * data,option_fmt * options)664 init_data_structure3 (data_fmt * data, option_fmt *options)
665 {
666 long locus, pop, maxi;
667 data->allele =
668 (char ***) mycalloc (1, sizeof (char **) * data->loci);
669 data->subloci = (long *) mycalloc(data->loci,sizeof(long));
670 for (locus = 0; locus < data->loci; locus++)
671 {
672 data->subloci[locus] = 1;
673 maxi = 0;
674 for (pop = 0; pop < data->numpop; pop++)
675 maxi += data->numalleles[pop][locus];
676 data->allele[locus] =
677 (char **) mycalloc (maxi+1, sizeof (char*));
678 for (pop = 0; pop < maxi; pop++)
679 data->allele[locus][pop] =
680 (char *) mycalloc (options->allelenmlength+1, sizeof (char));
681 }
682 data->maxalleles = (long *) mycalloc (1, sizeof (long) * data->loci);
683 data->skiploci =
684 (boolean *) mycalloc (1, sizeof (boolean) * (data->loci + 1));
685 data->locusweight =
686 (MYREAL *) mycalloc (1, sizeof (MYREAL) * (data->loci + 1));
687 for (locus=0;locus < data->loci; locus++)
688 {
689 data->locusweight[locus]=1.0;
690 }
691 }
692
print_utf_warning(void)693 void print_utf_warning(void)
694 {
695 warning("Tried and failed to read a file that is not in ASCII TEXT format\n");
696 warning("This is most commonly UTF-8 or UTF-16 formatted\n");
697 warning("Try to convert the file into plain ASCII, enclodings like\n");
698 warning("WESTERN (MAC ROMAN) or WESTERN (WINDOWS LATIN 1) or WESTERN ISO LATIN\n");
699 usererror("Program aborts now\n");
700 }
701
check_ascii(FILE * infile)702 void check_ascii(FILE *infile)
703 {
704 wint_t wch, wch2;
705 wch = fgetwc(infile);
706 if (wch > 127)
707 {
708 print_utf_warning();
709 }
710 else
711 {
712 wch2 = fgetwc(infile);
713 if(wch==0 && wch2>0)
714 {
715 print_utf_warning();
716 }
717 else
718 {
719 if (wch>0 && wch2==0)
720 {
721 print_utf_warning();
722 }
723 }
724 rewind(infile);
725 }
726 }
727 ///
728 /// read the first line of the data file
729 /// \param infile datafilename
730 /// \param data data structure that holds all the data
731 /// \param options structure that contain all option information
732 /// \reval none
read_header(FILE * infile,data_fmt * data,option_fmt * options)733 void read_header (FILE * infile, data_fmt * data, option_fmt * options)
734 {
735 char input[LINESIZE], *p;
736 char title[LINESIZE];
737 strcpy(title,"\0");
738 //disabled for now: check_ascii(infile);
739 input[0] = '#';
740 while(input[0]=='#')
741 {
742 FGETS (input, sizeof (input), infile);
743 #ifdef DEBUG
744 printf("@@>%s<@@\n",input);
745 #endif
746 if ((p = (char *) strpbrk (input, CRLF)) != NULL)
747 *p = '\0';
748 }
749 switch (lowercase (input[0]))
750 {
751 case 'a':
752 sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
753 &(data->loci), title);
754 find_delimiter (title, &data->dlm);
755 if(!(title[0] == '\0'))
756 strcpy(options->title,title);
757 break;
758 case 'b':
759 case 'm':
760 sscanf (input, "%1s%ld%ld%1s%[^\n]", &options->datatype,
761 &(data->numpop), &(data->loci), &data->dlm, title);
762 if(!(title[0] == '\0'))
763 strcpy(options->title,title);
764 break;
765 case 's':
766 case 'n':
767 case 'h': //hapmap data
768 case 'u':
769 case 'f':
770 sscanf (input, "%1s%ld%ld%[^\n]", &options->datatype, &(data->numpop),
771 &(data->loci), title);
772 if(!(title[0] == '\0'))
773 strcpy(options->title,title);
774 break;
775 case 'g': /* fall through if a menu change forces to analyze data
776 instead of using the already sampled genealogies */
777 if (options->datatype == 'g')
778 break;
779 else
780 memmove (input, input + 1, (strlen (input) - 1) * sizeof (char));
781 default:
782 if(input[0]== '<')
783 {
784 usererror ("This data file may contain an XML or HTML tag,\nand cannot be read properly, check the data formatting section in the manual!");
785 exit(-1);
786 }
787 switch (options->datatype)
788 {
789 case 'a':
790 sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
791 title);
792 find_delimiter (title, &data->dlm);
793
794 if(!(title[0] == '\0'))
795 strcpy(options->title,title);
796 break;
797 case 'b':
798 case 'm':
799 sscanf (input, "%ld%ld%1s%[^\n]", &(data->numpop), &(data->loci),
800 &(data->dlm), title);
801 if(!(title[0] == '\0'))
802 strcpy(options->title,title);
803 break;
804 case 's':
805 case 'n':
806 case 'h': // hapmap data
807 case 'u':
808 case 'f':
809 sscanf (input, "%ld%ld%[^\n]", &(data->numpop), &(data->loci),
810 title);
811 if(!(title[0] == '\0'))
812 strcpy(options->title,title);
813 break;
814 default:
815 usererror ("Datatype is wrong, please use a valid data type!");
816 }
817 }
818 options->datatype = lowercase (options->datatype);
819 }
820
821 void
find_delimiter(char * title,char * dlm)822 find_delimiter (char *title, char *dlm)
823 {
824 char *p = title;
825 long z = 0;
826 while (*p == ' ')
827 {
828 p++;
829 z++;
830 }
831 if (isalnum (*p))
832 memmove (title, p, sizeof (char) * (strlen (title) - z));
833 else
834 {
835 *dlm = *p;
836 p++;
837 while (*p == ' ')
838 {
839 p++;
840 z++;
841 }
842 memmove (title, p, sizeof (char) * (strlen (title) - z));
843 }
844 }
845
set_datatype(char input,long locus,char ** locitypes,long * alloc)846 void set_datatype(char input, long locus, char ** locitypes, long *alloc)
847 {
848 if(locus >= *alloc)
849 {
850 *alloc += SUBLOCICHUNKS;
851 *locitypes = (char *) myrealloc(*locitypes, sizeof(char) * (*alloc));
852 }
853 (*locitypes)[locus] = input;
854 }
855
856 //==================================================================
857 /// read the number of sites for each locus in the dataset,
858 /// does not assume a fixed line length, but assumes that at the end of the line is either
859 /// a \n or \r or \r\l (similar to the sequence reader) to accommodate windows, mac and
860 /// unix line ends.
861 /// The generalized sites reader can also read link status using a parenthesis notation,
862 /// and allows to give short names to the
863 /// loci, example: (ldh=s234 agdh=n3) gpi=s100 str1=b1 (str2=m1 a1)
864 /// the labels a, s, n etc are the same used for the datatype and will mark each locus
865 /// for the datatype=h it is assumed that all loci are the same and of type n
866 /// plain numbers are considered to be sequence loci with no name
867 void
read_sites(data_fmt * data,world_fmt * world,option_fmt * options)868 read_sites (data_fmt * data, world_fmt *world, option_fmt *options)
869 {
870 char * val=NULL;
871 boolean linked=TRUE;
872 long allocbufsize=LINESIZE;
873 long locus;
874 char *input;
875 char * word;
876 long oldi;
877 long i;
878 long len;
879 long z=0; //z is adding subloci and is reset with a closing ')'
880 long zz=0; //zz is used for mutationmodels and is not reset at closing ')'
881 input = (char *) mycalloc(allocbufsize ,sizeof(char));
882 word = (char *) mycalloc(allocbufsize ,sizeof(char));
883 // check whether the microsats are not repeats but fragmentlength
884 data->has_repeats = TRUE;
885 for (locus = 0; locus < data->loci; locus++)
886 {
887 read_word(data->infile, input);
888 switch(input[0])
889 {
890 case '#':
891 if(input[1]=='@' || input[1]=='M')
892 {
893 if (input[1]=='M')
894 {
895 warning("You should use #@M if you want to treat the input as fragment lengths in your infile\n");
896 }
897 // line is microsatellite instruction
898 if(input[2]=='M' || input[1]=='M')
899 {
900 FGETS2(&input,&allocbufsize,data->infile); // read the line
901 locus--;
902 oldi=0;
903 // long i;
904 len=0;
905 data->has_repeats = FALSE;
906 for(i = 0 ; i < data->loci; i++)
907 {
908 len += read_word_delim(input+len,word," ;,\t");
909 if(word[0]!='\0')
910 {
911 data->repeatlength[i] = atol(word);
912 oldi=i;
913 }
914 else
915 {
916 data->repeatlength[i] = data->repeatlength[oldi];
917 }
918 }
919 }
920 else
921 {
922 FGETS2(&input,&allocbufsize,data->infile); // read the line
923 locus--;
924 }
925 myfree(input);
926 myfree(word);
927 return;
928 }
929 else
930 {
931 // line is a comment
932 FGETS2(&input,&allocbufsize,data->infile); // read the line
933 locus--;
934 continue;
935 }
936 break;
937
938 case '(':
939 // unlinked loci are all on one line!
940 // this is a change from the old format, dna is now treated the same way as other loci
941 // I am not sure whether this really works, but would make things simpler,
942 // still unclear how to deal with haplotype versus diplotype information
943 data->oneliner=TRUE;
944 linked=TRUE;
945 if(z>=data->numsublocialloc)
946 {
947 data->numsublocialloc += SUBLOCICHUNKS;
948 data->subloci = (long *) myrealloc(data->subloci, data->numsublocialloc * sizeof(long));
949 }
950 data->subloci[z] += 1; //we opened a ( and therefore start a linkage group
951 if(input[1]!='\0' && isalpha(input[1]))
952 {
953 read_word_delim(input,word,"=");
954 if(input[0]!='\0')
955 {
956 data->locusname[z] = (char *) mycalloc(strlen(word),sizeof(char));
957 strcpy(data->locusname[locus],word+1);
958 // what is this good for?
959 if(!strcmp(word,input))
960 val=strchr((char *) "bmsnuh", input[0]);
961 else
962 val=strchr((char *) "bmsnuh", input[1]);
963
964 if(val==0)
965 {
966 set_datatype((input[0]=='(' ? input[1] : input[0]),locus, &data->locitypes, &data->numlocitypesalloc);
967 zz++;
968 }
969 else
970 {
971 // no datatype specified in the model therewe assume there is a unique data type specified in the parmfile or menu
972 set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
973 }
974 }
975 else
976 {
977 // no datatype specified in the model therefore we assume there is a unique data type specified in the parmfile or menu
978 set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
979 }
980 }
981 else
982 {
983 if(strlen(input)==1 || input[1]==' ')
984 {
985 locus--;
986 continue;
987 }
988 }
989 break;
990 case ')':
991 z=0; // reset subloci for the next locus
992 break;
993 default:
994 if (!strchr (SEQUENCETYPES, options->datatype))
995 {
996 unread_word(data->infile, input);
997 myfree(input);
998 myfree(word);
999 return;
1000 }
1001 if(isalpha(input[0]))
1002 {
1003 read_word_delim(input,word,"=");
1004 data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1005 strcpy(data->locusname[locus],word);
1006 }
1007 }
1008 if(linked)
1009 data->seq[0]->links[locus] = TRUE;
1010 else
1011 data->seq[0]->links[locus] = FALSE;
1012 if(input[0]=='(')
1013 {
1014 if(strchr("bmsnuh",input[1]))
1015 {
1016 val = input+2;
1017 }
1018 else
1019 {
1020 val = input+1;
1021 }
1022 }
1023 else
1024 {
1025 if(strchr("bmsnuh",input[0]))
1026 {
1027 val = input+1;
1028 }
1029 else
1030 {
1031 val = input;
1032 }
1033 }
1034 data->seq[0]->sites[locus] = atoi (val);
1035 val = NULL;
1036 if(strchr("n",options->datatype))
1037 {
1038 if (options->allelenmlength < data->seq[0]->sites[locus])
1039 options->allelenmlength = data->seq[0]->sites[locus];
1040 }
1041 if (data->seq[0]->sites[locus] == 0)
1042 {
1043 warning ("This does look like sequence data\n");
1044 warning ("I just read a number of sites=0\n");
1045 warning ("If you use the wrong data type, the program\n");
1046 usererror ("will crash anyway, so I stop now\n");
1047 }
1048 }
1049 FGETS2(&input,&allocbufsize,data->infile);
1050 while(input[0]=='#')
1051 FGETS2(&input,&allocbufsize,data->infile);
1052 myfree(input);
1053 myfree(word);
1054 }
1055
1056 //old version
1057 void
read_old_sites(data_fmt * data)1058 read_old_sites (data_fmt * data)
1059 {
1060 char * val=NULL;
1061 boolean linked=TRUE;
1062 long locus;
1063 char *input;
1064 char * word;
1065
1066 input = (char *) mycalloc(LONGLINESIZE ,sizeof(char));
1067 word = (char *) mycalloc(LONGLINESIZE ,sizeof(char));
1068
1069 for (locus = 0; locus < data->loci-1; locus++)
1070 {
1071 read_word(data->infile, input);
1072 switch(input[0])
1073 {
1074 case '#':
1075 FGETS(input,LINESIZE,data->infile);
1076 locus--;
1077 continue;
1078 break;
1079
1080 case '(':
1081 // unlinked loci are all on one line!
1082 // this is a change from the old format, dna is now treated the same way as other loci
1083 // I am not sure whether this really works, but would make things simpler,
1084 // still unclear how to deal with haplotype versus diplotype information
1085 data->oneliner=TRUE;
1086 if(input[1]!='\0' && isalpha(input[1]))
1087 {
1088 read_word_delim(input,word,"=");
1089 if(input[0]!='\0')
1090 {
1091 data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1092 strcpy(data->locusname[locus],word+1);
1093 if(!strcmp(word,input))
1094 val=strchr((char *) "bmsnuh", input[0]);
1095 else
1096 val=strchr((char *) "bmsnuh", input[1]);
1097
1098
1099 if(val==0)
1100 {
1101 set_datatype((input[0]=='(' ? input[1] : input[0]),locus, &data->locitypes, &data->numlocitypesalloc);
1102 //zz++;
1103 }
1104 else
1105 {
1106 // no datatype specified in the model therefore we assume there is a
1107 // unique data type specified in the parmfile or menu
1108 set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
1109 }
1110 }
1111 else
1112 {
1113 // no datatype specified in the model therefore we assume there is a unique data type specified in the parmfile or menu
1114 set_datatype(data->datatype[0],locus, &data->locitypes, &data->numlocitypesalloc);
1115
1116 }
1117 }
1118 else
1119 {
1120 if(strlen(input)==1 || input[1]==' ')
1121 {
1122 locus--;
1123 continue;
1124 }
1125 }
1126 break;
1127 case '|':
1128 if(linked==FALSE)
1129 linked=TRUE;
1130 else
1131 linked=FALSE;
1132 break;
1133 default:
1134 if(isalpha(input[0]))
1135 {
1136 read_word_delim(input,word,"=");
1137 data->locusname[locus] = (char *) mycalloc(strlen(word),sizeof(char));
1138 strcpy(data->locusname[locus],word);
1139 }
1140 }
1141 if(linked)
1142 data->seq[0]->links[locus] = TRUE;
1143 else
1144 data->seq[0]->links[locus] = FALSE;
1145 if(input[0]=='(')
1146 {
1147 if(strchr("bmsnuh",input[1]))
1148 {
1149 val = input+2;
1150 }
1151 else
1152 {
1153 val = input+1;
1154 }
1155 }
1156 else
1157 {
1158 if(strchr("bmsnuh",input[0]))
1159 {
1160 val = input+1;
1161 }
1162 else
1163 {
1164 val = input;
1165 }
1166 }
1167 data->seq[0]->sites[locus] = atoi (val);
1168 val = NULL;
1169 if (data->seq[0]->sites[locus] == 0)
1170 {
1171 warning ("This does look like sequence data\n");
1172 warning ("I just read a number of sites=0\n");
1173 warning ("If you use the wrong data type, the program\n");
1174 usererror ("will crash anyway, so I stop now\n");
1175 }
1176 }
1177 FGETS(input,LINESIZE,data->infile);
1178 while(input[0]=='#')
1179 FGETS(input,LINESIZE,data->infile);
1180 if(linked)
1181 data->seq[0]->links[locus] = TRUE;
1182 else
1183 data->seq[0]->links[locus] = FALSE;
1184 if(strchr("bmsnuh",input[0]))
1185 {
1186 val = input+1;
1187 }
1188 else
1189 {
1190 val = input;
1191 while(!strchr("123456789",*val))
1192 val++;
1193 }
1194 data->seq[0]->sites[locus] = atoi (val);
1195
1196 myfree(input);
1197 myfree(word);
1198 }
1199
1200
1201 void
read_popheader(FILE * infile,data_fmt * data,option_fmt * options,long pop,long genomes)1202 read_popheader (FILE * infile, data_fmt * data, option_fmt * options, long pop, long genomes)
1203 {
1204 boolean havepopname = FALSE;
1205 long minlength = 0;
1206 long lo;
1207 long locus;
1208 char *input;
1209
1210 input = (char *) mycalloc(LINESIZE,sizeof(char));
1211
1212 // allows that sequence data can have different numbers of individuals for different loci
1213 // data syntax changes: #ind1 #ind2 #IND3 .... pop_name
1214 havepopname=FALSE;
1215 // with multiple loci we need to separate the nubers of individuals per locus
1216 // from the population title, and also take care in case the number of
1217 // individuals is not specified for all loci.
1218 if(data->loci>1)
1219 {
1220 read_word(data->infile, input);
1221 while(input[0]=='#')
1222 {
1223 FGETS(input,LINESIZE,data->infile);//read rest of line and discard
1224 read_word(data->infile, input); //read first word on next line
1225 }
1226 data->numind[pop][0] = atol(input);//set first numind value, this must be always present
1227 data->numalleles[pop][0] = data->numind[pop][0] * genomes;
1228 for(locus=1; locus < data->loci; locus++)
1229 {
1230 read_word(infile, input);// if we encounter a # we treat the rest of the line as comment
1231 while(input[0]=='#')
1232 {
1233 FGETS(input,LINESIZE,data->infile);
1234 read_word(data->infile, input);
1235 }
1236 if(isdigit(input[0]) && havepopname == FALSE )
1237 {
1238 data->numind[pop][locus] = atol(input);
1239 data->numalleles[pop][locus] = data->numind[pop][locus] * genomes;
1240 }
1241 else
1242 {
1243 // encountered a letter and assume this is the population name
1244 unread_word(infile, input);
1245 FGETS(input,LINESIZE,infile);
1246 havepopname=TRUE;
1247 if(input!=NULL)
1248 {
1249 minlength = strlen(input);
1250 minlength = MIN(minlength,80);
1251 sprintf(data->popnames[pop],"%-*.*s",(int) minlength,(int) minlength,input);
1252 }
1253 break;//this leaves the numind empty of not specified, the must be filled in later
1254 }
1255 }
1256 if(!havepopname)
1257 {
1258 read_word(infile, input);
1259 if(input!=NULL)
1260 {
1261 unread_word(infile, input);
1262 FGETS(input,LINESIZE,infile);
1263 minlength = strlen(input);
1264 minlength = MIN(minlength,80);
1265 sprintf(data->popnames[pop],"%-*s", (int) minlength, input);
1266 }
1267 }
1268
1269 // fills numind for additional locus in case the numind was not specified
1270 for(lo=locus; lo < data->loci; lo++)
1271 {
1272 data->numind[pop][lo] = data->numind[pop][locus-1];
1273 data->numalleles[pop][lo] = data->numind[pop][lo] * genomes;
1274 }
1275 }
1276 else
1277 {
1278 // only one locus so we can use old scheme [see below]
1279 FGETS(input,LINESIZE,infile);
1280 while(input[0]=='#')
1281 {
1282 FGETS(input,LINESIZE,data->infile);
1283 }
1284 sscanf (input, "%ld%[^\n]", &(data->numind[pop][0]), data->popnames[pop]);
1285 data->numalleles[pop][0] = data->numind[pop][0] * genomes;
1286 }
1287
1288 translate (data->popnames[pop], ' ', '_');
1289 translate (data->popnames[pop], '\t', '_');
1290 unpad(data->popnames[pop],"_");
1291 myfree(input);
1292 }
1293
1294
1295 void
read_indname(FILE * file,data_fmt * data,long pop,long ind,long locus,long nmlength)1296 read_indname (FILE * file, data_fmt * data, long pop, long ind, long locus, long nmlength)
1297 {
1298 long i = 0;
1299 char ch;
1300 char input[LINESIZE];
1301 while (i < nmlength)
1302 {
1303 ch = getc (file);
1304 while(ch =='#')
1305 {
1306 FGETS(input,LINESIZE,data->infile);
1307 ch = getc (file);
1308 }
1309 if(!strchr("\r\n",ch))
1310 data->indnames[pop][ind][locus][i++] = ch;
1311 if(strchr("\t",ch))
1312 break;
1313 }
1314 data->indnames[pop][ind][locus][nmlength] = '\0';
1315 }
1316
1317 void
read_popdata(FILE * infile,data_fmt * data,long pop,option_fmt * options)1318 read_popdata (FILE * infile, data_fmt * data, long pop, option_fmt * options)
1319 {
1320 long ind, baseread = 0;
1321 long locus = 0;
1322 char c;
1323 if(options->datatype=='h')
1324 {
1325 //check whether this is genotypeformat or not
1326 if ((c=getc(infile))!='r')
1327 {
1328 ungetc(c,infile);
1329 read_hapmap(infile, data, options, pop);
1330 }
1331 else
1332 read_hapmap_genotypes(infile, data, options, pop);
1333 }
1334 else
1335 {
1336 for (ind = 0; ind < data->numind[pop][0]; ind++)
1337 {
1338 read_indname (infile, data, pop, ind, locus, options->nmlength);
1339 switch (options->datatype)
1340 {
1341 case 'a':
1342 case 'b':
1343 case 'm':
1344 if (data->dlm == '\0')
1345 read_alleles (infile, data, pop, ind);
1346 else
1347 read_microalleles (infile, data, options, pop, ind);
1348 break;
1349 case 's':
1350 case 'n':
1351 case 'u':
1352 case 'f':
1353 if(data->oneliner)
1354 {
1355 baseread = read_ind_seq_oneliner (infile, data, options, pop, ind, 0);
1356 }
1357 else
1358 {
1359 baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
1360 }
1361 break;
1362 default:
1363 usererror
1364 ("Wrong datatype, only the types a, m, s, n\n (electrophoretic alleles, \n microsatellite data,\n sequence data,\n SNP polymorphism)\n are allowed.\n");
1365 break;
1366 }
1367 }
1368 if (!strchr (SEQUENCETYPES, options->datatype))
1369 return;
1370 else
1371 {
1372 if(!data->oneliner)
1373 finish_read_seq (infile, data, options, pop, baseread);
1374 }
1375 }
1376 }
1377
check_list(long la1,long nla1,long locus,data_fmt * data)1378 long check_list(long la1, long nla1, long locus, data_fmt *data)
1379 {
1380 if(la1 > data->numrepeatnumbers[locus])
1381 {
1382 data->repeatnumbers[locus] = (long *) realloc(data->repeatnumbers[locus], sizeof(long) * (la1+1));
1383 memset(data->repeatnumbers[locus]+data->numrepeatnumbers[locus], 0, sizeof(long)*(la1+1-data->numrepeatnumbers[locus]));
1384 data->numrepeatnumbers[locus]= la1+1;
1385 data->repeatnumbers[locus][la1]=nla1;
1386 }
1387 else
1388 {
1389 if(data->repeatnumbers[locus][la1]!=0)
1390 return data->repeatnumbers[locus][la1];
1391 }
1392 return la1;
1393 }
1394
len2repeat(char * a1,long rlen,long locus,data_fmt * data)1395 void len2repeat(char *a1, long rlen, long locus, data_fmt *data)
1396 {
1397 long la1 = (long) atol(a1);
1398 long a=0;
1399 if((a=la1 % rlen) != 0)
1400 {
1401 // long correction = (((((float) rlen) / 2.) < a) ? (-a) : (rlen/2 == a ? (UNIF_RANDUM()<0.5 ? -a : a) : (rlen - a)));
1402 long correction = (((((float) rlen) / 2.) < a) ? (-a) : (rlen/2 == a ? (UNIF_RANDUM()<0.5 ? -a : a) : (rlen - a)));
1403 //long nla = check_list(la1,la1+correction, locus, data);
1404 long nla = (la1+correction)/rlen;
1405 sprintf(a1,"%li",nla);
1406 }
1407 else
1408 {
1409 sprintf(a1,"%li",la1/rlen);
1410 }
1411 }
1412
1413
1414 void
read_microalleles(FILE * infile,data_fmt * data,option_fmt * options,long pop,long ind)1415 read_microalleles (FILE * infile, data_fmt * data, option_fmt *options, long pop, long ind)
1416 {
1417 char *input, *isave, dlm[2], ddlm[2], *p, *a, *a1, *a2;
1418 long locus, i;
1419 input = (char *) mycalloc (1, sizeof (char) * (SUPERLINESIZE + 1));
1420 isave = input;
1421 a = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1422 a1 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1423 a2 = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1424 dlm[0] = data->dlm, dlm[1] = '\0';
1425 ddlm[0] = ' ', ddlm[1] = '\0';
1426 FGETS (input, SUPERLINESIZE, infile);
1427 if ((p = (char *) strpbrk (input, CRLF)) != NULL)
1428 *p = '\0';
1429 for (locus = 0; locus < data->loci; locus++)
1430 {
1431 while (isspace ((int) *input))
1432 input++;
1433 if (input[0] == '\0')
1434 FGETS (input, SUPERLINESIZE, infile);
1435 i = 0;
1436 while (strchr(" \t",input[i])==NULL && input[i] != dlm[0])
1437 {
1438 a1[i] = input[i];
1439 i++;
1440 }
1441 a1[i] = '\0';
1442 input += i;
1443 i = 0;
1444 if (input[i] == dlm[0])
1445 {
1446 input++;
1447 while (strchr(" \t",input[i])==NULL && input[i] != '\0')
1448 {
1449 a2[i] = input[i];
1450 i++;
1451 }
1452 a2[i] = '\0';
1453 if (a2[0] == '\0')
1454 {
1455 strcpy (a2, a1);
1456 }
1457 input += i;
1458 }
1459 else
1460 {
1461 strcpy (a2, a1);
1462 }
1463 sprintf (data->yy[pop][ind][locus][0], "%-.*s",(int) options->allelenmlength,a1);
1464 sprintf (data->yy[pop][ind][locus][1], "%-.*s",(int) options->allelenmlength,a2);
1465 }
1466 myfree(a);
1467 myfree(a1);
1468 myfree(a2);
1469 myfree(isave);
1470 }
1471
1472 void
read_alleles(FILE * infile,data_fmt * data,long pop,long ind)1473 read_alleles (FILE * infile, data_fmt * data, long pop, long ind)
1474 {
1475 char *input, *isave, *p, *a;
1476 long locus;
1477 a = (char *) mycalloc (1, sizeof (char) * LINESIZE);
1478
1479 input = (char *) mycalloc (1, sizeof (char) * SUPERLINESIZE);
1480 isave = input;
1481 FGETS (input, SUPERLINESIZE, infile);
1482 if ((p = (char *) strpbrk (input, CRLF)) != NULL)
1483 *p = '\0';
1484 for (locus = 0; locus < data->loci; locus++)
1485 {
1486 while (isspace ((int) *input))
1487 {
1488 input++;
1489 }
1490 if (sscanf (input, "%s", a) == 1)
1491 {
1492 input += (long) strlen (a);
1493 }
1494
1495 data->yy[pop][ind][locus][0][0] = a[0];
1496 data->yy[pop][ind][locus][0][1] = '\0';
1497 if (a[1] == '\0')
1498 {
1499 data->yy[pop][ind][locus][1][0] = a[0];
1500 data->yy[pop][ind][locus][1][1] = '\0';
1501 }
1502 else
1503 {
1504 data->yy[pop][ind][locus][1][0] = a[1];
1505 data->yy[pop][ind][locus][1][1] = '\0';
1506 }
1507 }
1508 myfree(a);
1509 myfree(isave);
1510 }
1511
1512 ///
1513 /// reads the standard hapmap genotype data file format using this header
1514 /// rs# alleles chrom pos strand assembly# center protLSID assayLSID panelLSID QCcode individual# ....
1515 void
read_hapmap_genotypes(FILE * infile,data_fmt * data,option_fmt * options,long pop)1516 read_hapmap_genotypes (FILE * infile, data_fmt * data, option_fmt * options, long pop)
1517 {
1518 int error;
1519 long ind;
1520 char label1;
1521 char label2;
1522 long label1count;
1523 long label2count;
1524 long label12count;
1525 char *input;
1526 long locus;
1527 long genomes = number_genomes(options->datatype);
1528 input = (char *) mycalloc(LONGLINESIZE,sizeof(char));
1529 // read header
1530 FGETS(input,LONGLINESIZE,infile);
1531 for(locus=0;locus<data->loci;locus++)
1532 {
1533 //rs# or comment
1534 read_word(infile,input);
1535 if(input[0]=='#')
1536 {
1537 fprintf(stdout,"%s\n",input);
1538 FGETS(input,LONGLINESIZE,infile);
1539 locus--;
1540 continue;
1541 }
1542 // discard rs#
1543 // read allele and record
1544 read_word(infile,input);
1545 label1 = input[0];// format is like this G/C
1546 label2 = input[2];
1547 // read chromosome and discard
1548 read_word(infile,input);
1549 // read position
1550 read_word(infile,input);
1551 data->position[locus] = atol(input);
1552 if(options->printdata)
1553 {
1554 fprintf(stdout,"%20li|",data->position[locus]);
1555 }
1556 // read strand must be positive but not checked
1557 read_word(infile,input);
1558 // read assembly# and discard
1559 read_word(infile,input);
1560 // read center and discard
1561 read_word(infile,input);
1562 // read protLSID and discard
1563 read_word(infile,input);
1564 // read assyLSID and discard
1565 read_word(infile,input);
1566 // read panelLSID and discard
1567 read_word(infile,input);
1568 // read QCODE and use as dummy
1569 read_word(infile,input);
1570 data->seq[0]->sites[locus]=1;
1571 label1count=0;
1572 label2count=0;
1573 read_word(infile,input);
1574 while(strstr(input,"rs")==NULL && !isdigit(input[0]))
1575 {
1576 if (label1 == input[0])
1577 label1count++;
1578 else
1579 label2count++;
1580 if (label2 == input[1])
1581 label2count++;
1582 else
1583 label1count++;
1584 error = read_word(infile,input);
1585 if (error==EOF)
1586 break;
1587 }
1588 unread_word(infile,input);
1589 label12count = label1count + label2count;
1590 if(options->printdata)
1591 {
1592 fprintf(stdout, " freq[%c]=%f freq[%c]=%f\n",label1,
1593 (float) label1count/label12count,label2, (float) label2count/label12count);
1594 }
1595 //was messing with the reading, use fraction data->numind[pop][locus] = label12count;
1596 if(options->randomsubset > 0 && options->randomsubset < data->numind[pop][locus])
1597 {
1598 data->numalleles[pop][locus] = options->randomsubset * genomes;
1599 data->numind[pop][locus] = options->randomsubset / genomes;
1600 }
1601 else
1602 {
1603 data->numalleles[pop][locus] = data->numind[pop][locus] = label12count;
1604 // data->numalleles[pop][locus] = data->numind[pop][locus] * genomes;
1605 }
1606 label1count = (long) floor( (((double) label1count/label12count * data->numalleles[pop][locus]))+0.5 );
1607 for(ind=0;ind < label1count; ind++)
1608 {
1609 data->yy[pop][ind][locus][0][0] = label1;
1610 }
1611 for(ind=label1count;ind < data->numalleles[pop][locus]; ind++)
1612 {
1613 data->yy[pop][ind][locus][0][0] = label2;
1614 }
1615 }
1616 myfree(input);
1617 }
1618
1619 ///
1620 /// my own hapmap format derived from the frequency data
1621 void
read_hapmap(FILE * infile,data_fmt * data,option_fmt * options,long pop)1622 read_hapmap (FILE * infile, data_fmt * data, option_fmt * options, long pop)
1623 {
1624 long ind;
1625 char label1;
1626 char label2;
1627 long label1count;
1628 long label2count;
1629 long label12count;
1630 char *input;
1631 long locus;
1632 long genomes = number_genomes(options->datatype);
1633 input = (char *) mycalloc(LINESIZE,sizeof(char));
1634 for(locus=0;locus<data->loci;locus++)
1635 {
1636 read_word(infile,input);
1637 if(input[0]=='#')
1638 {
1639 fprintf(stdout,"%s\n",input);
1640 FGETS(input,LINESIZE,infile);
1641 locus--;
1642 continue;
1643 }
1644 data->position[locus] = atol(input);
1645 if(options->printdata)
1646 {
1647 fprintf(stdout,"%20li|",data->position[locus]);
1648 }
1649 data->seq[0]->sites[locus]=1;
1650 read_word(infile,input);
1651 label1 = input[0];;
1652 read_word(infile,input);
1653 label1count = atol(input);
1654
1655 read_word(infile,input);
1656 label2 = input[0];
1657 read_word(infile,input);
1658 label2count = atol(input);
1659 read_word(infile,input);
1660 label12count = atol(input);
1661 if(options->printdata)
1662 {
1663 fprintf(stdout, " freq[%c]=%f freq[%c]=%f\n",label1,
1664 (float) label1count/label12count,label2, (float) label2count/label12count);
1665 }
1666 //was messing with the reading, use fraction data->numind[pop][locus] = label12count;
1667 if(options->randomsubset > 0 && options->randomsubset < data->numind[pop][locus])
1668 {
1669 data->numalleles[pop][locus] = options->randomsubset * genomes;
1670 data->numind[pop][locus] = options->randomsubset / genomes;
1671 }
1672 else
1673 {
1674 data->numalleles[pop][locus] = label12count;
1675 data->numind[pop][locus] = label12count / genomes;
1676 }
1677 // data->numind[pop][locus] * genomes;
1678 label1count = (long) floor( (((double) label1count/label12count * data->numalleles[pop][locus]))+0.5 );
1679 for(ind=0;ind < label1count; ind++)
1680 {
1681 data->yy[pop][ind][locus][0][0] = label1;
1682 }
1683 for(ind=label1count;ind < data->numalleles[pop][locus]; ind++)
1684 {
1685 data->yy[pop][ind][locus][0][0] = label2;
1686 }
1687 }
1688 myfree(input);
1689 }
1690
1691 long
read_ind_seq(FILE * infile,data_fmt * data,option_fmt * options,long locus,long pop,long ind,long baseread)1692 read_ind_seq (FILE * infile, data_fmt * data, option_fmt * options,
1693 long locus, long pop, long ind, long baseread)
1694 {
1695 long j;
1696 char charstate;
1697 j = (options->interleaved) ? baseread : 0;
1698 charstate = getc (infile);
1699 ungetc ((int) charstate, infile);
1700 if(options->printdata)
1701 {
1702 fprintf(stdout,"%s|",data->indnames[pop][ind][locus]);
1703 }
1704 while (j < data->seq[0]->sites[locus]
1705 && !(options->interleaved && strchr(CRLF,charstate)))
1706 {
1707 charstate = getc (infile);
1708 if (strchr(CRLF,charstate))
1709 {
1710 #ifdef INTERLEAVED
1711 if (options->interleaved)
1712 {
1713 while(strchr(CRLF,charstate=getc(infile)))
1714 ;
1715 ungetc ((int) charstate, infile);
1716 return j;
1717 }
1718 else
1719 #endif
1720 charstate = ' ';
1721 }
1722 if (charstate == ' '
1723 || (charstate >= '0' && charstate <= '9') || charstate == '\\')
1724 continue;
1725 charstate = uppercase (charstate);
1726 if(options->printdata)
1727 {
1728 fprintf(stdout, "%c",charstate);
1729 }
1730 if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
1731 {
1732 printf
1733 ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
1734 charstate, j, ind+1, pop+1);
1735 printf
1736 ("Last complete sequences was in population %li, individual %li and locus %li:\n%s",
1737 pop + 1, ind, locus+1, data->indnames[pop][ind - 1][locus]);
1738 for (j = 0; j < data->seq[0]->sites[locus]; j++)
1739 printf ("%c", data->yy[pop][ind - 1][locus][0][j]);
1740 exit (EXIT_FAILURE);
1741 }
1742 data->yy[pop][ind][locus][0][j++] = charstate;
1743 }
1744 charstate = getc (infile); /* swallow the \n or \r */
1745 while(charstate == '\r' || charstate == '\n' || charstate == '\t' || charstate == ' ' || charstate == ';')
1746 {
1747 charstate = getc(infile);
1748 }
1749 if(charstate!='\n')
1750 ungetc((int) charstate, infile);
1751
1752 if(options->printdata)
1753 {
1754 fprintf(stdout,"\n");
1755 }
1756 return j;
1757 }
1758
1759 ///
1760 /// reads sequence data that comes where all loci are (1) on one line and (2) are not interleaved
1761 long
read_ind_seq_oneliner(FILE * infile,data_fmt * data,option_fmt * options,long pop,long ind,long baseread)1762 read_ind_seq_oneliner (FILE * infile, data_fmt * data, option_fmt * options,
1763 long pop, long ind, long baseread)
1764 {
1765 long j = 0;
1766 long locus=0;
1767 char charstate;
1768 charstate = getc (infile);
1769 ungetc ((int) charstate, infile);
1770 if(options->printdata)
1771 {
1772 fprintf(stdout,"%s",data->indnames[pop][ind][locus]);
1773 }
1774 for(locus = 0; locus < data->loci; locus++)
1775 {
1776 j=0;
1777 if(options->printdata)
1778 {
1779 fprintf(stdout,"|");
1780 }
1781 while (j < data->seq[0]->sites[locus])
1782 {
1783 charstate = getc (infile);
1784 if (strchr(CRLF,charstate))
1785 {
1786 charstate = ' ';
1787 }
1788 if (charstate == ' '
1789 || (charstate >= '0' && charstate <= '9') || charstate == '\\')
1790 continue;
1791 charstate = uppercase (charstate);
1792 if(options->printdata)
1793 {
1794 fprintf(stdout, "%c",charstate);
1795 }
1796 if ((strchr ("ABCDGHKMNRSTUVWXY?O-", (int) charstate)) == NULL)
1797 {
1798 printf
1799 ("ERROR: BAD BASE: %c AT POSITION %5ld OF INDIVIDUUM %3li in POPULATION %ld\n",
1800 charstate, j, ind+1, pop+1);
1801 printf
1802 ("Last complete sequences was in population %li, individual %li and locus %li:\n%s",
1803 pop + 1, ind, locus+1, data->indnames[pop][ind - 1][locus]);
1804 for (j = 0; j < data->seq[0]->sites[locus]; j++)
1805 printf ("%c", data->yy[pop][ind - 1][locus][0][j]);
1806 exit (EXIT_FAILURE);
1807 }
1808 data->yy[pop][ind][locus][0][j++] = charstate;
1809 }
1810 }
1811 charstate = getc (infile); /* swallow the \n or \r */
1812 while(charstate == '\n' || charstate == '\t' || charstate == ' ' || charstate == ';')
1813 {
1814 charstate = getc(infile);
1815 }
1816 if(charstate!='\n')
1817 ungetc((int) charstate, infile);
1818 if (charstate == '\r')
1819 {
1820 charstate = getc (infile); /* swallow the \n */
1821 if(charstate!='\n')
1822 ungetc((int) charstate, infile);
1823 }
1824 if(options->printdata)
1825 {
1826 fprintf(stdout,"\n");
1827 }
1828 return j;
1829 }
1830
1831 void
read_distance_fromfile(FILE * dfile,long tips,long nmlength,MYREAL ** m)1832 read_distance_fromfile (FILE * dfile, long tips, long nmlength, MYREAL **m)
1833 {
1834 char input[SUPERLINESIZE];
1835 long i, j;
1836 int retval;
1837 if (dfile != NULL)
1838 {
1839 // assumes that the dfile is in PHYLIP format
1840 // and that all n x n cells are filled.
1841 FGETS (input, LONGLINESIZE, dfile); //reads header line with
1842 for (i = 0; i < tips; i++) // of individuals
1843 {
1844 //reads the population label
1845 FGETS (input, nmlength + 1, dfile);
1846 for (j = 0; j < tips; j++)
1847 {
1848 #ifdef USE_MYREAL_FLOAT
1849 retval = fscanf (dfile, "%f", &m[i][j]);
1850 #else
1851 retval = fscanf (dfile, "%lf", &m[i][j]);
1852 #endif
1853 if((i!=j) && (m[i][j] < EPSILON))
1854 {
1855 warning("Reading geofile: adjusting dist[%li][%li]=%f to %f\n",i,j,m[i][j],EPSILON);
1856 m[i][j] = EPSILON;
1857 }
1858 }
1859 // reads the last \n from the
1860 // data matrix
1861 FGETS (input, LONGLINESIZE, dfile);
1862 }
1863 }
1864 }
1865
1866 #ifdef UEP
1867 // uep function
1868
1869 void
read_uep_fromfile(FILE * uepfile,long tips,long nmlength,int ** uep,long * uepsites,long datatype)1870 read_uep_fromfile (FILE * uepfile, long tips, long nmlength, int **uep,
1871 long *uepsites, long datatype)
1872 {
1873 char input[LINESIZE];
1874 long i, j;
1875 long thistips;
1876 if (uepfile != NULL)
1877 {
1878 // assumes that the uepfile is in PHYLIP format
1879 // and that all n cells are filled.
1880 FGETS (input, LINESIZE, uepfile); //reads header line with
1881 // of individuals and uep sites
1882 sscanf (input, "%li%li", &thistips, uepsites);
1883 if (thistips != tips)
1884 error ("UEP datafile and infile are inconsistent!");
1885 if (strchr (SEQUENCETYPES, datatype))
1886 {
1887 for (i = 0; i < tips; i++)
1888 {
1889 uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
1890 FGETS (input, nmlength + 1, uepfile); //reads each line
1891 for (j = 0; j < *uepsites; ++j)
1892 retval = fscanf (uepfile, "%i", &uep[i][j]);
1893 // reads the last \n from the data matrix
1894 FGETS (input, LINESIZE, uepfile);
1895 }
1896 }
1897 else
1898 {
1899 for (i = 0; i < tips; i++)
1900 {
1901 uep[i] = (int *) mycalloc (*uepsites, sizeof (int));
1902 uep[i + tips] = (int *) mycalloc (*uepsites, sizeof (int));
1903 FGETS (input, nmlength + 1, uepfile); //reads each line
1904 for (j = 0; j < *uepsites; ++j)
1905 retval = fscanf (uepfile, "%i", &uep[i][j]);
1906 // finished reading first allele, no onto the second
1907 for (j = 0; j < *uepsites; ++j)
1908 retval = fscanf (uepfile, "%i", &uep[i + tips][j]);
1909 // reads the last \n from the data matrix
1910 FGETS (input, LINESIZE, uepfile);
1911 }
1912 }
1913 }
1914 }
1915 #endif
1916
1917 void
finish_read_seq(FILE * infile,data_fmt * data,option_fmt * options,long pop,long baseread)1918 finish_read_seq (FILE * infile, data_fmt * data, option_fmt * options,
1919 long pop, long baseread)
1920 {
1921
1922 long ind, baseread2 = 0, locus = 0;
1923 if (options->interleaved)
1924 {
1925 while (baseread < data->seq[0]->sites[0])
1926 {
1927 for (ind = 0; ind < data->numind[pop][0]; ind++)
1928 {
1929 baseread2 =
1930 read_ind_seq (infile, data, options, locus, pop, ind,
1931 baseread);
1932 }
1933 baseread = baseread2;
1934 }
1935 }
1936 for (locus = 1; locus < data->loci; locus++)
1937 {
1938 baseread = 0;
1939 for (ind = 0; ind < data->numind[pop][locus]; ind++)
1940 {
1941 read_indname (infile, data, pop, ind, locus, options->nmlength);
1942 baseread = read_ind_seq (infile, data, options, locus, pop, ind, 0);
1943 }
1944 if (options->interleaved)
1945 {
1946 while (baseread < data->seq[0]->sites[locus])
1947 {
1948 for (ind = 0; ind < data->numind[pop][locus]; ind++)
1949 {
1950 baseread2 =
1951 read_ind_seq (infile, data, options, locus, pop, ind,
1952 baseread);
1953 }
1954 baseread = baseread2;
1955 }
1956 }
1957 }
1958 }
1959
find_missing(data_fmt * data,long pop,long locus)1960 long find_missing(data_fmt *data, long pop, long locus)
1961 {
1962 long missing = 0;
1963 long ind;
1964 for(ind=0; ind < data->numind[pop][locus]; ind++)
1965 {
1966 if(data->yy[pop][ind][locus][0][0]=='?')
1967 missing++;
1968 if(data->yy[pop][ind][locus][1][0]=='?')
1969 missing++;
1970 }
1971 return missing;
1972 }
1973
1974
1975 ///
1976 /// Data set was subsampled and used a random sample of size
print_random_subset(FILE * file,data_fmt * data,option_fmt * options)1977 void print_random_subset(FILE * file, data_fmt * data, option_fmt *options)
1978 {
1979 long locus;
1980 long pop;
1981 char *name;
1982 long maxnum;
1983 long count;
1984 long ind;
1985 long length;
1986 long index;
1987 if(options->randomsubset > 0)
1988 {
1989 fprintf (file, "\nData set was subsampled and used a random sample of size: %li\n\n", options->randomsubset);
1990 fprintf (file, "Locus Population Individuals\n");
1991 fprintf (file, "----- ---------- ---------------------------------------------------------------------\n");
1992 name = (char*) mycalloc(options->nmlength+1,sizeof(char));
1993 for (locus=0; locus < data->loci; locus++)
1994 {
1995 fprintf(file,"%5li ",locus+1);
1996 for (pop=0; pop < data->numpop; pop++)
1997 {
1998 fprintf(file, "%-10.10s ", data->popnames[pop]);
1999 maxnum = options->randomsubset < data->numind[pop][locus] ? options->randomsubset : data->numind[pop][locus];
2000 count = 18;
2001 for(ind=0;ind<maxnum;ind++)
2002 {
2003 index = data->shuffled[pop][locus][ind];
2004
2005 if(data->indnames[pop][index][locus][0]=='\0')
2006 memcpy(name,data->indnames[pop][index][0],sizeof(char)*options->nmlength);
2007 else
2008 memcpy(name,data->indnames[pop][index][locus],sizeof(char)*options->nmlength);
2009 remove_trailing_blanks(&name);
2010 length = strlen(name);
2011 if (count+length < LINELENGTH)
2012 {
2013 fprintf(file,"%s ",name);
2014 count += length+1;
2015 }
2016 else
2017 {
2018 fprintf(file,"\n");
2019 fprintf(file," ");
2020 fprintf(file,"%s ",name);
2021 count=17+length+1;
2022 }
2023 }
2024 if(pop<data->numpop-1)
2025 fprintf(file,"\n ");
2026 else
2027 fprintf(file,"\n");
2028 }
2029 }
2030 myfree(name);
2031 }
2032 }
2033
2034 void
print_data_summary(FILE * file,world_fmt * world,option_fmt * options,data_fmt * data)2035 print_data_summary (FILE * file, world_fmt * world, option_fmt * options,
2036 data_fmt * data)
2037 {
2038 int maxlength = 24; // length of the the total string , for popnames
2039 int length;
2040 long locus;
2041 long pop;
2042 long numind;
2043 long nummiss;
2044 char dstring[LINESIZE];
2045 long *total;
2046 long *totalmiss;
2047 total = (long *) mycalloc(data->loci,sizeof(long));
2048 totalmiss = (long *) mycalloc(data->loci,sizeof(long));
2049 fprintf (file, "\nSummary of data:\n");
2050 fprintf(file, "Title:%54.54s\n",options->title);
2051
2052 switch (options->datatype)
2053 {
2054 case 'a':
2055 strcpy (dstring, "Allelic data");
2056 break;
2057 case 'f':
2058 case 's':
2059 strcpy (dstring, "Sequence data");
2060 break;
2061 case 'b':
2062 case 'm':
2063 strcpy (dstring, "Microsatellite data");
2064 break;
2065 case 'n':
2066 case 'u':
2067 strcpy (dstring, "SNP data");
2068 break;
2069 case 'h':
2070 strcpy (dstring, "SNP data (Hapmap data)");
2071 break;
2072 default:
2073 strcpy (dstring, "Unknown data [ERROR]");
2074 break;
2075 }
2076 fprintf (file, "Data file: %48.48s\n", options->infilename);
2077 fprintf (file, "Datatype: %48.48s\n", dstring);
2078 if(!data->has_repeats)
2079 {
2080 fprintf (file, "[Fragment length is translated to repeats]\n");
2081 }
2082 else
2083 {
2084 if (dstring[0]=='M')
2085 fprintf (file, "[Data was used as repeat-length information]\n");
2086 }
2087
2088 fprintf (file, "Number of loci: %20li\n\n",
2089 data->loci);
2090 if(options->has_datefile)
2091 {
2092 fprintf (file, "Title:%54.54s\n",options->title);
2093 fprintf (file, "Sample dates:%46.46s\n", options->datefilename);
2094 fprintf (file, "Generations per year: %5.5f\n", options->generation_year);
2095 fprintf (file, "Mutationrate per year: %.10g", options->mutationrate_year[0]);
2096 for(locus=1; locus < options->mutationrate_year_numalloc; locus++)
2097 {
2098 if(locus % 4 == 0)
2099 fprintf(file," ");
2100 fprintf(file,", %f", options->mutationrate_year[locus]);
2101 }
2102 fprintf(file,"\n");
2103 }
2104 //
2105 print_random_subset(file,data,options);
2106 if(file!=stdout)
2107 {
2108 pdf_print_random_subset(data,options);
2109 }
2110 for (pop = 0; pop < data->numpop; pop++)
2111 {
2112 length = strlen(data->popnames[pop]);
2113 if(maxlength < length)
2114 maxlength = length;
2115 }
2116 if(maxlength > 40)
2117 maxlength = 40;
2118
2119 if (!strchr (SEQUENCETYPES, options->datatype))
2120 {
2121 fprintf (file,"%-*.*s Locus Gene copies \n",maxlength,maxlength,"Population");
2122 fprintf (file,"%-*.*s ---------------\n",maxlength, maxlength, " ");
2123 fprintf (file,"%-*.*s data (missing)\n",maxlength, maxlength, " ");
2124 }
2125 else
2126 {
2127 fprintf (file,"%-*.*s Locus Gene copies \n",maxlength,maxlength,"Population");
2128 }
2129 fprintf (file,"----%-*.*s------------------------\n",maxlength,maxlength,"------------------------------------------------------------------");
2130 for (pop = 0; pop < data->numpop; pop++)
2131 {
2132 for(locus=0; locus< data->loci; locus++)
2133 {
2134 if (!strchr (SEQUENCETYPES, options->datatype))
2135 {
2136 nummiss = find_missing(data,pop,locus);
2137 numind = data->numalleles[pop][locus] - nummiss;
2138 fprintf (file, "%3li %-*.*s %5li %6li (%li)\n", options->newpops[pop], maxlength, maxlength,
2139 (locus==0 ? data->popnames[pop] : " " ), locus+1 , numind, nummiss);
2140 }
2141 else
2142 {
2143 nummiss = 0;
2144 numind = data->numind[pop][locus];
2145 fprintf (file, "%3li %-*.*s %5li %6li\n", options->newpops[pop], maxlength, maxlength,
2146 (locus==0 ? data->popnames[pop] : " "), locus+1 , numind);
2147 }
2148 total[locus] += numind;
2149 totalmiss[locus] += nummiss;
2150 }
2151 }
2152 if (!strchr (SEQUENCETYPES, options->datatype))
2153 {
2154 for(locus=0; locus< data->loci; locus++)
2155 {
2156 fprintf (file," %-*.*s %5li %6li (%li)\n",maxlength,maxlength,
2157 (locus == 0 ? "Total of all populations" : " "), locus+1, total[locus], totalmiss[locus]);
2158 }
2159 }
2160 else
2161 {
2162 for(locus=0; locus< data->loci; locus++)
2163 {
2164 fprintf (file," %-*.*s %5li %6li\n",maxlength,maxlength,
2165 (locus == 0 ? "Total of all populations" : " "), locus+1, total[locus]);
2166 }
2167 }
2168 fprintf(file,"\n");
2169 myfree(total);
2170 myfree(totalmiss);
2171 fflush (file);
2172 }
2173
2174 void
print_data(world_fmt * world,option_fmt * options,data_fmt * data)2175 print_data (world_fmt * world, option_fmt * options, data_fmt * data)
2176 {
2177 if (options->printdata)
2178 {
2179 switch (options->datatype)
2180 {
2181 case 'a':
2182 // if(options->dlm=='\0')
2183 // print_alleledata (world, data, options);
2184 //else
2185 print_microdata (world, data, options);
2186 break;
2187 case 'b':
2188 case 'm':
2189 print_microdata (world, data, options);
2190 break;
2191 case 's':
2192 case 'n':
2193 case 'h':
2194 case 'u':
2195 case 'f':
2196 print_seqdata (world, options, data);
2197 break;
2198 }
2199 }
2200 }
2201
2202 ///
2203 /// calculate allele frequency spectrum and print
2204 /// allele population1 .. populationN All
2205 /// taking into account population labeling and random_subsets
print_spectra(world_fmt * world,option_fmt * options,data_fmt * data)2206 void print_spectra(world_fmt * world, option_fmt * options,data_fmt * data)
2207 {
2208 const double one = 1.0;
2209 const double two = 2.0;
2210 long found;
2211 long locus;
2212 long a;
2213 long pop;
2214 long pop1;
2215 long ind;
2216 MYREAL **total;
2217 MYREAL *grandtotal;
2218 MYREAL allfreq;
2219 MYREAL f;
2220 MYREAL homo = 0.0;
2221 MYREAL *avghet;
2222 MYREAL v;
2223 MYREAL avghet1;
2224 //MYREAL avghetall = 0.0;
2225 long *maxalleles;
2226 long *maxallelepop;
2227 MYREAL ***freq;
2228 char *thisallele;
2229 char *thatallele;
2230 MYREAL fx;
2231 MYREAL general_homo;
2232 FILE *outfile = world->outfile;
2233 long loctotal;
2234 //long t;
2235 boolean second = (strchr(ALLELETYPES,options->datatype)!=NULL);
2236 if (strchr (DNASEQUENCETYPES, options->datatype))
2237 return; /*we do not calculate allele frequencies for DNA sequences*/
2238 // find total number of alleles
2239 maxalleles = (long *) mycalloc(data->loci, sizeof(long));
2240 maxallelepop = (long *) mycalloc(data->numpop, sizeof(long));
2241 avghet = (MYREAL *) mycalloc(data->numpop, sizeof(MYREAL));
2242 for (locus = 0; locus < data->loci; locus++)
2243 {
2244 maxalleles[locus] = findAllele(data,"\0",locus);
2245 #ifdef BEAGLE
2246 world->mutationmodels[world->sublocistarts[locus]].numstates = maxalleles[locus];
2247 world->mutationmodels[world->sublocistarts[locus]].basefreqs = (double *) calloc(world->mutationmodels[world->sublocistarts[locus]].numstates,sizeof(double));
2248 #endif
2249 }
2250
2251 // create bins for each population and all
2252 grandtotal = (MYREAL *) mycalloc(data->loci, sizeof(MYREAL));
2253 freq = (MYREAL ***) mycalloc(data->numpop, sizeof(MYREAL **));
2254 doublevec2d(&total,data->numpop,data->loci);
2255 for (pop = 0; pop < data->numpop; pop++)
2256 {
2257 freq[pop] = (MYREAL **) mycalloc(data->loci, sizeof(MYREAL *));
2258 for (locus = 0; locus < data->loci; locus++)
2259 {
2260 freq[pop][locus] = (MYREAL *) mycalloc(1+maxalleles[locus], sizeof(MYREAL));
2261 }
2262 }
2263
2264 // calculate spectrum
2265 for (pop1 = 0; pop1 < data->numpop; pop1++)
2266 {
2267 //pop = options->newpops[pop1]-1;
2268 for (ind = 0; ind < data->numind[pop1][0]; ind++)
2269 {
2270 for (locus = 0; locus < data->loci; locus++)
2271 {
2272 thisallele = data->yy[pop1][ind][locus][0];
2273 found = findAllele(data, thisallele, locus);
2274 if(second)
2275 {
2276 thatallele = data->yy[pop1][ind][locus][1];
2277 if(!strcmp(thisallele,thatallele))
2278 {
2279 if(!strchr(thisallele,'?'))
2280 {
2281 freq[pop1][locus][found] += two ;
2282 //total[pop][locus] += two ;
2283 }
2284 }
2285 else
2286 {
2287 if(!strchr(thisallele,'?'))
2288 {
2289 freq[pop1][locus][found] += one ;
2290 //total[pop][locus] += one ;
2291 }
2292 found = findAllele(data, thatallele, locus);
2293 if(!strchr(thatallele,'?'))
2294 {
2295 freq[pop1][locus][found] += one ;
2296 //total[pop][locus] += one ;
2297 }
2298 }
2299 }
2300 else
2301 {
2302 if(!strchr(thisallele,'?'))
2303 {
2304 freq[pop1][locus][found] += one ;
2305 //total[pop][locus] += one ;
2306 }
2307 }
2308 }
2309 }
2310 }
2311 // now we calculate the poptotals and popfreqs if present
2312 // this will change freq and total therefore transform the
2313 // location score to population scores that are repeated
2314 for (pop1 = 0; pop1 < data->numpop; pop1++)
2315 {
2316 pop = options->newpops[pop1]-1;
2317 if (pop == pop1)
2318 continue;
2319 for (locus = 0; locus < data->loci; locus++)
2320 {
2321 //accumulate the pooled populations frequencies
2322 for(a=0; a < maxalleles[locus]; a++)
2323 {
2324 freq[pop][locus][a] += freq[pop1][locus][a];
2325 }
2326 //replace all locations frequencies with the population freq
2327 for(a=0; a < maxalleles[locus]; a++)
2328 {
2329 freq[pop1][locus][a] = freq[pop][locus][a];
2330 }
2331 }
2332 }
2333 // now we have collected all numbers of alleles for each locus and each location
2334 // we calculate now all the totals for each locality,
2335 for (pop = 0; pop < data->numpop; pop++)
2336 {
2337 for (locus = 0; locus < data->loci; locus++)
2338 {
2339 loctotal = 0;
2340 for(a=0; a < maxalleles[locus]; a++)
2341 {
2342 loctotal += (long) freq[pop][locus][a];
2343 }
2344 total[pop][locus] = loctotal;
2345 grandtotal[locus] += loctotal;
2346 }
2347 }
2348
2349 // print in ascii
2350 //fprintf(stdout,"Allele frequency spectra\n");
2351 fprintf(outfile,"Allele frequency spectra\n");
2352 fprintf(outfile,"========================\n\n");
2353 avghet1=0.0;
2354 for (locus = 0; locus < data->loci; locus++)
2355 {
2356 fprintf(outfile,"Locus %li\n", locus + 1);
2357 fprintf(outfile,"Allele ");
2358 general_homo = 0.0;
2359 for (pop1 = 0; pop1 < data->numpop; pop1++)
2360 {
2361 maxallelepop[pop1]=0;
2362 pop = options->newpops[pop1]-1;
2363 fprintf(outfile,"Pop%-2li ",pop+1);
2364 }
2365 fprintf(outfile,"All\n-----------");
2366 for (pop = 0; pop < data->numpop+1; pop++)
2367 fprintf(outfile,"-------");
2368 fprintf(outfile, "\n");
2369 for(a=0; a < maxalleles[locus]; a++)
2370 {
2371 allfreq = 0.0;
2372 if(strlen(data->allele[locus][a])>8)
2373 fprintf(outfile,"%-8.8s...",data->allele[locus][a]);
2374 else
2375 fprintf(outfile,"%-8.8s ",data->allele[locus][a]);
2376 for (pop1 = 0; pop1 < data->numpop; pop1++)
2377 {
2378 pop = options->newpops[pop1]-1;
2379 if(freq[pop][locus][a]>0.0)
2380 {
2381 maxallelepop[pop1] += 1;
2382 // maxallelepop[pop] += 1.0/data->numpop;
2383 fprintf(outfile," %1.3f ",freq[pop][locus][a]/total[pop][locus]);
2384 allfreq += freq[pop][locus][a];
2385 // allfreq += freq[pop][locus][a];
2386 }
2387 else
2388 {
2389 fprintf(outfile," - ");
2390 }
2391 }
2392 fprintf(outfile," %1.3f\n", (MYREAL) allfreq/grandtotal[locus]);
2393 fx = allfreq/grandtotal[locus];
2394 general_homo += fx * fx;
2395 // fprintf(outfile," %1.3f %f\n", fx*fx,general_homo);
2396 #ifdef BEAGLE
2397 // debug not ready yet for subloci
2398 world->mutationmodels[world->sublocistarts[locus]].basefreqs[a] = allfreq/data->numpop;
2399 #endif
2400 }
2401 fprintf(outfile,"Alleles ");
2402 for (pop1 = 0; pop1 < data->numpop; pop1++)
2403 {
2404 // pop = options->newpops[pop1]-1;
2405 fprintf(outfile," %5li ", maxallelepop[pop1]);
2406 }
2407 fprintf(outfile," %5li\n", maxalleles[locus]);
2408 fprintf(outfile,"Samplesize ");
2409 for (pop1 = 0; pop1 < data->numpop; pop1++)
2410 {
2411 fprintf(outfile," %5li ", (long) total[pop1][locus]);
2412 }
2413 fprintf(outfile," %5li\n", (long) grandtotal[locus]);
2414
2415 //if(maxalleles[locus]<=1 && strchr(SNPTYPES,options->datatype))
2416 //{
2417 // data->skiploci[locus] = TRUE;
2418 //}
2419
2420 fprintf(outfile,"H_exp ");
2421 for (pop1 = 0; pop1 < data->numpop; pop1++)
2422 {
2423 pop = options->newpops[pop1]-1;
2424 homo = 0.0;
2425 for(a=0;a<maxalleles[locus];a++)
2426 {
2427 f = freq[pop][locus][a]/total[pop][locus];
2428 homo += f*f;
2429 }
2430 v = 1.0 - homo;
2431 avghet[pop1] += v;
2432 fprintf(outfile," %5.3f ",v);
2433 }
2434 fprintf(outfile," %5.3f\n\n",1.0-general_homo);
2435 avghet1 += 1.0 - general_homo;
2436 // fprintf(outfile," %5.3f\n\n",avghet1/data->numpop);
2437 }
2438 fprintf(outfile,"Average expected heterozygosity\n");
2439 for (pop1 = 0; pop1 < data->numpop; pop1++)
2440 {
2441 pop = options->newpops[pop1]-1;
2442 fprintf(outfile,"Pop%-2li ",pop+1);
2443 }
2444 fprintf(outfile,"All\n");
2445 for (pop = 0; pop < data->numpop+1; pop++)
2446 fprintf(outfile,"-------");
2447 fprintf(outfile, "\n");
2448 for (pop1 = 0; pop1 < data->numpop; pop1++)
2449 {
2450 pop = options->newpops[pop1]-1;
2451 fprintf(outfile,"%5.3f ",avghet[pop1] / data->loci);
2452 }
2453 fprintf(outfile,"%5.3f\n\n",avghet1/data->loci);
2454 // printd in PDF
2455 #ifdef PRETTY
2456 //pdf_print_spectra(data, options, freq, total, grandtotal, avghet, avghet1, maxalleles);
2457 #endif
2458 fflush(outfile);
2459 // cleanup
2460 for (pop = 0; pop < data->numpop; pop++)
2461 {
2462 for (locus = 0; locus < data->loci; locus++)
2463 {
2464 myfree(freq[pop][locus]);
2465 }
2466 myfree(freq[pop]);
2467 }
2468 //printf("finished with printspectra");
2469 myfree(freq);
2470 myfree(maxalleles);
2471 myfree(maxallelepop);
2472 myfree(avghet);
2473 free_doublevec2d(total);
2474 myfree(grandtotal);
2475 }
2476
2477 void
print_alleledata(world_fmt * world,data_fmt * data,option_fmt * options)2478 print_alleledata (world_fmt * world, data_fmt * data, option_fmt * options)
2479 {
2480 long i, pop, ind, locus, mult80;
2481 for (pop = 0; pop < data->numpop; pop++)
2482 {
2483 print_header (world->outfile, pop, world, options, data);
2484 for (ind = 0; ind < data->numind[pop][0]; ind++)
2485 {
2486 fprintf (world->outfile, "%-*.*s ", (int) options->nmlength,
2487 (int) options->nmlength, data->indnames[pop][ind][0]);
2488 mult80 = options->nmlength;
2489 for (locus = 0; locus < data->loci; locus++)
2490 {
2491 mult80 +=
2492 1 + (long) (strlen (data->yy[pop][ind][locus][0]));
2493 if (mult80 >= 80)
2494 {
2495 mult80 = 0;
2496 fprintf (world->outfile, "\n");
2497 for (i = 0; i < options->nmlength; i++)
2498 FPRINTF(world->outfile, " ");
2499 }
2500 fprintf (world->outfile, " %c.%c",
2501 data->yy[pop][ind][locus][0][0],
2502 data->yy[pop][ind][locus][0][1]);
2503 }
2504 fprintf (world->outfile, "\n");
2505 }
2506 fprintf (world->outfile, "\n");
2507 }
2508 fprintf (world->outfile, "\n\n");
2509 fflush (world->outfile);
2510 }
2511
2512 void
print_microdata(world_fmt * world,data_fmt * data,option_fmt * options)2513 print_microdata (world_fmt * world, data_fmt * data, option_fmt * options)
2514 {
2515 long i, pop, ind, locus, mult80;
2516 for (pop = 0; pop < data->numpop; pop++)
2517 {
2518 print_header (world->outfile, pop, world, options, data);
2519 for (ind = 0; ind < data->numind[pop][0]; ind++)
2520 {
2521 fprintf (world->outfile, "%-*.*s ", (int) options->nmlength,
2522 (int) options->nmlength, data->indnames[pop][ind][0]);
2523 mult80 = options->nmlength;
2524 for (locus = 0; locus < data->loci; locus++)
2525 {
2526 mult80 +=
2527 1 + (long) (strlen (data->yy[pop][ind][locus][0]) +
2528 strlen (data->yy[pop][ind][locus][1]));
2529 if (mult80 >= 80)
2530 {
2531 mult80 = 0;
2532 fprintf (world->outfile, "\n");
2533 for (i = 0; i < options->nmlength; i++)
2534 FPRINTF(world->outfile, " ");
2535 }
2536 fprintf (world->outfile, " %s.%-s",
2537 data->yy[pop][ind][locus][0],
2538 data->yy[pop][ind][locus][1]);
2539 }
2540 fprintf (world->outfile, "\n");
2541 }
2542 fprintf (world->outfile, "\n");
2543 }
2544 fprintf (world->outfile, "\n\n");
2545 fflush (world->outfile);
2546 }
2547
2548 void
print_seqdata(world_fmt * world,option_fmt * options,data_fmt * data)2549 print_seqdata (world_fmt * world, option_fmt * options, data_fmt * data)
2550 {
2551 long pop, locus;
2552 for (pop = 0; pop < data->numpop; pop++)
2553 {
2554 print_header (world->outfile, pop, world, options, data);
2555 for (locus = 0; locus < data->loci; locus++)
2556 {
2557 print_locus_head (locus, world, options, data);
2558 print_seq_pop (locus, pop, world, options, data);
2559 }
2560 }
2561 fflush (world->outfile);
2562 }
2563
2564 void
print_header(FILE * outfile,long pop,world_fmt * world,option_fmt * options,data_fmt * data)2565 print_header (FILE * outfile, long pop, world_fmt * world,
2566 option_fmt * options, data_fmt * data)
2567 {
2568 long i;
2569 long locus, mult80 = 80;
2570 fprintf (outfile, "\n%-s", data->popnames[pop]);
2571 for (i = 0; i < (long) (80 - (long) strlen (data->popnames[pop])); i++)
2572 fprintf(world->outfile, "-");
2573 fprintf (outfile, "\n\n");
2574 if (!strchr (SEQUENCETYPES, options->datatype))
2575 {
2576 fprintf (outfile, "%-s ", (data->loci == 1 ? "locus" : "loci "));
2577 for (i = 0; i < (options->nmlength - 6); i++)
2578 fprintf(world->outfile, " ");
2579 for (locus = 0; locus < data->loci; locus++)
2580 {
2581 if (locus * 4 + options->nmlength > mult80)
2582 {
2583 mult80 += 80;
2584 fprintf (outfile, "\n");
2585 for (i = 0; i < options->nmlength; i++)
2586 fprintf (outfile, " ");
2587 }
2588 fprintf (outfile, " %2li", locus + 1);
2589 }
2590 fprintf (outfile, "\n%-*.*s\n", (int) options->nmlength, (int) options->nmlength, "indiv.");
2591 }
2592 }
2593
2594
findleastsquare(MYREAL * rawdata,long total,long repeatlength,long shift,MYREAL * startvalue)2595 MYREAL findleastsquare(MYREAL *rawdata, long total, long repeatlength, long shift, MYREAL *startvalue)
2596 {
2597 long i;
2598 long j;
2599 long high;
2600 long low;
2601 MYREAL lowvalue=MYREAL_MAX;
2602 MYREAL highvalue=0.;
2603 MYREAL sum = 0.0;
2604 MYREAL value;
2605 MYREAL oldvalue;
2606 for(i=0;i<total;i++)
2607 {
2608 if(rawdata[i] < lowvalue)
2609 lowvalue = rawdata[i];
2610 if(rawdata[i]> highvalue)
2611 highvalue = rawdata[i];
2612 }
2613 high = (long) (highvalue+repeatlength+shift);
2614 low = (long) (lowvalue -repeatlength+shift);
2615 if(low<0){
2616 low = 0;
2617 }
2618 *startvalue = low;
2619 for(i=0;i<total;i++)
2620 {
2621 oldvalue = MYREAL_MAX;
2622 for(j=low ; j < high; j += repeatlength)
2623 {
2624 value = rawdata[i] - j;
2625 value *= value;
2626 if(value < oldvalue)
2627 {
2628 oldvalue = value;
2629 }
2630 }
2631 sum += oldvalue;
2632 }
2633 return sum;
2634 }
2635
find_allele_repeatlength(data_fmt * data,option_fmt * options,long locus)2636 void find_allele_repeatlength(data_fmt *data, option_fmt *options, long locus)
2637 {
2638 long z=0;
2639 long zz=0;
2640 long pop;
2641 long ind;
2642 long r;
2643 MYREAL *rawdata;
2644 char *a1;
2645 char *a2;
2646 long total=z;
2647 MYREAL minLS=MYREAL_MAX;
2648 MYREAL value;
2649 long intval;
2650 MYREAL startvalue= 0.0;
2651 MYREAL keepstartvalue= 0.0;
2652 MYREAL leastsquare;
2653 MYREAL nonfloored;
2654 MYREAL floored;
2655 MYREAL diff;
2656 for (pop = 0; pop < data->numpop; pop++)
2657 {
2658 zz += data->numind[pop][locus];
2659 }
2660 rawdata = (MYREAL *) calloc(2 * zz, sizeof(MYREAL));
2661
2662 for (pop = 0; pop < data->numpop; pop++)
2663 {
2664 for (ind = 0; ind < data->numind[pop][locus]; ind++)
2665 {
2666 a1 = data->yy[pop][ind][locus][0];
2667 a2 = data->yy[pop][ind][locus][1];
2668 if (a1[0]!='?')
2669 {
2670 rawdata[z++] = atof(a1);
2671 }
2672 if (a2[0]!='?')
2673 {
2674 rawdata[z++] = atof(a2);
2675 }
2676 }
2677 }
2678
2679 total=z;
2680 minLS=MYREAL_MAX;
2681 startvalue= 0.0;
2682 keepstartvalue= 0.0;
2683
2684 for(r=0;r<data->repeatlength[locus];r++)
2685 {
2686 //MYREAL endvalue = 0.0;
2687 leastsquare=findleastsquare(rawdata,total,data->repeatlength[locus],r,&startvalue);
2688 if(leastsquare < minLS)
2689 {
2690 minLS = leastsquare;
2691 // keepr = r;
2692 keepstartvalue = startvalue;
2693 }
2694 }
2695 for (pop = 0; pop < data->numpop; pop++)
2696 {
2697 for (ind = 0; ind < data->numind[pop][locus]; ind++)
2698 {
2699 a1 = data->yy[pop][ind][locus][0];
2700 a2 = data->yy[pop][ind][locus][1];
2701 if (a1[0]!='?')
2702 {
2703 value = atof(a1);
2704 //Floor[(279.47 - 259)/4 + If[(Random[]) < 1 - FractionalPart[(279.47 - 259)/4 ], 0, 1]], {1000}] // Tally
2705 nonfloored = (value - keepstartvalue)/data->repeatlength[locus];
2706 floored = floor(nonfloored);
2707 diff = nonfloored - floored;
2708 intval = MSAT_OFFSET + (long) floored + (RANDUM() < (1.0-diff) ? 0 : 1);
2709 sprintf(data->yy[pop][ind][locus][0],"%li",intval);
2710 }
2711 if (a2[0]!='?')
2712 {
2713 value = atof(a2);
2714 nonfloored = (value - keepstartvalue)/data->repeatlength[locus];
2715 floored = floor(nonfloored);
2716 diff = nonfloored - floored;
2717 intval = MSAT_OFFSET + (long) floored + (RANDUM() < (1.0-diff) ? 0 : 1);
2718 sprintf(data->yy[pop][ind][locus][1],"%li",intval);
2719 }
2720 }
2721 }
2722 }
2723
2724
2725 MYREAL
create_alleles(data_fmt * data,option_fmt * options)2726 create_alleles (data_fmt * data, option_fmt *options)
2727 {
2728 long n=0;
2729 MYREAL mean=0.;
2730 MYREAL delta=0.;
2731 long locus, pop, ind;
2732 long z;
2733 char *a1;//DEFAULT_ALLELENMLENGTH];
2734 char *a2;//DEFAULT_ALLELENMLENGTH];
2735 boolean second = (strchr(ALLELETYPES,options->datatype)!=NULL);
2736 a1 = (char *) malloc(sizeof(char)*LINESIZE);
2737 a2 = (char *) malloc(sizeof(char)*LINESIZE);
2738 long lena1 = LINESIZE;
2739 long lena2 = LINESIZE;
2740 for (locus = 0; locus < data->loci; locus++)
2741 {
2742 if(data->repeatlength[locus]!=0)
2743 find_allele_repeatlength(data, options,locus);
2744 z = 0;
2745 for (pop = 0; pop < data->numpop; pop++)
2746 {
2747 for (ind = 0; ind < data->numind[pop][locus]; ind++)
2748 {
2749
2750 char *data1 = data->yy[pop][ind][locus][0];
2751 long lendata1 = strlen(data1);
2752 if (lena1 < lendata1)
2753 {
2754 a1 = (char *) realloc(a1,sizeof(char)*(lendata1+1));
2755 lena1 = lendata1 + 1;
2756 }
2757 strcpy (a1, data1);
2758 if(second)
2759 {
2760 char *data2 = data->yy[pop][ind][locus][1];
2761 long lendata2 = strlen(data2);
2762 if (lena2 < lendata2)
2763 {
2764 a2 = (char *) realloc(a2,sizeof(char)*(lendata2+1));
2765 lena2 = lendata2 + 1;
2766 }
2767 strcpy (a2, data2);
2768 if (!strcmp (a1, a2))
2769 {
2770 addAllele (data, a1, locus, &z);
2771 }
2772 else
2773 {
2774 addAllele (data, a1, locus, &z);
2775 addAllele (data, a2, locus, &z);
2776 }
2777 }
2778 else
2779 {
2780 addAllele (data, a1, locus, &z);
2781 }
2782 }
2783 }
2784 if(z<=1)
2785 {
2786 data->skiploci[locus] = TRUE;// this skips loci with _no_ data
2787 continue;
2788 }
2789
2790 data->maxalleles[locus] = z + 1;
2791 /* + 1: for all the unencountered alleles */
2792 if(options->murates_fromdata)
2793 {
2794 if(options->mu_rates==NULL)
2795 {
2796 options->mu_rates = (MYREAL * ) mycalloc(data->loci,sizeof(MYREAL));
2797 // printf("%i> data.c: 1557 murate size %li\n",myID,data->loci * sizeof (MYREAL));
2798 }
2799 options->mu_rates[locus] = z+1;
2800 n = n + 1;
2801 delta = options->mu_rates[locus] - mean;
2802 mean += delta/n;
2803 }
2804 }
2805 if(options->murates_fromdata)
2806 {
2807 options->muloci = data->loci;
2808 for (locus=0; locus < data->loci; locus++)
2809 {
2810 if(!data->skiploci[locus])
2811 options->mu_rates[locus] /= mean;
2812 }
2813 }
2814 myfree(a1);
2815 myfree(a2);
2816 return mean;
2817 }
2818
2819 void
addAllele(data_fmt * data,char s[],long locus,long * z)2820 addAllele (data_fmt * data, char s[], long locus, long *z)
2821 {
2822 long found = 0;
2823 if(!strcmp("?",s))
2824 return;
2825 while ((data->allele[locus][found++][0] != '\0')
2826 && (strcmp (s, data->allele[locus][found - 1])))
2827 ;
2828 if (found > (*z))
2829 {
2830 strcpy (data->allele[locus][*z], s);
2831 (*z)++;
2832 }
2833 }
2834
2835 void
set_numind(data_fmt * data)2836 set_numind (data_fmt * data)
2837 {
2838 long locus, pop;
2839 for (locus = 1; locus < data->loci; locus++)
2840 {
2841 for (pop = 0; pop < data->numpop; pop++)
2842 {
2843 data->numind[pop][locus] = data->numind[pop][0];
2844 data->numalleles[pop][locus] = data->numalleles[pop][0];
2845 }
2846 }
2847 }
2848
2849
2850 void
print_seq_pop(long locus,long pop,world_fmt * world,option_fmt * options,data_fmt * data)2851 print_seq_pop (long locus, long pop, world_fmt * world, option_fmt * options,
2852 data_fmt * data)
2853 {
2854 long ind;
2855 for (ind = 0; ind < data->numalleles[pop][locus]; ind++)
2856 {
2857 print_seq_ind (locus, pop, ind, world, options, data);
2858 }
2859 }
2860
2861 void
print_seq_ind(long locus,long pop,long ind,world_fmt * world,option_fmt * options,data_fmt * data)2862 print_seq_ind (long locus, long pop, long ind, world_fmt * world,
2863 option_fmt * options, data_fmt * data)
2864 {
2865 long site;
2866 char blank[2] = " ";
2867 fprintf (world->outfile, "%-*.*s", (int) options->nmlength,
2868 (int) options->nmlength, data->indnames[pop][ind][0]);
2869 fprintf (world->outfile, " %c", data->yy[pop][ind][locus][0][0]);
2870 for (site = 1; site < data->seq[0]->sites[locus]; site++)
2871 {
2872 if ((site) % 60 == 0)
2873 {
2874 fprintf (world->outfile, "\n%-*.*s %c", (int) options->nmlength,
2875 (int) options->nmlength, blank,
2876 data->yy[pop][ind][locus][0][site]);
2877 }
2878 else
2879 {
2880 if ((site) % 10 == 0)
2881 {
2882 fprintf (world->outfile, " ");
2883 }
2884 fprintf (world->outfile, "%c", data->yy[pop][ind][locus][0][site]);
2885 }
2886 }
2887 fprintf (world->outfile, "\n");
2888 }
2889
2890
2891 void
print_locus_head(long locus,world_fmt * world,option_fmt * options,data_fmt * data)2892 print_locus_head (long locus, world_fmt * world, option_fmt * options,
2893 data_fmt * data)
2894 {
2895 char *head;
2896 head = (char *) mycalloc (1, sizeof (char) * MAX (10, options->nmlength));
2897 sprintf (head, "Locus %li", locus);
2898 fprintf (world->outfile, "%-*.*s --------10 --------20 --------30",
2899 (int) options->nmlength, (int) options->nmlength, head);
2900 fprintf (world->outfile, " --------40 --------50 --------60\n");
2901
2902 myfree(head);
2903 }
2904
2905 void
read_geofile(data_fmt * data,option_fmt * options,long numpop)2906 read_geofile (data_fmt * data, option_fmt * options, long numpop)
2907 {
2908 long i, j, pop;
2909 long numpop2 = numpop * numpop;
2910 data->geo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2911 data->lgeo = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2912 if (!options->geo)
2913 {
2914 for (i = 0; i < numpop2; i++)
2915 data->geo[i] = 1.0;
2916 }
2917 else
2918 {
2919 data->ogeo = (MYREAL **) mycalloc (1, sizeof (MYREAL *) * numpop);
2920 data->ogeo[0] = (MYREAL *) mycalloc (1, sizeof (MYREAL) * numpop2);
2921 for (pop = 1; pop < numpop; pop++)
2922 data->ogeo[pop] = data->ogeo[0] + numpop * pop;
2923 read_distance_fromfile (data->geofile, numpop, options->nmlength,
2924 data->ogeo);
2925 for (i = 0; i < numpop; i++)
2926 {
2927 for (j = 0; j < numpop; j++)
2928 {
2929 if(i!=j)
2930 {
2931 data->geo[mm2m (i, j, numpop)] = 1. / data->ogeo[i][j];
2932 data->lgeo[mm2m (i, j, numpop)] = data->ogeo[i][j] > 0.0 ?
2933 LOG (1. / data->ogeo[i][j]) : -MYREAL_MAX;
2934 }
2935 }
2936 }
2937 }
2938 }
2939
2940 ///
2941 /// read the file with the tip dates and returns the oldest date
2942 MYREAL
read_date_fromfile(FILE * datefile,data_fmt * data,option_fmt * options,long nmlength)2943 read_date_fromfile (FILE * datefile, data_fmt *data, option_fmt *options, long nmlength)
2944 {
2945 char input[LINESIZE];
2946 long pop;
2947 long locus;
2948 long l;
2949 long ind;
2950 MYREAL oldest = 0. ;
2951 MYREAL youngest = DBL_MAX;
2952 MYREAL temp;
2953 char *name;
2954 boolean backward = TRUE;
2955 name = (char *) mycalloc(LINESIZE,sizeof(char));
2956 if (datefile != NULL)
2957 {
2958 FGETS (input, LINESIZE, datefile); //title line
2959 while(input[0]=='#')
2960 FGETS(input,LINESIZE,datefile);
2961 if(input[0]=='F') //this checks the direction of time
2962 backward=FALSE;
2963 fprintf(stdout,"\n%i> Tip dates from file %s\n----------------------------------------\n", myID, options->datefilename);
2964 fprintf(stdout,"Generations per year: %f\n", options->generation_year);
2965 for(pop=0;pop<data->numpop;pop++)
2966 {
2967 FGETS (input, LINESIZE, datefile); //first populations
2968 while(input[0]=='#')
2969 FGETS(input,LINESIZE,datefile);
2970 for (locus=0; locus < data->loci; locus++)
2971 {
2972 l = (locus >= options->mutationrate_year_numalloc ? options->mutationrate_year_numalloc -1 : locus);
2973 fprintf(stdout,"Mutation rate of Locus %li: %g\n", locus, options->mutationrate_year[l]);
2974 for(ind=0; ind < data->numind[pop][locus]; ind++)
2975 {
2976 FGETS (input, LINESIZE, datefile);
2977 while(input[0]=='#')
2978 FGETS(input,LINESIZE,datefile);
2979 #ifdef USE_MYREAL_FLOAT
2980 sscanf (input, "%s",name);
2981 sscanf (input+nmlength, "%f",&temp);
2982 #else
2983 sscanf (input, "%s",name);
2984 sscanf (input+nmlength, "%lf",&temp);
2985 #endif
2986 fprintf(stdout,"Locus %li: Tipdate %*.*s %f %f\n", locus, (int) nmlength, (int) nmlength, input, temp, temp * options->mutationrate_year[l] / options->generation_year);
2987 data->sampledates[pop][locus][ind].date = temp;
2988 data->sampledates[pop][locus][ind].name = (char *) mycalloc(strlen(name)+1,sizeof(char));
2989 strcpy(data->sampledates[pop][locus][ind].name, name);
2990 unpad(data->sampledates[pop][locus][ind].name," ");
2991 translate(data->sampledates[pop][locus][ind].name,' ', '_');
2992 if(oldest < temp)
2993 oldest = temp;
2994 if(youngest > temp)
2995 youngest = temp;
2996 }
2997 }
2998 }
2999 // are the times forward or backward?
3000 for(pop=0;pop<data->numpop;pop++)
3001 {
3002 for (locus=0; locus < data->loci; locus++)
3003 {
3004 for(ind=0; ind < data->numind[pop][locus]; ind++)
3005 {
3006 if(backward)
3007 {
3008 data->sampledates[pop][locus][ind].date =
3009 data->sampledates[pop][locus][ind].date - youngest;
3010 }
3011 else
3012 {
3013 data->sampledates[pop][locus][ind].date =
3014 (oldest - data->sampledates[pop][locus][ind].date)
3015 + (youngest - oldest);
3016 }
3017 }
3018 }
3019 }
3020 }
3021 free(name);
3022 return oldest;
3023 }
3024
3025 ///
3026 /// read the file with the tip dates
3027 void
read_datefile(data_fmt * data,option_fmt * options,long numpop)3028 read_datefile (data_fmt * data, option_fmt * options, long numpop)
3029 {
3030 long locus, pop;
3031 data->sampledates = (tipdate_fmt ***) mycalloc (data->numpop, sizeof (tipdate_fmt **));
3032 data->sampledates[0] = (tipdate_fmt **) mycalloc (data->numpop * data->loci, sizeof (tipdate_fmt *));
3033 for (pop = 1; pop < data->numpop; pop++)
3034 {
3035 data->sampledates[pop] = data->sampledates[0] + data->loci * pop;
3036 }
3037 for(locus=0;locus<data->loci;locus++)
3038 {
3039 for(pop=0;pop < data->numpop; pop++)
3040 {
3041 data->sampledates[pop][locus] = (tipdate_fmt*) mycalloc(data->numind[pop][locus],sizeof(tipdate_fmt));
3042 }
3043 }
3044
3045 if (options->has_datefile)
3046 {
3047 data->maxsampledate = read_date_fromfile (data->datefile, data, options, options->nmlength);
3048 // printf("%i> in data section maxsampledate=%f\n",myID, data->maxsampledate);
3049 }
3050 else
3051 {
3052 data->maxsampledate=0.0;
3053 }
3054 }
3055
3056 #ifdef UEP
3057 void
read_uepfile(data_fmt * data,option_fmt * options,long numpop)3058 read_uepfile (data_fmt * data, option_fmt * options, long numpop)
3059 {
3060 long i;
3061 long sumtips = 0;
3062
3063 if (!options->uep)
3064 return;
3065
3066 for (i = 0; i < numpop; ++i)
3067 sumtips += data->numind[i][0]; //Assumes that UEP has the same number of individuals as
3068 // locus 1 (Is this OK? most dataset with UEP will have 1 locus?????)
3069 data->uep = (int **) mycalloc (number_genomes (options->datatype) * sumtips,
3070 sizeof (int *));
3071 read_uep_fromfile (data->uepfile, sumtips, options->nmlength, data->uep,
3072 &data->uepsites, options->datatype);
3073 }
3074
3075 #endif
3076