1 #ifndef GETDATA_INCLUDE
2 #include "getdata.h"
3 #endif
4 
5 #ifdef DMEMDEBUG
6 #define MEMDEBUG
7 #include "memdebug.h"
8 #endif
9 
10 unsigned int baseA=0, baseC=1, baseG=2, baseT=3;
11 
12 extern FILE *simlog, *weightfile;
13 extern long locus, population;
14 
15 extern void read_line(FILE *source, char *line);
16 extern void setupmsatdata(msatdata **ms, long numpop, long numloci,
17    long numind);
18 extern void freemsatdata(msatdata *ms);
19 extern void calculate_steps(msatdata *ms);
20 extern boolean isinvariant(option_struct *op, data_fmt *data, long site,
21   creature *cr);
22 
23 
24 /**********************************************************
25  * setupdata() initializes the appropiate data structures */
setupdata(data_fmt * data,char datatype,long numpop,long numloci,long numind,long * numsites,char * title)26 void setupdata(data_fmt *data, char datatype, long numpop, long numloci,
27   long numind, long *numsites, char *title)
28 {
29 
30 switch(datatype) {
31    case 'a':
32       data->dnaptr = NULL;
33       data->msptr = NULL;
34       break;
35    case 'b':
36    case 'm':
37       data->dnaptr = NULL;
38       setupmsatdata(&(data->msptr),numpop,numloci,numind);
39       strcpy(data->msptr->title,title);
40       calculate_steps(data->msptr);
41       break;
42    case 'n':
43    case 's':
44       setupdnadata(&(data->dnaptr),numsites,numind,numloci,numpop);
45       strcpy(data->dnaptr->title,title);
46       data->msptr = NULL;
47       break;
48    default:
49       fprintf(ERRFILE,"setupdata, found an unknown datatype %c\n",
50          datatype);
51       exit(-1);
52 }
53 } /* setupdata */
54 
55 
56 /***********************************************
57  * freedata() frees the general data structure */
freedata(data_fmt * data)58 void freedata(data_fmt *data)
59 {
60 
61 if (data->dnaptr) {freednadata(data->dnaptr); data->dnaptr = NULL;}
62 if (data->msptr) {freemsatdata(data->msptr); data->msptr = NULL;}
63 } /* freedata */
64 
65 
66 /********************************************************************
67  * getdata_numpop() returns the number of populations stored in the *
68  * general data structure or FLAGLONG if it fails.                  */
getdata_numpop(option_struct * op,data_fmt * data)69 long getdata_numpop(option_struct *op, data_fmt *data)
70 {
71 
72 switch(op->datatype)
73 {
74    case 'a':
75       break;
76    case 'b':
77    case 'm':
78       return(data->msptr->numpop);
79       break;
80    case 'n':
81    case 's':
82       return(data->dnaptr->numpop);
83       break;
84    default:
85       return(FLAGLONG);
86       break;
87 }
88 
89 return(FLAGLONG);
90 
91 } /* getdata_numpop */
92 
93 
94 /**************************************************************
95  * getdata_numloci() returns the number of loci stored in the *
96  * general data structure or FLAGLONG if it fails.            */
getdata_numloci(option_struct * op,data_fmt * data)97 long getdata_numloci(option_struct *op, data_fmt *data)
98 {
99 
100 switch(op->datatype)
101 {
102    case 'a':
103       break;
104    case 'b':
105    case 'm':
106       return(data->msptr->numloci);
107       break;
108    case 'n':
109    case 's':
110       return(data->dnaptr->numloci);
111       break;
112    default:
113       return(FLAGLONG);
114       break;
115 }
116 
117 return(FLAGLONG);
118 
119 } /* getdata_numloci */
120 
121 
122 /********************************************************************
123  * getdata_nummarkers() returns the number of markers that the user *
124  * provides in the input data.  FLAGLONG is returned on failure.    */
getdata_nummarkers(option_struct * op,data_fmt * data)125 long getdata_nummarkers(option_struct *op, data_fmt *data)
126 {
127 
128 switch(op->datatype)
129 {
130    case 'a':
131       break;
132    case 'b':
133    case 'm':
134       return(data->msptr->numloci);
135       break;
136    case 'n':
137    case 's':
138       return(data->dnaptr->sites[locus]);
139       break;
140    default:
141       return(FLAGLONG);
142       break;
143 }
144 
145 return(FLAGLONG);
146 
147 } /* getdata_nummarkers */
148 
149 
150 /**********************************************************************
151  * getdata_numseq() returns the number of individual sequences in the *
152  * data stored in the general data structure or FLAGLONG if it fails. */
getdata_numseq(option_struct * op,data_fmt * data)153 long getdata_numseq(option_struct *op, data_fmt *data)
154 {
155 
156 switch(op->datatype)
157 {
158    case 'a':
159       break;
160    case 'b':
161    case 'm':
162       return(data->msptr->numind[population]*NUMCHROM);
163       break;
164    case 'n':
165    case 's':
166       return(data->dnaptr->numseq[population]);
167       break;
168    default:
169       return(FLAGLONG);
170       break;
171 }
172 
173 return(FLAGLONG);
174 
175 } /* getdata_numseq */
176 
177 
178 /********************************************************************
179  * getdata_numtips() returns the number of tips present in the tree *
180  * stored in the general data structure or FLAGLONG if it fails.    */
getdata_numtips(option_struct * op,data_fmt * data)181 long getdata_numtips(option_struct *op, data_fmt *data)
182 {
183 
184 switch(op->datatype)
185 {
186    case 'a':
187       break;
188    case 'b':
189    case 'm':
190       return(data->msptr->numind[population]*NUMCHROM);
191       break;
192    case 'n':
193       if (op->panel)
194          return(data->dnaptr->numseq[population] +
195                 op->numpanel[population]);
196    case 's':
197       return(data->dnaptr->numseq[population]);
198       break;
199    default:
200       return(FLAGLONG);
201       break;
202 }
203 
204 return(FLAGLONG);
205 
206 } /* getdata_numtips */
207 
208 
209 /*******************************************************************
210  * getdata_sitecount() returns the contents of the sitecount array *
211  * at the passed marker or FLAGLONG if it fails.                   */
getdata_sitecount(option_struct * op,data_fmt * data,long marker)212 long getdata_sitecount(option_struct *op, data_fmt *data, long marker)
213 {
214 
215 switch(op->datatype)
216 {
217    case 'a':
218       break;
219    case 'b':
220    case 'm':
221       break;
222    case 'n':
223    case 's':
224       return(data->dnaptr->sitecount[locus][marker]);
225       break;
226    default:
227       return(FLAGLONG);
228       break;
229 }
230 
231 return(FLAGLONG);
232 
233 } /* getdata_sitecount */
234 
235 
236 /*********************************************************************
237  * getdata_markersite() returns the contents of the markersite array *
238  * at the passed marker or FLAGLONG if it fails.                     */
getdata_markersite(option_struct * op,data_fmt * data,long marker)239 long getdata_markersite(option_struct *op, data_fmt *data, long marker)
240 {
241 
242 switch(op->datatype)
243 {
244    case 'a':
245       break;
246    case 'b':
247    case 'm':
248       break;
249    case 'n':
250    case 's':
251       return(data->dnaptr->markersite[locus][marker]);
252       break;
253    default:
254       return(FLAGLONG);
255       break;
256 }
257 
258 return(FLAGLONG);
259 
260 } /* getdata_markersite */
261 
262 
263 /*******************************************************************
264  * getdata_space() returns the length of the passed link stored in *
265  * the general data structure or FLAGLONG if it fails.             */
getdata_space(option_struct * op,data_fmt * data,long target)266 double getdata_space(option_struct *op, data_fmt *data, long target)
267 {
268 
269 switch(op->datatype)
270 {
271    case 'a':
272       break;
273    case 'b':
274    case 'm':
275       return(data->msptr->mspace[population][locus][target]);
276       break;
277    case 'n':
278       if (target == 0) {
279          return(data->dnaptr->sspace[population][locus][target] +
280  data->dnaptr->sspace[population][locus][getdata_nummarkers(op,data)]);
281          break;
282       }
283    case 's':
284       return(data->dnaptr->sspace[population][locus][target]);
285       break;
286    default:
287       return(FLAGLONG);
288       break;
289 }
290 
291 return(FLAGLONG);
292 
293 } /* getdata_space */
294 
295 
296 /***************************************************************
297  * read_spacefile() reads the spacing info from the designated *
298  * spacefile.                                                  *
299  *                                                             *
300  * entry "0" from a spacefile is read as the number of spaces  *
301  * before the first marker.                                    */
read_spacefile(FILE * file,option_struct * op,data_fmt * data)302 void read_spacefile(FILE *file, option_struct *op, data_fmt *data)
303 {
304 long numgaps, gapcnt, markernum;
305 char *input,*tok;
306 dnadata *dna;
307 
308 input = (char *)calloc(LINESIZE,sizeof(char));
309 *input = '\0';
310 
311 dna = data->dnaptr;
312 
313 read_line(file,input);
314 sscanf(input,"%ld%c[^\n]",&numgaps,&(dna->sdlm));
315 
316 for(gapcnt = 0; gapcnt < numgaps; gapcnt++) {
317    read_line(file,input);
318    tok = strtok(input,&dna->sdlm);
319    markernum = atol(tok) - 1;
320 /* if the entry is supposed to be the spacing before the first
321    marker, then put it at its proper place */
322    if (markernum == -1) markernum = getdata_nummarkers(op,data);
323    dna->sspace[population][locus][markernum] =
324                                   (double)atof(strtok(NULL," "));
325 }
326 
327 } /* read_spacefile */
328 
329 
330 /*****************************************************************
331  * longcmp() is a qsort helper for longs, thanks to Peter Beerli */
longcmp(const void * v1,const void * v2)332 int longcmp (const void *v1, const void *v2)
333 {
334   if (*(long *) v1 < *(long *) v2)
335     {
336       return -1;
337     }
338   else
339     {
340       if (*(long *) v1 > *(long *) v2)
341         {
342           return 1;
343         }
344       else
345         return 0;
346     }
347 } /* longcmp */
348 
349 
350 /*********************************************
351  * doublecmp() is a qsort helper for doubles */
doublecmp(const void * v1,const void * v2)352 int doublecmp (const void *v1, const void *v2)
353 {
354   if (*(double *) v1 < *(double *) v2)
355     {
356       return -1;
357     }
358   else
359     {
360       if (*(double *) v1 > *(double *) v2)
361         {
362           return 1;
363         }
364       else
365         return 0;
366     }
367 } /* doublecmp */
368 
369 
370 /********************************************************************
371  * read_flipfile() reads the questionable site information from     *
372  * the passed file.                                                 *
373  *                                                                  *
374  * read_flipfile() can only be called after treesetup().            *
375  *                                                                  *
376  * Expected file format of a flipfile:                              *
377  *    1st line the number of individuals covered by the flipfile    *
378  *                                                                  *
379  *    then, for each individual in the file:                        *
380  *       a line containing one of the following                     *
381  *          "all" -> use if all the site are flippable.             *
382  *                                                                  *
383  *          numbers seperated by spaces -> use if you're specifying *
384  *             which sites are flippable.                           *
385  *             (e.g. "5 4 67 89 32 100" would mean that there are 5 *
386  *             flippable sites with position numbers 4, 32, 67, 89  *
387  *             and 100)                                             *
388  *                                                                  *
389  *          an exclamation point immediately followed by a list of  *
390  *          numbers seperated by spaces -> use if you're specifying *
391  *             which sites are NOT flippable.                       *
392  *             (e.g. "!5 4 67 89 32 100" means that there are 5     *
393  *             non-flippable sites with position numbers 4, 32, 67, *
394  *             89 and 100)                                          *
395  *                                                                  *
396  *    the last line should just contain the word "end".             */
read_flipfile(option_struct * op,data_fmt * data,tree * tr,FILE * file)397 void read_flipfile(option_struct *op, data_fmt *data, tree *tr, FILE *file)
398 {
399 long i, j, whichcreature, numcreatures, site, numsites, temp, *tempa;
400 char *fileline, *fline, *num, char1;
401 creature *cr;
402 
403 fscanf(file,"%ld",&numcreatures);
404 if (numcreatures != getdata_numtips(op,data)/NUMHAPLOTYPES) {
405    fprintf(ERRFILE,"\nERROR--number of sequences in infile and number");
406    fprintf(ERRFILE," of individuals in flipfile don't match!\n\n");
407    exit(-1);
408 }
409 
410 numsites = getdata_nummarkers(op,data);
411 
412 for(whichcreature = 0; whichcreature < numcreatures; whichcreature++) {
413    fline = fileline = (char *)calloc(LINESIZE,sizeof(char));
414    read_line(file,fileline);
415    char1 = fileline[0];
416    if (char1 != 'a' && char1 != '!' && !isdigit(char1)) {
417       fprintf(ERRFILE,"\nERROR--not a valid flipfile entry for line");
418       fprintf(ERRFILE," containing:\n %s\n",fileline);
419       exit(-1);
420    }
421    cr = &(tr->creatures[whichcreature]);
422    switch(char1) {
423       case 'a':
424          cr->numflipsites = numsites;
425          cr->flipsites = (long *)calloc(numsites,sizeof(long));
426          for(site = 0; site < numsites; site++) cr->flipsites[site] = site;
427          break;
428       case '!':
429          fileline++;
430          temp = atol(strtok(fileline," "));
431          cr->numflipsites = numsites-temp;
432          tempa = (long *)calloc(temp,sizeof(long));
433          for(i = 0; i < temp; i++) tempa[i] = atol(strtok(NULL," "));
434          qsort((void *)tempa,temp,sizeof(long),longcmp);
435          cr->flipsites = (long *)calloc(numsites-temp,sizeof(long));
436          for(site = 0, i = 0, j = 0; site < numsites; site++, j++) {
437             if (i < temp) {
438                if (site == tempa[i]) {
439                   j--;
440                   i++;
441                   continue;
442                }
443             }
444             cr->flipsites[j] = site;
445          }
446          free(tempa);
447          break;
448       default: /* assume we're dealing with simply digits */
449          cr->numflipsites = atol(strtok(fileline," "));
450          cr->flipsites = (long *)calloc(cr->numflipsites,sizeof(long));
451          for(site = 0; site < cr->numflipsites; site++) {
452             num = strtok(NULL," ");
453             if (!num) {
454                fprintf(ERRFILE,"ERROR--in flipfile, creature %ld",
455                   whichcreature);
456                fprintf(ERRFILE," %ldth entry, failure to read.\n",
457                   site+1);
458                exit(-1);
459             }
460             cr->flipsites[site] = atol(num);
461             /* cr->flipsites[site] = atol(strtok(NULL," ")); */
462          }
463          qsort((void *)cr->flipsites,cr->numflipsites,sizeof(long),longcmp);
464          break;
465    }
466    free(fline);
467 }
468 
469 
470 } /* read_flipfile */
471 
472 
473 /*******************************************************************
474  * pruneflips() removes "uninteresting" entries from the flippable *
475  * list.  It also "verifies" the marker aliasing for likelihood.   *
476  *                                                                 *
477  * "uninteresting" = invariant                                     */
pruneflips(option_struct * op,data_fmt * data,tree * tr)478 void pruneflips(option_struct *op, data_fmt *data, tree *tr)
479 {
480 long i, whichcreature, numcreatures, site, nummarkers, *fmarkers, temp,
481   *siteptr;
482 creature *cr;
483 boolean noflips;
484 
485 numcreatures = getdata_numtips(op,data)/NUMHAPLOTYPES;
486 nummarkers = getdata_nummarkers(op,data);
487 siteptr = data->siteptr;
488 fmarkers = (long *)calloc(nummarkers,sizeof(long));
489 
490 for(whichcreature = 0; whichcreature < numcreatures; whichcreature++) {
491    cr = &(tr->creatures[whichcreature]);
492    for(site = cr->numflipsites-1; site > -1; site--) {
493       if (isinvariant(op,data,cr->flipsites[site],cr)) {
494          cr->numflipsites--;
495          for(i = site; i < cr->numflipsites; i++) {
496             cr->flipsites[i] = cr->flipsites[i+1];
497          }
498       } else fmarkers[cr->flipsites[site]] = 1;
499    }
500 }
501 
502 /* now to disable datalikelihood aliasing on all remaining flippable
503 markers-if removed then fliphap must do a complete redo of siteptr
504 array, not just rebuild_alias() */
505 for(site = 0; site < nummarkers; site++) {
506     if (!fmarkers[site]) continue;
507 
508     temp = siteptr[site];
509     siteptr[site] = 0;
510     for(i = site+1; i < nummarkers; i++) {
511        if (siteptr[i] == site+1) {
512           siteptr[i] = temp;
513           break;
514        }
515     }
516 }
517 
518 free(fmarkers);
519 
520 noflips = TRUE;
521 for(whichcreature = 0; whichcreature < numcreatures; whichcreature++) {
522    cr = &(tr->creatures[whichcreature]);
523    if (cr->numflipsites != 0) noflips = FALSE;
524 }
525 if (noflips) {
526    fprintf(ERRFILE,"\n\nAll sites specified as phase-unknown are ");
527    fprintf(ERRFILE,"invariant, probable error in specifying flipfile.\n\n");
528    exit(-1);
529 }
530 
531 } /* pruneflips */
532 
533 
534 /*********************************************************
535  * read_indname() reads the individual names of the data */
read_indname(FILE * file,data_fmt * data,long pop,long lowcus,long ind,long nmlength,char datatype)536 void read_indname(FILE *file, data_fmt *data, long pop, long lowcus,
537    long ind, long nmlength, char datatype)
538 {
539 long i=0;
540 
541 switch(datatype) {
542    case 'a':
543       break;
544    case 'b':
545    case 'm':
546       while(i<nmlength)
547          data->msptr->indnames[pop][lowcus][ind][i++]=getc(file);
548       data->msptr->indnames[pop][lowcus][ind][nmlength]='\0';
549       break;
550    case 'n':
551    case 's':
552       while(i<nmlength)
553          data->dnaptr->indnames[pop][lowcus][ind][i++]=getc(file);
554       data->dnaptr->indnames[pop][lowcus][ind][nmlength]='\0';
555       break;
556    default:
557       fprintf(ERRFILE,"read_indname, can't get here!\n");
558       break;
559 }
560 
561 } /* read_indname */
562 
563 
564 /******************************************************************
565  * read_microalleles() actually reads in the microsattellite data */
read_microalleles(FILE * infile,msatdata * data,long pop,long ind)566 void read_microalleles(FILE *infile, msatdata *data, long pop, long ind)
567 {
568 char *input, dlm[2],ddlm[2], *a, *a1, *a2;
569 long lowcus,i;
570 
571 input = (char *) calloc(1,sizeof(char)*(LINESIZE+1));
572 a = (char *) calloc(1,sizeof(char)*LINESIZE);
573 a1 = (char *) calloc(1,sizeof(char)*LINESIZE);
574 a2 = (char *) calloc(1,sizeof(char)*LINESIZE);
575 dlm[0]=data->dlm, dlm[1]='\0';
576 ddlm[0]=' ', ddlm[1]='\0';
577 
578 read_line(infile,input);
579 
580 for(lowcus=0; lowcus<data->numloci; lowcus++) {
581    if(input[0]=='\0') read_line(infile,input);
582    while(isspace((int) *input)) input++;
583    i=0;
584    while(input[i]!=' ' && input[i]!=dlm[0]) {
585       a1[i]=input[i];
586       i++;
587    }
588    a1[i]='\0';
589    input += i;
590    i=0;
591    if(input[i]==dlm[0]){
592       input++;
593       while(input[i]!=' ' && input[i]!='\0') {
594         a2[i]=input[i];
595         i++;
596       }
597       a2[i]='\0';
598       if(a2[0]=='\0') strcpy(a2,a1);
599    input += i;
600    } else strcpy(a2,a1);
601    data->msats[pop][ind][lowcus][0] = atol(a1);
602    data->msats[pop][ind][lowcus][1] = atol(a2);
603 }
604 
605 free(a);
606 free(a1);
607 free(a2);
608 free(input);
609 
610 } /* read_microalleles */
611 
612 
613 /*************************************************************
614  * read_ind_seq() reads in the first line of a dna sequence, *
615  * subsequent lines will be read by finish_read_seq()        */
read_ind_seq(FILE * infile,dnadata * data,option_struct * op,long lowcus,long pop,long ind,long baseread)616 long read_ind_seq(FILE *infile, dnadata *data, option_struct *op,
617    long lowcus, long pop, long ind, long baseread)
618 {
619 long j;
620 char charstate;
621 
622 j = (op->interleaved) ? baseread : 0;
623 charstate = getc(infile);
624 ungetc((int) charstate,infile);
625 while (j < data->sites[lowcus] &&
626        !(op->interleaved && charstate=='\n')) {
627    charstate = getc(infile);
628    if (charstate == '\n') {
629       if(op->interleaved) return j;
630       else charstate=' ';
631    }
632    if (isspace(charstate) || isdigit(charstate))
633         continue;
634    charstate = uppercase(charstate);
635    if ((strchr("ABCDGHKMNRSTUVWXY?O-",(int) charstate)) == NULL){
636         fprintf(ERRFILE,"ERROR: BAD BASE: %c AT POSITION %5ld OF",
637            charstate,j);
638         fprintf(ERRFILE," INDIVIDUAL %3li in POPULATION %ld\n",ind,pop);
639         exit(-1);
640    }
641    data->seqs[pop][lowcus][ind][j++] = charstate;
642 }
643 
644 charstate = getc(infile); /* swallow the \n */
645 return j;
646 
647 } /* read_ind_seq */
648 
649 
650 /***************************************************************
651  * finish_read_seq() finishes reading in the dna data begun by *
652  * read_ind_seq().                                             */
finish_read_seq(FILE * infile,data_fmt * data,option_struct * op,long pop,long baseread)653 void finish_read_seq(FILE *infile, data_fmt *data, option_struct *op,
654    long pop, long baseread)
655 {
656 
657 long ind, baseread2=0, lowcus=0;
658 
659 if(op->interleaved){
660    while(baseread<data->dnaptr->sites[0]) {
661       for(ind=0; ind < data->dnaptr->numseq[pop]; ind++) {
662          baseread2 =
663             read_ind_seq(infile,data->dnaptr,op,lowcus,pop,ind,baseread);
664       }
665       baseread = baseread2;
666    }
667 }
668 
669 for(lowcus = 1; lowcus < getdata_numloci(op,data); lowcus++){
670    baseread=0;
671 /* "population" doesn't exist at this point so... */
672    for(ind=0; ind < data->dnaptr->numseq[pop]; ind++) {
673       read_indname(infile,data,pop,lowcus,ind,NMLNGTH,op->datatype);
674       baseread =
675          read_ind_seq(infile,data->dnaptr,op,lowcus,pop,ind,0);
676    }
677    if(op->interleaved){
678       while(baseread<data->dnaptr->sites[lowcus]){
679          for (ind=0; ind < data->dnaptr->numseq[pop]; ind++) {
680              baseread2 =
681                 read_ind_seq(infile,data->dnaptr,op,lowcus,pop,ind,baseread);
682          }
683          baseread=baseread2;
684       }
685    }
686 }
687 
688 } /* finish_read_seq */
689 
690 
691 /*****************************************************
692  * read_popdata() reads in the data from file INFILE */
read_popdata(FILE * infile,data_fmt * data,long pop,option_struct * op)693 void read_popdata(FILE *infile, data_fmt *data, long pop,
694    option_struct *op)
695 {
696 long ind, baseread=0, lowcus=0, numentries;
697 
698 /* initialization for lint */
699 numentries = 0;
700 
701 switch(op->datatype) {
702    case 'a':
703       break;
704    case 'b':
705    case 'm':
706       numentries = data->msptr->numind[pop];
707       break;
708    case 'n':
709    case 's':
710       numentries = data->dnaptr->numseq[pop];
711       break;
712    default:
713       fprintf(ERRFILE,"read_popdata: can't be here!\n");
714       exit(-1);
715 }
716 
717 for(ind=0; ind < numentries; ind++) {
718    read_indname(infile,data,pop,lowcus,ind,NMLNGTH,op->datatype);
719    switch(op->datatype) {
720    case 'a':
721      /* read_alleles(infile, data,pop, ind); */
722       break;
723    case 'b':
724    case 'm':
725      /* if (data->dlm == '\0') read_alleles(infile, data,pop, ind);
726       else */ read_microalleles(infile,data->msptr,pop,ind);
727       break;
728    case 'n':
729    case 's':
730       baseread=read_ind_seq(infile,data->dnaptr,op,lowcus,pop,ind,0L);
731       break;
732    default:
733       fprintf(stderr,"Wrong datatype, only the types a, m, s");
734       fprintf(stderr," (electrophoretic alleles, \nmicrosatellite data,");
735       fprintf(stderr,"sequence data) are allowed.\n");
736       exit(-1);
737       break;
738    }
739 }
740 if(op->datatype=='s' || op->datatype=='n')
741    finish_read_seq(infile,data,op,pop,baseread);
742 
743 } /* read_popdata */
744 
745 
746 /***********************************************************************
747  * setpopstuff() sets the data structure's population specific fields: *
748  *    # of individuals per population, population title.               */
setpopstuff(data_fmt * data,long pop,long numind,char * poptitle,char datatype)749 void setpopstuff(data_fmt *data, long pop, long numind, char *poptitle,
750    char datatype)
751 {
752 
753 switch(datatype) {
754    case 'a':
755       break;
756    case 'b':
757    case 'm':
758       data->msptr->numind[pop] = numind;
759       strcpy(data->msptr->popnames[pop],poptitle);
760       break;
761    case 'n':
762    case 's':
763       data->dnaptr->numseq[pop] = numind;
764       strcpy(data->dnaptr->popnames[pop],poptitle);
765       break;
766    default:
767       fprintf(ERRFILE,"setpopstuff, found an unknown datatype %c\n",
768          datatype);
769       exit(-1);
770 }
771 
772 } /* setpopstuff */
773 
774 
775 /*****************************************************
776  * setupdnadata() initializes the dna data structure */
setupdnadata(dnadata ** dna,long * numsites,long numseq,long numloci,long numpop)777 void setupdnadata(dnadata **dna, long *numsites, long numseq,
778    long numloci, long numpop)
779 {
780 long i, j, k, maxsites=0;
781 
782 (*dna) = (dnadata *)calloc(1,sizeof(dnadata));
783 
784 (*dna)->title = (char *)calloc(LINESIZE,sizeof(char));
785 (*dna)->popnames = (char **)calloc(numpop,sizeof(char *));
786 (*dna)->popnames[0] = (char *)calloc(numpop*LINESIZE,sizeof(char));
787 for(i = 1; i < numpop; i++)
788    (*dna)->popnames[i] = (*dna)->popnames[0] + i*LINESIZE;
789 
790 (*dna)->numpop = numpop;
791 (*dna)->numloci = numloci;
792 
793 (*dna)->numseq = (long *)calloc(numpop,sizeof(long));
794 /* This is reset for all populations, except the first,
795    by setpopstuff() */
796 for(i = 0; i < numpop; i++) (*dna)->numseq[i] = numseq;
797 
798 (*dna)->sites = (long *)calloc(numloci,sizeof(long));
799 (*dna)->sitecount = (long **)calloc(numloci,sizeof(long *));
800 (*dna)->markersite = (long **)calloc(numloci,sizeof(long *));
801 for(i = 0; i < numloci; i++) {
802    (*dna)->sites[i] = numsites[i];
803    if (numsites[i] > maxsites) maxsites = numsites[i];
804 }
805 (*dna)->sitecount[0] = (long *)calloc(numloci*maxsites,sizeof(long));
806 (*dna)->markersite[0] = (long *)calloc(numloci*maxsites,sizeof(long));
807 for(i = 1; i < numloci; i++) {
808    (*dna)->sitecount[i] = (*dna)->sitecount[0] + i*maxsites;
809    (*dna)->markersite[i] = (*dna)->markersite[0] + i*maxsites;
810 }
811 
812 (*dna)->seqs = (char ****)calloc(numpop,sizeof(char ***));
813 (*dna)->indnames = (char ****)calloc(numpop,sizeof(char ***));
814 (*dna)->sspace = (double ***)calloc(numpop,sizeof(double **));
815 
816 (*dna)->seqs[0] = (char ***)calloc(numpop*numloci,sizeof(char **));
817 (*dna)->indnames[0] = (char ***)calloc(numpop*numloci,sizeof(char **));
818 (*dna)->sspace[0] = (double **)calloc(numpop*numloci,sizeof(double *));
819 for(i = 1; i < numpop; i++) {
820    (*dna)->seqs[i] = (*dna)->seqs[0] + i*numloci;
821    (*dna)->indnames[i] = (*dna)->indnames[0] + i*numloci;
822    (*dna)->sspace[i] = (*dna)->sspace[0] + i*numloci;
823 }
824 
825 (*dna)->seqs[0][0] =
826    (char **)calloc(numpop*numloci*numseq,sizeof(char *));
827 (*dna)->indnames[0][0] =
828    (char **)calloc(numpop*numloci*numseq,sizeof(char *));
829 /* use maxsites+1 in allocating sspace to include spacing before
830    the first marker */
831 (*dna)->sspace[0][0] =
832    (double *)calloc(numpop*numloci*(maxsites+1),sizeof(double));
833 for(i = 0; i < numpop; i++)
834    for(j = 0; j < numloci; j++) {
835       (*dna)->seqs[i][j] = (*dna)->seqs[0][0] + i*numloci*numseq
836          + j*numseq;
837       (*dna)->indnames[i][j] = (*dna)->indnames[0][0] + i*numloci*numseq
838          + j*numseq;
839       (*dna)->sspace[i][j] = (*dna)->sspace[0][0] + i*numloci*(maxsites+1)
840          + j*(maxsites+1);
841    }
842 
843 for(i = 0; i < numpop; i++)
844    for(j = 0; j < numloci; j++) {
845       for(k = 0; k < maxsites; k++)
846          (*dna)->sspace[i][j][k] = 1.0;
847 /* initialize the most leftward space entry to zero, no additional sites
848    before the first marker. */
849       (*dna)->sspace[i][j][(*dna)->sites[i]] = 0.0;
850    }
851 
852 (*dna)->seqs[0][0][0] =
853    (char *)calloc(numpop*numloci*numseq*maxsites,sizeof(char));
854 (*dna)->indnames[0][0][0] =
855    (char *)calloc(numpop*numloci*numseq*(NMLNGTH+1),sizeof(char));
856 for(i = 0; i < numpop; i++)
857    for(j = 0; j < numloci; j++)
858       for(k = 0; k < numseq; k++) {
859          (*dna)->seqs[i][j][k] = (*dna)->seqs[0][0][0]
860              + i*numloci*numseq*maxsites + j*numseq*maxsites + k*maxsites;
861          (*dna)->indnames[i][j][k] = (*dna)->indnames[0][0][0]
862              + i*numloci*numseq*(NMLNGTH+1) + j*numseq*(NMLNGTH+1)
863              + k*(NMLNGTH+1);
864       }
865 
866 (*dna)->dnaweight = (long *)calloc(maxsites,sizeof(long));
867 for(i = 0; i < maxsites; i++) (*dna)->dnaweight[i] = 1;
868 
869 (*dna)->segranges0 = NULL;
870 (*dna)->segranges1 = NULL;
871 (*dna)->segranges2 = NULL;
872 (*dna)->segranges3 = NULL;
873 
874 } /* setupdnadata */
875 
876 
877 /**********************************************
878  * freednadata() frees the dna data structure */
freednadata(dnadata * dna)879 void freednadata(dnadata *dna)
880 {
881 free(dna->popnames[0]);
882 free(dna->popnames);
883 free(dna->title);
884 free(dna->dnaweight);
885 free(dna->numseq);
886 free(dna->sites);
887 free(dna->sspace[0]);
888 free(dna->sspace);
889 free(dna->sitecount[0]);
890 free(dna->sitecount);
891 free(dna->markersite[0]);
892 free(dna->markersite);
893 free(dna->indnames[0][0][0]);
894 free(dna->indnames[0][0]);
895 free(dna->indnames[0]);
896 free(dna->indnames);
897 free(dna->seqs[0][0][0]);
898 free(dna->seqs[0][0]);
899 free(dna->seqs[0]);
900 free(dna->seqs);
901 free(dna->segranges0);
902 free(dna->segranges1);
903 free(dna->segranges2);
904 free(dna->segranges3);
905 free(dna);
906 } /* freednadata */
907 
908 
909 /*******************************************************************
910  * printdnadata() prints the dna sequences contained in the passed *
911  * data structure to the passed file in sequential form.           */
printdnadata(option_struct * op,data_fmt * data,FILE * out)912 void printdnadata(option_struct *op, data_fmt *data, FILE *out)
913 {
914 long pop, numpop, loc, numloci, ind, numind, site, numsite, siteout;
915 dnadata *dna;
916 
917 /* can't use standard functions to set all num* due to use of global
918    "population" */
919 dna = data->dnaptr;
920 numpop = getdata_numpop(op,data);
921 for(pop = 0; pop < numpop; pop++) {
922    fprintf(out,"----------------------------\n");
923    fprintf(out,"Sequences for Population %ld\n",pop+1);
924    fprintf(out,"----------------------------\n");
925    numloci = getdata_numloci(op,data);
926    for(loc = 0; loc < numloci; loc++) {
927       fprintf(out,"Locus %ld -------------------\n",loc+1);
928       numind = dna->numseq[pop];
929       for(ind = 0; ind < numind; ind++) {
930          fprintf(out,"%s",dna->indnames[pop][loc][ind]);
931          numsite = dna->sites[loc];
932          for(site = 0, siteout = NMLNGTH; site < numsite; site++) {
933             fprintf(out,"%c",dna->seqs[pop][loc][ind][site]);
934             siteout++;
935             if (siteout == OUTLINESIZE) {fprintf(out,"\n"); siteout = 0;}
936          }
937          fprintf(out,"\n");
938       }
939    }
940    fprintf(out,"\n");
941 }
942 fprintf(out,"\n\n");
943 
944 } /* printdnadata */
945 
946 
947 /********************************************************************
948  * initinvartips() sets up the special fractional tips for SNP data */
initinvartips(option_struct * op,data_fmt * data,long categs,tree * curtree)949 void initinvartips(option_struct *op, data_fmt *data,  long categs,
950    tree *curtree)
951 {
952 long i, j, k, m, pos, n, numseq, numtips;
953 
954 pos = getdata_nummarkers(op,data);
955 numseq = getdata_numseq(op,data);
956 numtips = getdata_numtips(op,data);
957 
958 if (op->panel) {
959    /* real tips */
960    for(i = 1; i <= numseq; i++) {
961       for (m = pos; m < pos+NUMINVAR; m++) {
962          for(j=0; j < categs; j++) {
963             for (n = 0; n < NUMSLICE; n++) {
964                for (k = baseA; k <= baseT; k++)
965                   curtree->nodep[i]->x->s[m][j][n][k] = 1.0;
966             }
967          }
968       }
969    }
970    /* fake tips */
971    for (i=numseq+1; i <= numtips; i++) {
972       for (m = 0; m < pos+NUMINVAR; m++) {
973          for(j = 0; j < categs; j++) {
974             for(k = baseA; k <= baseT; k++) {
975                curtree->nodep[i]->x->s[m][j][0][k] = 1.0;
976                for(n = 1; n < NUMSLICE; n++)
977                   curtree->nodep[i]->x->s[m][j][n][k] = 0.0;
978             }
979             curtree->nodep[i]->x->s[m][j][1][baseA] = 1.0;
980             curtree->nodep[i]->x->s[m][j][2][baseC] = 1.0;
981             curtree->nodep[i]->x->s[m][j][3][baseG] = 1.0;
982             curtree->nodep[i]->x->s[m][j][4][baseT] = 1.0;
983          }
984       }
985    }
986 } else {
987    for(i = 1; i <= numtips; i++) {
988       for(j = 0; j < categs; j++) {
989          for (m = pos; m < pos+NUMINVAR; m++) {
990             for(k = baseA; k <= baseT; k++) {
991                curtree->nodep[i]->x->s[m][j][0][k] = 0.0;
992             }
993          }
994          curtree->nodep[i]->x->s[pos][j][0][baseA] = 1.0;
995          curtree->nodep[i]->x->s[pos+1][j][0][baseC] = 1.0;
996          curtree->nodep[i]->x->s[pos+2][j][0][baseG] = 1.0;
997          curtree->nodep[i]->x->s[pos+3][j][0][baseT] = 1.0;
998       }
999    }
1000 }
1001 
1002 } /* initinvartips */
1003 
1004 
1005 /*************************************************************
1006  * makednavalues() set up fractional likelihoods at the tips */
makednavalues(option_struct * op,data_fmt * data,long categs,tree * curtree)1007 void makednavalues(option_struct *op, data_fmt *data, long categs,
1008    tree *curtree)
1009 {
1010   long i, k, l, m, numslice;
1011   long b;
1012   char ****y;
1013 
1014   y = data->dnaptr->seqs;
1015   numslice = (op->panel) ? NUMSLICE : 1L;
1016 
1017   for (k = 0; k < getdata_nummarkers(op,data); k++) {
1018     /* real tips are 1 to numseq */
1019     for (i = 1; i <= getdata_numseq(op,data); i++) {
1020       strcpy(curtree->nodep[i]->nayme,
1021           data->dnaptr->indnames[population][locus][i-1]);
1022       for (l = 0; l < categs; l++) {
1023         for (m = 0; m < numslice; m++) {
1024 	  for (b = baseA; b <= baseT; b = b + 1)
1025 	    curtree->nodep[i]->x->s[k][l][m][b] = 0.0;
1026 
1027   	switch (y[population][locus][i-1][k]) {
1028 
1029   	case 'A':
1030   	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1031   	  break;
1032 
1033   	case 'C':
1034   	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1035   	  break;
1036 
1037   	case 'G':
1038   	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1039   	  break;
1040 
1041   	case 'T':
1042   	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1043   	  break;
1044 
1045   	case 'U':
1046   	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1047   	  break;
1048 
1049   	case 'M':
1050   	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1051   	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1052   	  break;
1053 
1054   	case 'R':
1055   	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1056   	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1057   	  break;
1058 
1059   	case 'W':
1060   	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1061   	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1062   	  break;
1063 
1064   	case 'S':
1065   	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1066   	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1067   	  break;
1068 
1069   	case 'Y':
1070   	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1071   	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1072   	  break;
1073 
1074   	case 'K':
1075   	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1076 	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1077 	  break;
1078 
1079 	case 'B':
1080 	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1081 	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1082 	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1083 	  break;
1084 
1085 	case 'D':
1086 	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1087 	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1088 	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1089 	  break;
1090 
1091 	case 'H':
1092 	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1093 	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1094 	  curtree->nodep[i]->x->s[k][l][m][baseT] = 1.0;
1095 	  break;
1096 
1097 	case 'V':
1098 	  curtree->nodep[i]->x->s[k][l][m][baseA] = 1.0;
1099 	  curtree->nodep[i]->x->s[k][l][m][baseC] = 1.0;
1100 	  curtree->nodep[i]->x->s[k][l][m][baseG] = 1.0;
1101 	  break;
1102 
1103 	case 'N':
1104 	  for (b = baseA; b <= baseT; b++)
1105 	    curtree->nodep[i]->x->s[k][l][m][b] = 1.0;
1106 	  break;
1107 
1108 	case 'X':
1109 	  for (b = baseA; b <= baseT; b++)
1110 	    curtree->nodep[i]->x->s[k][l][m][b] = 1.0;
1111 	  break;
1112 
1113 	case '?':
1114 	  for (b = baseA; b <= baseT; b++)
1115 	    curtree->nodep[i]->x->s[k][l][m][b] = 1.0;
1116 	  break;
1117 
1118 	case 'O':
1119 	  for (b = baseA; b <= baseT; b++)
1120 	    curtree->nodep[i]->x->s[k][l][m][b] = 1.0;
1121 	  break;
1122 
1123 	case '-':
1124 	  for (b = baseA; b <= baseT; b++)
1125 	    curtree->nodep[i]->x->s[k][l][m][b] = 1.0;
1126 	  break;
1127 
1128         default:
1129           fprintf(ERRFILE,"**ERROR in setting up tips, probable error");
1130           fprintf(ERRFILE," in input data file.\nInterleaved/sequential");
1131           fprintf(ERRFILE," status may be incorrectly set.\n\n");
1132           exit(-1);
1133           break;
1134         } /* sorry about the indenting */
1135 	}
1136       }
1137     }
1138   }
1139 }  /* makednavalues */
1140 
1141 
1142 /*************************************************************************
1143  * empiricaldnafreqs calculates empirical base frequencies from the data */
empiricaldnafreqs(option_struct * op,data_fmt * data,tree * curtree)1144 void empiricaldnafreqs(option_struct *op, data_fmt *data, tree *curtree)
1145 {
1146   long i, j;
1147   double temp, suma, sumc, sumg, sumt, w;
1148   dnadata *dna;
1149 
1150   dna = data->dnaptr;
1151 
1152   dna->freqa = 0.25;
1153   dna->freqc = 0.25;
1154   dna->freqg = 0.25;
1155   dna->freqt = 0.25;
1156   suma = 0.0;
1157   sumc = 0.0;
1158   sumg = 0.0;
1159   sumt = 0.0;
1160   for (i = 1; i <= getdata_numseq(op,data); i++) {
1161      for (j = 0; j < getdata_nummarkers(op,data); j++) {
1162 	w = dna->dnaweight[j];
1163 	temp = dna->freqa * curtree->nodep[i]->x->s[j][0][0][baseA];
1164 	temp += dna->freqc * curtree->nodep[i]->x->s[j][0][0][baseC];
1165 	temp += dna->freqg * curtree->nodep[i]->x->s[j][0][0][baseG];
1166 	temp += dna->freqt * curtree->nodep[i]->x->s[j][0][0][baseT];
1167 	suma += w * dna->freqa * curtree->nodep[i]->x->s[j][0][0][baseA] / temp;
1168 	sumc += w * dna->freqc * curtree->nodep[i]->x->s[j][0][0][baseC] / temp;
1169 	sumg += w * dna->freqg * curtree->nodep[i]->x->s[j][0][0][baseG] / temp;
1170 	sumt += w * dna->freqt * curtree->nodep[i]->x->s[j][0][0][baseT] / temp;
1171       }
1172     }
1173     temp = suma + sumc + sumg + sumt;
1174     dna->freqa = suma / temp;
1175     dna->freqc = sumc / temp;
1176     dna->freqg = sumg / temp;
1177     dna->freqt = sumt / temp;
1178 
1179 }  /* empiricaldnafreqs */
1180 
1181 
getbasednafreqs(dnadata * dna,option_struct * op,double locus_ttratio,FILE * outfile)1182 void getbasednafreqs(dnadata *dna, option_struct *op, double locus_ttratio,
1183   FILE *outfile)
1184 {
1185   double aa, bb;
1186 
1187   putc('\n', outfile);
1188   if (op->freqsfrom)
1189     fprintf(outfile, "Empirical ");
1190   fprintf(outfile, "Base Frequencies:\n\n");
1191   fprintf(outfile, "   A    %10.5f\n", dna->freqa);
1192   fprintf(outfile, "   C    %10.5f\n", dna->freqc);
1193   fprintf(outfile, "   G    %10.5f\n", dna->freqg);
1194   fprintf(outfile, "  T(U)  %10.5f\n", dna->freqt);
1195   dna->freqr = dna->freqa + dna->freqg;
1196   dna->freqy = dna->freqc + dna->freqt;
1197   dna->freqar = dna->freqa / dna->freqr;
1198   dna->freqcy = dna->freqc / dna->freqy;
1199   dna->freqgr = dna->freqg / dna->freqr;
1200   dna->freqty = dna->freqt / dna->freqy;
1201   fprintf(outfile, "Transition/transversion ratio = %10.6f\n", locus_ttratio);
1202   aa = locus_ttratio * dna->freqr * dna->freqy -
1203        dna->freqa * dna->freqg - dna->freqc * dna->freqt;
1204   bb = dna->freqa * dna->freqgr + dna->freqc * dna->freqty;
1205   dna->xi = aa / (aa + bb);
1206   dna->xv = 1.0 - dna->xi;
1207   dna->ttratio = dna->xi / dna->xv;
1208   if (dna->xi <= 0.0) {
1209     printf("WARNING: This transition/transversion ratio\n");
1210     printf("is impossible with these base frequencies!\n");
1211     dna->xi = 3.0 / 5;
1212     dna->xv = 2.0 / 5;
1213     fprintf(outfile, " Transition/transversion parameter reset\n\n");
1214   }
1215   fprintf(outfile, "(Transition/transversion parameter = %10.6f)\n",
1216 	  dna->xi / dna->xv);
1217   dna->fracchange = dna->xi * (2 * dna->freqa * dna->freqgr +
1218       2 * dna->freqc * dna->freqty) +
1219       dna->xv * (1.0 - dna->freqa * dna->freqa -
1220       dna->freqc * dna->freqc - dna->freqg * dna->freqg -
1221       dna->freqt * dna->freqt);
1222 }  /* getbasednafreqs */
1223 
1224 
uppercase(char ch)1225 char uppercase(char ch)
1226 {
1227 return((islower((int)ch) ? (char)toupper((int)ch) : (ch)));
1228 
1229 } /* uppercase */
1230 
1231 
1232 /******************************************************************
1233  * inputdnaweights() reads the sitewise weights for the dna data. *
1234  * 0-9 and A-Z for weights 0-35.                                  */
inputdnaweights(long numchars,dnadata * dna,option_struct * op)1235 void inputdnaweights(long numchars, dnadata *dna, option_struct *op)
1236 {
1237 char ch;
1238 long i;
1239 
1240 for (i = 0; i < numchars; i++) {
1241    do {
1242       if (eoln(weightfile)) {
1243          fscanf(weightfile, "%*[^\n]");
1244          getc(weightfile);
1245       }
1246       ch = getc(weightfile);
1247       if (ch == '\n') ch = ' ';
1248    } while (ch == ' ');
1249 
1250    dna->dnaweight[i] = 1;
1251    if (isdigit((int)ch))
1252       dna->dnaweight[i] = (long)(ch - '0');
1253    else if (isalpha((int)ch)) {
1254       ch = (char)uppercase(ch);
1255       dna->dnaweight[i] = (long)(ch - 'A' + 10);
1256    } else {
1257       printf("BAD WEIGHT CHARACTER: %c\n", ch);
1258       exit(-1);
1259    }
1260 }
1261 
1262 fscanf(weightfile, "%*[^\n]");
1263 getc(weightfile);
1264 op->weights = TRUE;
1265 fclose(weightfile);
1266 
1267 }  /* inputweights */
1268 
1269 
1270 /************************************************
1271  * hapdist computes a distance between two sets *
1272  * of haplotype resolutions.                    */
hapdist(option_struct * op,data_fmt * hap1,data_fmt * hap2,creature * critter)1273 long hapdist(option_struct *op,data_fmt *hap1, data_fmt *hap2, creature *critter)
1274 {
1275 long mkr, cr, numcreatures, markers;
1276 long score[2];
1277 long fullscore;
1278 
1279 numcreatures = getdata_numtips(op,hap1)/NUMHAPLOTYPES;
1280 markers = getdata_nummarkers(op,hap1);
1281 
1282 fullscore = 0;
1283 for (cr=0; cr<numcreatures; cr++) {
1284    score[0] = 0;
1285    score[1] = 0;
1286    for (mkr=0; mkr < markers; mkr++) {
1287 /* matching the two 0 and the two 1 haplotypes */
1288      if (hap1->dnaptr->seqs[population][locus]
1289        [critter[cr].haplotypes[0]->number-1][mkr] !=
1290          hap2->dnaptr->seqs[population][locus]
1291        [critter[cr].haplotypes[0]->number-1][mkr])
1292        score[0]++;
1293      if (hap1->dnaptr->seqs[population][locus]
1294        [critter[cr].haplotypes[1]->number-1][mkr] !=
1295          hap2->dnaptr->seqs[population][locus]
1296        [critter[cr].haplotypes[1]->number-1][mkr])
1297        score[0]++;
1298 /* matching 0-1 and 1-0 haplotypes */
1299      if (hap1->dnaptr->seqs[population][locus]
1300        [critter[cr].haplotypes[0]->number-1][mkr] !=
1301          hap2->dnaptr->seqs[population][locus]
1302        [critter[cr].haplotypes[1]->number-1][mkr])
1303        score[1]++;
1304      if (hap1->dnaptr->seqs[population][locus]
1305        [critter[cr].haplotypes[1]->number-1][mkr] !=
1306          hap2->dnaptr->seqs[population][locus]
1307        [critter[cr].haplotypes[0]->number-1][mkr])
1308        score[1]++;
1309   }
1310   if (score[0]<score[1]) fullscore += score[0];
1311   else fullscore += score[1];
1312 }
1313 return(fullscore);
1314 
1315 } /* hapdist */
1316 
1317 /********************************************************
1318  * This reads in the "true" haplotypes for the purposes *
1319  * of comparing them with derived ones.  Input file is  *
1320  * truehaps.                                            */
readtruehaps(option_struct * op,data_fmt * hap,long numpop,long numloci,long numind,long * numsites)1321 void readtruehaps(option_struct *op, data_fmt *hap, long numpop, long
1322   numloci, long numind, long *numsites)
1323 {
1324 char *s;
1325 FILE *truehaps;
1326 
1327 s = (char *)calloc(LINESIZE,sizeof(char));
1328 
1329 truehaps = fopen("truehaps","r");
1330 /* read off first 3 lines of standard input file as unneeded */
1331 read_line(truehaps,s); read_line(truehaps,s); read_line(truehaps,s);
1332 
1333 setupdata(hap,op->datatype,numpop,numloci,numind,numsites,"True Haplotypes");
1334 setpopstuff(hap,0L,numind,"poptitle",op->datatype);
1335 read_popdata(truehaps,hap,0L,op);
1336 
1337 free(s);
1338 
1339 } /* readtruehaps */
1340 
1341 /********************************************************
1342  *  This copies a set of haplotypes into a new data     *
1343  *  structure which it allocates.                       */
copyhaps(option_struct * op,data_fmt * oldhap,data_fmt * newhap,long numpop,long numloci,long numind,long * numsites)1344 void copyhaps(option_struct *op, data_fmt *oldhap, data_fmt *newhap,
1345   long numpop, long numloci, long numind, long *numsites)
1346 {
1347 long pop, seq, site, loc;
1348 
1349 setupdata(newhap,op->datatype,numpop, numloci, numind,
1350   numsites, "Copy Haplotypes");
1351 setpopstuff(newhap,0L,numind,"poptitle",op->datatype);
1352 
1353 
1354 for (pop = 0; pop < numpop; pop++) {
1355   for (loc = 0; loc < numloci; loc++) {
1356     for (seq = 0; seq < numind; seq++) {
1357       for (site = 0; site < numsites[locus]; site++) {
1358         newhap->dnaptr->seqs[pop][locus][seq][site] =
1359         oldhap->dnaptr->seqs[pop][locus][seq][site];
1360       }
1361     }
1362   }
1363 }
1364 
1365 } /* copyhaps */
1366