1 /*
2  *  Copyright (C) 2010  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include "PedigreeDescription.h"
19 #include "MapFunction.h"
20 #include "MathVector.h"
21 #include "Constant.h"
22 #include "FortranFormat.h"
23 #include "Error.h"
24 
25 #include <stdlib.h>
26 #include <ctype.h>
27 #include <string.h>
28 #include <math.h>
29 
PedigreeDescription()30 PedigreeDescription::PedigreeDescription()
31 {
32     columnCount = 0;
33     mendelFormat = false;
34 }
35 
~PedigreeDescription()36 PedigreeDescription::~PedigreeDescription()
37 { };
38 
operator =(PedigreeDescription & rhs)39 PedigreeDescription & PedigreeDescription::operator = (PedigreeDescription & rhs)
40 {
41     columnCount = rhs.columnCount;
42 
43     columns = rhs.columns;
44     columnHash = rhs.columnHash;
45 
46     return *this;
47 };
48 
Load(IFILE & input,bool warnIfLinkage)49 void PedigreeDescription::Load(IFILE & input, bool warnIfLinkage)
50 {
51     // Check if we are dealing with a linkage format data file
52     String      buffer;
53     StringArray tokens;
54 
55     mendelFormat = false;
56 
57     ReadLineHelper(input, buffer, tokens);
58     ifrewind(input);
59 
60     if (tokens.Length() == 4 && isdigit(tokens[0][0]))
61     {
62         if (warnIfLinkage) printf("Data file looks like a LINKAGE format file...\n\n");
63         LoadLinkageDataFile(input);
64         return;
65     }
66 
67     if (buffer.Length() > 18
68             && (buffer.SubStr(8,8).SlowCompare("AUTOSOME") == 0 ||
69                 buffer.SubStr(8,8).SlowCompare("X-LINKED") == 0)
70             && (isdigit(buffer[16])  || isdigit(buffer[17]))
71             && (isdigit(buffer[18])  || isdigit(buffer[19]) ||
72                 (buffer.Length() > 19 && isdigit(buffer[20]))))
73     {
74         printf("Data file looks like a MENDEL format file...\n"
75                "   Activating EXPERIMENTAL support for this format\n\n");
76         LoadMendelDataFile(input);
77         return;
78     }
79 
80     // Reset things
81     ifrewind(input);
82     int done = 0;
83     int line = 0;
84 
85     columns.Clear();
86     columnHash.Clear();
87     columnCount = 0;
88 
89     while (!ifeof(input) && !done)
90     {
91         int   i;
92 
93         buffer.ReadLine(input);
94         line++;
95 
96         tokens.Clear();
97         tokens.AddTokens(buffer, WHITESPACE);
98 
99         if (tokens.Length() < 1) continue;
100 
101         if (tokens.Length() == 1)
102             error("Problem reading data file:\n"
103                   "Item #%d (of type %s) has no name.",
104                   columnCount+1, (const char *) tokens[0]);
105 
106         switch (toupper(tokens[0][0]))
107         {
108             case 'A' :
109                 columnHash.Push(GetAffectionID(tokens[1]));
110                 columns.Push(pcAffection);
111                 columnCount++;
112                 break;
113             case 'M' :
114                 columnHash.Push(GetMarkerID(tokens[1]));
115                 columns.Push(pcMarker);
116                 columnCount++;
117                 break;
118             case 'T' :
119                 columnHash.Push(GetTraitID(tokens[1]));
120                 columns.Push(pcTrait);
121                 columnCount++;
122                 break;
123             case 'C' :
124                 columnHash.Push(GetCovariateID(tokens[1]));
125                 columns.Push(pcCovariate);
126                 columnCount++;
127                 break;
128             case '$' :
129                 columnHash.Push(GetStringID(tokens[1]));
130                 columns.Push(pcString);
131                 columnCount++;
132                 break;
133             case 'S' :
134                 i = (int) tokens[0].SubStr(1);
135                 i = i > 0 ? i : 1;
136                 while (i--)
137                 {
138                     columns.Push(pcSkip);
139                     columnHash.Push(0);
140                     columnCount++;
141                 }
142                 break;
143             case 'Z' :
144                 columnHash.Push(0);
145                 columns.Push(pcZygosity);
146                 columnCount++;
147                 break;
148             case 'V' :
149                 GetMarkerID(tokens[1]);
150                 break;
151             case 'E' :
152                 done = 1;
153                 break;
154             case 'U' :
155                 if (toupper(tokens[0][1]) == 'T' && toupper(tokens[0][2]) == 'C')
156                 {
157                     int c = GetCovariateID(tokens[1]);
158                     int t = GetTraitID(tokens[1]);
159 
160                     if (c >= 32767 || t >= 32767)
161                         error("Internal error processing data file\n");
162 
163                     columnHash.Push(t * 32768 + c);
164                     columns.Push(pcUndocumentedTraitCovariate);
165                     columnCount++;
166                     break;
167                 }
168             default :
169                 error("Problem in data file (line %d):\n%s\n",
170                       line, (const char *) buffer);
171         }
172     }
173 
174     columns.Push(pcEnd);
175     columnHash.Push(0);
176 };
177 
Load(const char * iFilename,bool warnIfLinkage)178 void PedigreeDescription::Load(const char * iFilename, bool warnIfLinkage)
179 {
180     IFILE f = ifopen(iFilename, "rb");
181 
182     if (f == NULL)
183         error(
184             "The datafile %s cannot be opened\n\n"
185             "Common causes for this problem are:\n"
186             "  * You might not have used the correct options to specify input file names,\n"
187             "    please check the program documentation for information on how to do this\n\n"
188             "  * The file doesn't exist or the filename might have been misspelt\n\n"
189             "  * The file exists but it is being used by another program which you will need\n"
190             "    to close before continuing\n\n"
191             "  * The file is larger than 2GB and you haven't compiled this application with\n"
192             "    large file support.\n\n",
193             iFilename);
194 
195     Load(f, warnIfLinkage);
196     ifclose(f);
197 
198     filename = iFilename;
199 };
200 
LoadMap(const char * iFilename)201 void PedigreeDescription::LoadMap(const char * iFilename)
202 {
203     IFILE f = ifopen(iFilename, "rb");
204 
205     if (f == NULL)
206         error(
207             "The mapfile %s cannot be opened\n\n"
208             "Please check that the file exists and is not being used by another program\n"
209             "To find out how to set input filenames, check the documentation\n",
210             iFilename);
211 
212     LoadMap(f);
213     ifclose(f);
214 };
215 
LoadMap(IFILE & input)216 void PedigreeDescription::LoadMap(IFILE & input)
217 {
218     columns.Clear();
219     columnHash.Clear();
220     columnCount = 0;
221 
222     int         lastposition = 0;
223     String      buffer;
224     StringArray tokens;
225 
226     buffer.ReadLine(input);
227     tokens.AddTokens(buffer, WHITESPACE);
228 
229     while (tokens.Length() == 0 && !ifeof(input))
230     {
231         buffer.ReadLine(input);
232         tokens.AddTokens(buffer, WHITESPACE);
233     }
234 
235     if (tokens.Length() != 3)
236         error("Error reading map file header, which has %d columns.\n"
237               "Three columns were expected, corresponding to\n"
238               "MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION\n"
239               "The offending header is transcribed below:\n\n"
240               "%s", tokens.Length(), (const char *) buffer);
241     else
242         printf("Map file column labels\n"
243                "  -- COLUMN 1, Expecting MARKER_ID, Read %s\n"
244                "  -- COLUMN 2, Expecting MARKER_NAME, Read %s\n"
245                "  -- COLUMN 3, Expection BASE_PAIR_POSITION, Read %s\n\n",
246                (const char *)(tokens[0]), (const char *)(tokens[1]),
247                (const char *)(tokens[2]));
248 
249     int line = 1;
250     while (!ifeof(input))
251     {
252         int    serial;
253         long   position;
254 
255         buffer.ReadLine(input);
256         line++;
257 
258         tokens.Clear();
259         tokens.AddTokens(buffer, WHITESPACE);
260 
261         if (tokens.Length() < 1) continue;
262         if (tokens.Length() != 3)
263             error("Each line in the map file should have 3 tokens, corresponding\n"
264                   "to MARKER_ID, MARKER_NAME and BASE_PAIR_POSITION respectively\n"
265                   "However, there are %d tokens in line %d, transcribed below:\n\n"
266                   "%s", tokens.Length(), line, (const char *) buffer);
267 
268         serial = (int) tokens[0];
269         if (serial != columnCount + 1)
270             error("Reading Marker Index from Map File...\n"
271                   "Markers should be indexed consecutively starting at 1\n"
272                   "Marker %d does not fit this pattern\n", columnCount + 1);
273 
274         position = (int) tokens[2];
275         if (position < lastposition)
276             error("Reading Marker Position from Map File...\n"
277                   "Marker position should be in base-pairs\n"
278                   "and markers should be in map order\n");
279 
280         // TODO -- store marker locations somewhere!
281         lastposition = position;
282 
283         columnHash.Push(GetMarkerID(tokens[1]));
284         columns.Push(pcMarker);
285         columnCount++;
286 
287         GetMarkerInfo(tokens[1])->position = position * 1e-8;
288     }
289 
290     columns.Push(pcEnd);
291     columnHash.Push(0);
292 };
293 
CountTextColumns()294 int PedigreeDescription::CountTextColumns()
295 {
296     int count = 0;
297 
298     for (int i = 0; i < columnCount; i++, count++)
299         if (columns[i] == pcMarker)
300             count++;
301 
302     return count;
303 }
304 
LoadLinkageDataFile(const char * iFilename)305 void PedigreeDescription::LoadLinkageDataFile(const char * iFilename)
306 {
307     IFILE f = ifopen(iFilename, "rb");
308 
309     if (f == NULL)
310         error(
311             "The linkage format datafile %s cannot be opened\n\n"
312             "Please check that the file exists and is not being used by another program\n"
313             "To find out how to set input filenames, check the documentation\n",
314             iFilename);
315 
316     LoadLinkageDataFile(f);
317     ifclose(f);
318 
319     filename = iFilename;
320 };
321 
LoadLinkageDataFile(IFILE & input)322 void PedigreeDescription::LoadLinkageDataFile(IFILE & input)
323 {
324     columns.Clear();
325     columnHash.Clear();
326     columnCount = 0;
327 
328     String      buffer, label;
329     StringArray tokens;
330 
331     ReadLineHelper(input, buffer, tokens);
332 
333     if (tokens.Length() != 4 || tokens[2].AsInteger() != (int) chromosomeX ||
334             tokens[0].AsInteger() < 0)
335         error("Cannot handle first line of data file\n\n"
336               "Expecting four (4) numeric values, which correspond to:\n"
337               "   num-loci   -- number of loci in the pedigree\n"
338               "                 this value must be positive\n"
339               "   risk-locus -- locus for which risks should be calculated\n"
340               "                 this value will be ignored\n"
341               "   sex-link   -- are the loci sex linked [0 - No, 1 - Yes]\n"
342               "                 %s\n"
343               "   program    -- which LINKAGE program do you want to use?\n"
344               "                 this value will also be ignored\n\n"
345               "The actual input read:\n%s\n",
346               chromosomeX ? "expecting X-linked data, so this value must be ONE (1)"
347               : "expecting autosomal data, so this must be ZERO (0)",
348               (const char *) buffer);
349 
350     int numloci = tokens[0];
351 
352     ReadLineHelper(input, buffer, tokens);
353 
354     if (tokens.Length() != 4 ||
355             tokens[0].AsInteger() != 0 ||
356             tokens[3].AsInteger() != 0)
357         error("Cannot handle second line of data file\n\n"
358               "Expecting four (4) numeric values, which correspond to:\n"
359               "   mutation-model         -- must be zero, corresponding to no mutation\n"
360               "   male-mutation-rate     -- ignored\n"
361               "   female-mutation-rate   -- ignored\n"
362               "   linkage-disequilibrium -- must be zero, may be used in the future to\n"
363               "                             read haplotype frequencies\n\n"
364               "The actual input read:\n%s\n", (const char *) buffer);
365 
366     StringArray markerOrder;
367     int         unknown = 0;
368 
369     ReadLineHelper(input, buffer, markerOrder);
370 
371     if (markerOrder.Length() > numloci)
372         error("The third line of the data file lists marker order\n\n"
373               "Although %d loci are defined [in the first line],\n"
374               "this line includes %d values:\n%s\n",
375               numloci, markerOrder.Length(), (const char *) buffer);
376 
377     IntArray    locus;
378     bool need_blank_line = false;
379 
380     while (!ifeof(input) && numloci--)
381     {
382         if (ReadLineHelper(input, buffer, tokens) == 0)
383             error("Linkage data file ends unexpectedly");
384 
385         if (tokens.Length() < 2)
386             error("Incomplete locus information in data file\n"
387                   "Information for each locus should include 2 or more fiels\n"
388                   "The expected fields are:\n"
389                   "   field_type  -- indicator of locus type (trait, marker,...)\n"
390                   "   alleles     -- number of alleles\n"
391                   "   name        -- locus name, preceded by hash (#) sign\n\n"
392                   "The actual input read:\n%s\n", (const char *) buffer);
393 
394         int locus_type = (int) tokens[0];
395         int alleles    = (int) tokens[1];
396 
397         String locus_name("LOCUS");
398         locus_name += ++unknown;
399 
400         if (tokens.Length() > 2 && tokens[2][0] == '#')
401         {
402             if (tokens[2][1] != 0)
403                 locus_name = tokens[2].SubStr(1);
404             else if (tokens.Length() > 3)
405                 locus_name = tokens[3];
406         }
407 
408         if ((locus_type == 4 && alleles == 0) ||
409                 (locus_type == 4 && alleles == 1))
410         {
411             columnHash.Push(GetCovariateID(locus_name));
412             columns.Push(pcCovariate);
413             columnCount++;
414             continue;
415         }
416 
417         if (locus_type == 0 && alleles == 0)
418         {
419             columnHash.Push(GetTraitID(locus_name));
420             columns.Push(pcTrait);
421             columnCount++;
422             continue;
423         }
424 
425         if (ReadLineHelper(input, buffer, tokens) != alleles)
426             error("Expecting %d allele frequencies, but input has %d columns:\n"
427                   "%s\n", alleles, tokens.Length(), (const char *) buffer);
428 
429         Vector frequencies(alleles + 1);
430 
431         frequencies[0] = 0.0;
432         for (int i = 1; i <= alleles; i++)
433             frequencies[i] = (double) tokens[i - 1];
434 
435         double sum = frequencies.Sum();
436 
437         if (sum <= 0.0)
438             error("Allele frequencies at %s sum to %f, which doesn't make sense\n",
439                   (const char *) locus_name, sum);
440 
441         if (fabs(sum - 1.0) > 1.2e-5)
442         {
443             printf("Allele frequencies at %s sum to %f, adjusted to 1.0\n",
444                    (const char *) locus_name, sum);
445             need_blank_line = true;
446         }
447 
448         if (sum != 1.0)
449             frequencies *= 1.0 / sum;
450 
451         switch (locus_type)
452         {
453             case 1 :
454             {
455                 // Affection
456                 columnHash.Push(GetAffectionID(locus_name));
457                 columns.Push(pcAffection);
458                 columnCount++;
459 
460                 // Read number of liability classes
461                 if (ReadLineHelper(input, buffer, tokens) == 0)
462                     error("Linkage data file ends unexpectedly\n");
463 
464                 // Skip liability class data
465                 int classes = tokens[0];
466                 if (classes > 1)
467                 {
468                     columnHash.Push(0);
469                     columns.Push(pcSkip);
470                     columnCount++;
471                 }
472 
473                 // Separate liability class rows for males and females for X-linked data
474                 if (chromosomeX) classes *= 2;
475 
476                 while (classes--)
477                     if (ReadLineHelper(input, buffer, tokens) == 0)
478                         error("Linkage data file ends unexpectedly\n");
479 
480                 // Ignore map location for quantitative variables
481                 locus.Push(-1);
482             }
483             break;
484             case 3 :
485             {
486                 columnHash.Push(GetMarkerID(locus_name));
487                 columns.Push(pcMarker);
488                 columnCount++;
489 
490                 // Store allele frequencies
491                 MarkerInfo * info = GetMarkerInfo(locus_name);
492 
493                 info->freq = frequencies;
494 
495                 // Initialize allele labels
496                 info->alleleLabels.Clear();
497                 for (int i = 0; i < frequencies.Length(); i++)
498                     info->alleleLabels.Push(label = i);
499                 info->IndexAlleles();
500 
501                 // Store marker id, so that we can track map location
502                 locus.Push(GetMarkerID(locus_name));
503             }
504             break;
505             case 0 :
506             {
507                 // Read number of quantitative variables
508                 if (ReadLineHelper(input, buffer, tokens) == 0)
509                     error("Linkage data file ends unexpectedly\n");
510 
511                 // Add each quantitative variable to pedigree
512                 // Discard information on means
513                 for (int vars = tokens[0], i = 0; i < vars; i++)
514                 {
515                     if (ReadLineHelper(input, buffer, tokens) == 0)
516                         error("Linkage data file ends unexpectedly\n");
517 
518                     String trait_name(locus_name);
519 
520                     if (i)
521                     {
522                         trait_name += ".";
523                         trait_name += i + 1;
524                     }
525 
526                     columnHash.Push(GetTraitID(trait_name));
527                     columns.Push(pcTrait);
528                     columnCount++;
529                 }
530 
531                 // Skip var-covar matrix
532                 if (ReadLineHelper(input, buffer, tokens) == 0)
533                     error("Linkage data file ends unexpectedly\n");
534 
535                 // Skip heterozygote scaling factor for var-covar matrix
536                 if (ReadLineHelper(input, buffer, tokens) == 0)
537                     error("Linkage data file ends unexpectedly\n");
538 
539                 // Ignore map location for quantitative variables
540                 locus.Push(-1);
541             }
542             break;
543             case 2 :
544                 error("The data file includes binary factors\n"
545                       "Regretably, loci of this type are not supported\n\n");
546                 break;
547             default :
548                 error("Unsupported locus type [%d] in data file", locus_type);
549                 break;
550         }
551     }
552 
553     if (need_blank_line) printf("\n");
554 
555     columns.Push(pcEnd);
556     columnHash.Push(0);
557 
558     ReadLineHelper(input, buffer, tokens);
559     int sexDifference = tokens.Length() ? tokens[0].AsInteger() : -1;
560     if (tokens.Length() != 2 ||
561             (sexDifference != 0 && sexDifference != 2) ||
562             tokens[1].AsInteger() != 0)
563         error("Error retrieving recombination information\n\n"
564               "Expecting two (2) numeric values, which correspond to:\n"
565               "   sex-difference   -- must be zero (no difference) or two (sex specific recombination)\n"
566               "   map-function     -- must be zero, that is, no interference\n"
567               "The actual input read:\n%s\n", (const char *) buffer);
568 
569     Vector distances[2];
570     bool distance_in_centimorgans = false;
571 
572     for (int r = 0; r <= sexDifference; r += 2)
573     {
574         ReadLineHelper(input, buffer, tokens);
575         if (tokens.Length() != markerOrder.Length() - 1)
576             error("Error retrieving recombination information\n\n"
577                   "Expecting %d recombination fractions (current map includes %d loci)\n"
578                   "Instead the following line was input:\n%s\n",
579                   markerOrder.Length() - 1, markerOrder.Length(), (const char *) buffer);
580 
581         distances[r >> 1].Dimension(tokens.Length());
582         for (int i = 0; i < tokens.Length(); i++)
583             distances[r >> 1][i] = (double) tokens[i];
584 
585         if (distances[r >> 1].Min() < 0.0)
586             error("Linkage datafile specifies negative recombination fractions");
587 
588         bool centimorgans = distances[r >> 1].Max() > 0.5;
589 
590         if (centimorgans && !distance_in_centimorgans)
591             printf("  Some recombination fractions in datafile are greater than 0.5,\n"
592                    "  so recombination fractions will be interpreted as cM distances\n\n");
593 
594         distance_in_centimorgans |= centimorgans;
595     }
596 
597     double position = 0.0, positionMale = 0.0;
598 
599     for (int i = 0, moving = false; i < markerOrder.Length(); i++)
600     {
601         int m = markerOrder[i].AsInteger() - 1;
602 
603         if (m < 0 || m >= locus.Length())
604             error("The marker order in the linkage datafile is invalid\n");
605 
606         m = locus[m];
607 
608         if (m != -1)
609         {
610             MarkerInfo * info = GetMarkerInfo(m);
611             info->chromosome = chromosomeX ? 9999 : 0;
612 
613             if (sexDifference == 2)
614                 info->position = (position + positionMale) * 0.5,
615                                  info->positionFemale = position,
616                                                         info->positionMale = positionMale;
617             else
618                 info->position = info->positionMale = info->positionFemale = position;
619 
620             moving = true;
621         }
622 
623         if (i < markerOrder.Length() - 1 && moving)
624             position += distance_in_centimorgans ?
625                         0.01 * distances[0][i] : RecombinationToDistance(distances[0][i]);
626 
627         if (sexDifference == 2 && i < markerOrder.Length() - 1 && moving)
628             positionMale += distance_in_centimorgans ?
629                             0.01 * distances[1][i] : RecombinationToDistance(distances[1][i]);
630     }
631 }
632 
ReadLineHelper(IFILE & input,String & buffer,StringArray & tokens)633 int PedigreeDescription::ReadLineHelper(IFILE & input,
634                                         String & buffer,
635                                         StringArray & tokens)
636 {
637     do
638     {
639         // Read Line
640         buffer.ReadLine(input);
641         buffer.Trim();
642 
643         // Strip comments marked with >>
644         int pos = buffer.FastFind(">>");
645         if (pos == -1) pos = buffer.FastFind("<<");
646         if (pos == -1) pos = buffer.Length() + 1;
647         if (buffer[0] == '#') pos = 0;
648 
649         // Find space/tab delimited tokens
650         tokens.Clear();
651         tokens.AddTokens(buffer.Left(pos - 1), WHITESPACE);
652 
653     }
654     while (tokens.Length() == 0 && !ifeof(input));
655 
656     return tokens.Length();
657 }
658 
LoadMendelDataFile(const char * iFilename)659 void PedigreeDescription::LoadMendelDataFile(const char * iFilename)
660 {
661     IFILE f = ifopen(iFilename, "rb");
662 
663     if (f == NULL)
664         error(
665             "The MENDEL format datafile %s cannot be opened\n\n"
666             "Please check that the file exists and is not being used by another program\n"
667             "To find out how to set input filenames, check the documentation\n",
668             iFilename);
669 
670     LoadMendelDataFile(f);
671     ifclose(f);
672 };
673 
LoadMendelDataFile(IFILE & file)674 void PedigreeDescription::LoadMendelDataFile(IFILE & file)
675 {
676     // Processes mendel format file
677     mendelFormat = true;
678 
679     // Codominant markers are mapped to markers
680     // Non-codominant markers are mapped into multiple "affection status"
681     // (Y/N) variables
682     columns.Clear();
683     columnHash.Clear();
684     columnCount = 0;
685 
686     FortranFormat parser;
687 
688     // Variables for storing parsed input
689     String locusName;
690     String locusType;
691     String alleleLabel;
692     String alleleFreq;
693     String phenotype;
694     String genotype;
695     int phenoCount;
696     int alleleCount;
697 
698     while (!ifeof(file))
699     {
700         // Cycle through headers for each locus
701         parser.SetInputFile(file);
702         parser.SetFormat("(2A8,I2,I3)");
703 
704         // After retrieving locus name, check that we haven't tried to
705         // read past the end-of-file
706         parser.GetNextField(locusName);
707         parser.GetNextField(locusType);
708         alleleCount = parser.GetNextInteger();
709         phenoCount = parser.GetNextInteger();
710 
711         if (locusName.IsEmpty() && locusType.IsEmpty() && alleleCount == 0 &&
712                 phenoCount == 0 && ifeof(file))
713             break;
714 
715         // Only recognize autosomal and x-linked loci
716         if (locusType.Compare("AUTOSOME") != 0 && locusType.Compare("X-LINKED"))
717             error("Unrecognized locus type '%s' in Mendel data file\n\n"
718                   "Recognized locus types are \"AUTOSOME\" and \"X-LINKED\".",
719                   (const char *) locusType);
720 
721         if (locusType.Compare("AUTOSOME") == 0 && chromosomeX)
722             error("The data file indicates that locus %s is AUTOSOMAL, but\n"
723                   "X-LINKED loci were expected as input\n",
724                   (const char *) locusName);
725 
726         if (locusType.Compare("X-LINKED") == 0 && !chromosomeX)
727             error("The data file indicates that locus %s is X-LINKED, but\n"
728                   "AUTOSOMAL loci were expected as input\n",
729                   (const char *) locusName);
730 
731         if (locusName.IsEmpty())
732             error("Blank locus name encountered in data file\n");
733 
734         if (phenoCount == 0)
735         {
736             // Co-dominant marker
737             columns.Push(pcMarker);
738             columnHash.Push(GetMarkerID(locusName));
739             columnCount++;
740 
741             // Update marker info with allele labels and frequencies
742             MarkerInfo * info = GetMarkerInfo(locusName);
743 
744             info->alleleLabels.Clear();
745             info->alleleLabels.Push("");
746             info->freq.Clear();
747 
748             parser.SetFormat("(2A8)");
749 
750             // Mendel allows allele names to be specified with frequencies
751             // left blank
752             for (int i = 0; i < alleleCount; i++)
753             {
754                 parser.GetNextField(alleleLabel);
755                 parser.GetNextField(alleleFreq);
756 
757                 if (alleleLabel.IsEmpty())
758                     error("Locus %s is missing allele label for allele #%d\n",
759                           (const char *) locusName, i+1);
760 
761                 info->alleleLabels.Push(alleleLabel);
762 
763                 if (!alleleFreq.IsEmpty())
764                 {
765                     if (info->freq.Length() == 0)
766                         info->freq.Push(0.0);
767 
768                     info->freq.Push(alleleFreq.AsDouble());
769                 }
770             }
771             info->IndexAlleles();
772 
773             if (info->alleleLabels.Length() != info->freq.Length() &&
774                     info->freq.Length() != 0)
775                 error("Locus %s is missing allele frequency information for %d alleles\n",
776                       (const char *) locusName,
777                       info->alleleLabels.Length() - info->freq.Length());
778         }
779         else
780         {
781             // Non-codominant marker, which we decompose into multiple traits...
782             parser.SetFormat("(2A8)");
783 
784             // First skip allele frequency information
785             for (int i = 0; i < alleleCount; i++)
786             {
787                 parser.GetNextField(alleleLabel);
788                 parser.GetNextField(alleleFreq);
789             }
790 
791             // Then read in each phenotype
792             for (int i = 0; i < alleleCount; i++)
793             {
794                 parser.SetFormat("(A8,I3)");
795                 parser.GetNextField(phenotype);
796                 int genoCount = parser.GetNextInteger();
797 
798                 parser.SetFormat("(A17)");
799                 for (int j = 0; j < genoCount; j++)
800                     parser.GetNextField(genotype);
801 
802                 columns.Push(pcAffection);
803                 columnHash.Push(GetAffectionID(locusName + "->" + phenotype));
804                 columnCount++;
805             }
806         }
807     }
808 
809     columns.Push(pcEnd);
810     columnHash.Push(0);
811 }
812 
CountColumns(int type)813 int PedigreeDescription::CountColumns(int type)
814 {
815     int count = 0;
816 
817     for (int i = 0; i < columns.Length(); i++)
818         if (columns[i] == type)
819             count++;
820 
821     return count;
822 }
823 
ColumnSummary(String & string)824 const char * PedigreeDescription::ColumnSummary(String & string)
825 {
826     string.Clear();
827     UpdateSummary(string, pcMarker, " markers [x2 cols]");
828     UpdateSummary(string, pcTrait, " traits");
829     UpdateSummary(string, pcAffection, " discrete traits");
830     UpdateSummary(string, pcCovariate, " covariates");
831     UpdateSummary(string, pcString, " strings");
832     UpdateSummary(string, pcZygosity, " zygosity");
833     UpdateSummary(string, pcSkip, " skipped");
834     return string;
835 }
836 
UpdateSummary(String & string,int type,const char * label)837 void PedigreeDescription::UpdateSummary(String & string, int type, const char * label)
838 {
839     int count = CountColumns(type);
840 
841     if (count)
842     {
843         if (string.Length())
844             string += ", ";
845         string += count;
846         string += label;
847     }
848 }
849 
850 
AddMarkerColumn(const char * markerName)851 void PedigreeDescription::AddMarkerColumn(const char * markerName)
852 {
853     if (columns.Last() == pcEnd)
854     {
855         columns.Pop();
856         columnHash.Pop();
857     }
858 
859     columnHash.Push(GetMarkerID(markerName));
860     columns.Push(pcMarker);
861     columnCount++;
862 }
863 
AddCovariateColumn(const char * covariateName)864 void PedigreeDescription::AddCovariateColumn(const char * covariateName)
865 {
866     if (columns.Last() == pcEnd)
867     {
868         columns.Pop();
869         columnHash.Pop();
870     }
871 
872     columnHash.Push(GetCovariateID(covariateName));
873     columns.Push(pcCovariate);
874     columnCount++;
875 }
876 
AddTraitColumn(const char * traitName)877 void PedigreeDescription::AddTraitColumn(const char * traitName)
878 {
879     if (columns.Last() == pcEnd)
880     {
881         columns.Pop();
882         columnHash.Pop();
883     }
884 
885     columnHash.Push(GetCovariateID(traitName));
886     columns.Push(pcTrait);
887     columnCount++;
888 }
889 
AddAffectionColumn(const char * affectionName)890 void PedigreeDescription::AddAffectionColumn(const char * affectionName)
891 {
892     if (columns.Last() == pcEnd)
893     {
894         columns.Pop();
895         columnHash.Pop();
896     }
897 
898     columnHash.Push(GetAffectionID(affectionName));
899     columns.Push(pcAffection);
900     columnCount++;
901 }
902 
AddStringColumn(const char * stringName)903 void PedigreeDescription::AddStringColumn(const char * stringName)
904 {
905     if (columns.Last() == pcEnd)
906     {
907         columns.Pop();
908         columnHash.Pop();
909     }
910 
911     columnHash.Push(GetStringID(stringName));
912     columns.Push(pcString);
913     columnCount++;
914 }
915 
AddZygosityColumn()916 void PedigreeDescription::AddZygosityColumn()
917 {
918     if (columns.Last() == pcEnd)
919     {
920         columns.Pop();
921         columnHash.Pop();
922     }
923 
924     columnHash.Push(0);
925     columns.Push(pcZygosity);
926     columnCount++;
927 }
928 
AddSkippedColumn()929 void PedigreeDescription::AddSkippedColumn()
930 {
931     if (columns.Last() == pcEnd)
932     {
933         columns.Pop();
934         columnHash.Pop();
935     }
936 
937     columnHash.Push(0);
938     columns.Push(pcSkip);
939     columnCount++;
940 }
941 
942