1 // $Id: HapConverter.cpp,v 1.24 2011/03/07 06:08:48 bobgian Exp $
2 
3 /*
4   Copyright 2002 Patrick Colacurcio, Peter Beerli, Mary Kuhner,
5   Jon Yamato and Joseph Felsenstein
6 
7   This software is distributed free of charge for non-commercial use
8   and is copyrighted.  Of course, we do not guarantee that the software
9   works, and are not responsible for any damage you may cause or have.
10 */
11 
12 #include <cassert>
13 
14 #include "Converter_HapConverter.h"
15 #include "Converter_DataSourceException.h"
16 #include "Converter_ParserUtil.h"
17 #include "random.h"
18 #include "constants.h"
19 
20 //------------------------------------------------------------------------------------
21 
HapConverter(RegionDS & region,Random & random)22 HapConverter::HapConverter(RegionDS& region, Random& random) :
23     ConverterIf(), m_region(region), m_random(random),
24     m_hapnames(region.GetAllSeqNames())
25 {
26     // m_hapnames should end up with the same order of sequences as
27     // they existed in the user file, don't know if this works!
28 
29 } // HapConverter::ctor
30 
31 //------------------------------------------------------------------------------------
32 
~HapConverter()33 HapConverter::~HapConverter()
34 {
35 } // HapConverter::dtor
36 
37 //------------------------------------------------------------------------------------
38 
ParsePhaseInfo(ifstream & input) const39 vector<long> HapConverter::ParsePhaseInfo(ifstream& input) const
40 // EWFIX.P3.BUG.351 -- this doesn't handle the case where you have no
41 // phase known/unknown info, but you're just trying to assign
42 // samples to individuals
43 {
44     skipWhiteSpace(input);
45     string theline;
46     getLine(input, theline);
47 
48     istringstream linestream(theline);
49     string somestring;
50     vector<long> sites;
51     getNumber(linestream,somestring);
52 
53     while(!somestring.empty())
54     {
55         long position = FlagCheck(somestring,"marker position");
56         sites.push_back(position);
57         skipWhiteSpace(linestream);
58         somestring.erase();
59         getNumber(linestream,somestring);
60     }
61 
62     return sites;
63 
64 } // HapConverter::ParsePhaseInfo
65 
66 //------------------------------------------------------------------------------------
67 
PopHapNames(long ntopop)68 vector<string> HapConverter::PopHapNames(long ntopop)
69 {
70     vector<string>::iterator start = m_hapnames.begin(),
71         stop = m_hapnames.begin()+ntopop;
72     vector<string> hnames(start,stop);
73     m_hapnames.erase(start,stop);
74 
75     return hnames;
76 
77 } // HapConverter::PopHapNames
78 
79 //------------------------------------------------------------------------------------
80 
PopTilDelimiter(istream & input,const char & dlm) const81 string HapConverter::PopTilDelimiter(istream& input, const char& dlm) const
82 {
83     char ch(input.get());
84     string buffer;
85 
86     while((ch != EOF) && (ch != dlm))
87     {
88         buffer += ch;
89         ch = input.get();
90     }
91 
92     if (ch == EOF) input.putback(ch);
93 
94     return buffer;
95 
96 } // HapConverter::PopTilDelimiter
97 
98 //------------------------------------------------------------------------------------
99 
ParseFirstLine(istringstream & firstline,const string & filename,ifstream & filestr)100 IndDSVec HapConverter::ParseFirstLine(istringstream& firstline, const string& filename, ifstream& filestr)
101 {
102     string somestring;
103     IndDSVec individuals;
104 
105     if (getNumber(firstline,somestring))
106     {
107         m_nindividuals = FlagCheck(somestring, string("number of individuals"));
108     }
109     else
110     {
111         string errormsg = "The file " + filename + " is incorrectedly ";
112         errormsg += "formated.\nThe first token must be an integer ";
113         errormsg += "equal to the number of individuals in the file.";
114         throw FileFormatError(errormsg);
115     }
116 
117     //  Its possible that there will be an adjacency key.
118     somestring.erase();
119     if (getWord(firstline, somestring))
120     {
121         if (CaselessStrCmp(somestring,"adjacent"))
122         {
123             long nhaps;
124             // adjacent sequences are haplotypes
125             somestring.erase();
126             if (getNumber(firstline,somestring))
127                 nhaps = FlagCheck(somestring,string("number of haplotypes"));
128             else
129             {
130                 string errormsg = "The file " + filename + " is ";
131                 errormsg += "incorrectly formated.\n";
132                 errormsg += "The converter expected to find the";
133                 errormsg += " global number of haplotypes for each";
134                 errormsg += " individual.";
135                 throw FileFormatError(errormsg);
136             }
137 
138             // now get phase information for all individuals
139             long nsites = m_region.getNmarkers();
140             vector<long> psites(nsites);
141             somestring.erase();
142             if (getWord(firstline,somestring))
143             {
144                 if (CaselessStrCmp(somestring,"all"))
145                 {
146                     for(long site=0; site < nsites; ++site)
147                         psites[site] = site;
148                 }
149                 else
150                 {
151                     string errormsg = "The file " + filename + " is ";
152                     errormsg += "incorrectly formated.\nThe unknown ";
153                     errormsg += "keyword, " + somestring + ", was ";
154                     errormsg += "encountered.";
155                     throw FileFormatError(errormsg);
156                 }
157             }
158             else
159             {
160                 psites = ParsePhaseInfo(filestr);
161             }
162 
163             unsigned long totalhaps = m_nindividuals*nhaps;
164             if (m_hapnames.size() != totalhaps)
165             {
166                 string errormsg = "The first line of " + filename;
167                 errormsg += " specifies " + ToString(m_nindividuals);
168                 errormsg += " individuals\nwith " + ToString(nhaps);
169                 errormsg += " haplotypes each (requiring a total of ";
170                 errormsg += ToString(totalhaps) + " haplotypes).\nThe";
171                 errormsg += " genetic data provides only ";
172                 errormsg += ToString(m_hapnames.size()) + " haplotypes.";
173                 throw FileFormatError(errormsg);
174             }
175 
176             // now setup the individuals
177             for(int ind = 0; ind < m_nindividuals; ++ind)
178             {
179                 string randomname = m_random.Name();
180                 IndividualDS individual(randomname);
181                 vector<string> hnames(PopHapNames(nhaps));
182                 individual.AddHap(hnames);
183                 individual.SetUnknownPhase(psites);
184                 individuals.push_back(individual);
185             }
186         }
187         else
188         {
189             string errormsg = "The file " + filename + " is ";
190             errormsg += "incorrectly formated.\nThe unknown ";
191             errormsg += "keyword, " + somestring + ", was ";
192             errormsg += "encountered.";
193             throw FileFormatError(errormsg);
194         }
195     }
196 
197     return individuals;
198 
199 } // HapConverter::ParseFirstLine
200 
201 //------------------------------------------------------------------------------------
202 
ReadHapInfo(const string & filename)203 IndDSVec HapConverter::ReadHapInfo(const string& filename)
204 {
205     ifstream haplotypefile(filename.c_str(),ios::in);
206 
207     if (!haplotypefile)
208     {
209         string errormsg = "Couldn't find the file: " + filename;
210         throw FileFormatError (errormsg);
211     }
212 
213     string line;
214 
215     if (!getLine(haplotypefile,line))
216     {
217         string errormsg = "The file " + filename + " appears to";
218         errormsg += " be empty.";
219         throw FileFormatError (errormsg);
220     }
221 
222     istringstream linestream(line);
223 
224     IndDSVec individuals(ParseFirstLine(linestream,filename,haplotypefile));
225 
226     if (!individuals.empty()) return individuals;
227 
228     // read in the haplotype name delimiter character;
229     line.erase();
230     getLine(haplotypefile,line);
231     const char hapname_dlm(line[0]);
232 
233     // EWFIX.P3 -- need to insist that hapname_dlm is not a digit
234 
235     // Now start parsing the individuals
236     for(int ind = 0; ind < m_nindividuals; ++ind)
237     {
238         line.erase();
239         if (getLine(haplotypefile,line))
240         {
241             istringstream linestr(line);
242             string somestring;
243             getName(linestr,somestring);
244             IndividualDS individual(somestring);
245 
246             somestring.erase();
247             getNumber(linestr,somestring);
248             long int nhapnames = FlagCheck(somestring,string("number of haplotypes"));
249 
250             for(int hname = 0; hname < nhapnames; ++hname)
251             {
252                 somestring = PopTilDelimiter(linestr,hapname_dlm);
253                 StripLeadingSpaces(somestring);
254                 StripTrailingSpaces(somestring);
255                 individual.AddHap(somestring);
256             }
257 
258             vector<long> sites(ParsePhaseInfo(haplotypefile));
259             individual.SetUnknownPhase(sites);
260 
261             individuals.push_back(individual);
262         }
263         else
264         {
265             string errormsg = "The haplotypefile is incorrectedly formated.\n";
266             errormsg += "The converter expected to find ";
267             errormsg += ToString(m_nindividuals) + " individuals ";
268             errormsg += "but only found " + indexToKey(ind);
269             throw FileFormatError(errormsg);
270         }
271     }
272 
273     return individuals;
274 
275 } // HapConverter::ReadHapInfo
276 
277 //------------------------------------------------------------------------------------
278 
ReplaceIndividualsWith(IndDSVec & individuals)279 void HapConverter::ReplaceIndividualsWith(IndDSVec& individuals)
280 {
281 
282     vector<string> popnames(individuals.size());
283     unsigned long whichind;
284     for(whichind = 0; whichind < individuals.size(); ++whichind)
285     {
286         // can't do population assignment in FailToAdd due to identical
287         // sequence names possibly present
288         if (m_region.FailToAdd(individuals[whichind],popnames[whichind]))
289         {
290             string errormsg = "The haplotypes, ";
291             errormsg += individuals[whichind].GetHapNamesForPrint();
292             errormsg += " were not part of the data set for the";
293             errormsg += " region, " + m_region.getName();
294             throw FileFormatError(errormsg);
295         }
296     }
297 
298     for(whichind = 0; whichind < individuals.size(); ++whichind)
299     {
300         m_region.AddIndividual(individuals[whichind],popnames[whichind]);
301     }
302 
303 } // HapConverter::ReplaceIndividualsWith
304 
305 //------------------------------------------------------------------------------------
306 
addConvertedLamarcDS(LamarcDS &)307 void HapConverter::addConvertedLamarcDS(LamarcDS&)
308 {
309     // DEBUG debug -- currently a no-op function!
310     // lamarc.mergeTo(m_lamarc);
311     assert(false);  // this code never ought to be called
312 } // HapConverter::addConvertedLamarcDS
313 
314 //____________________________________________________________________________________
315