1 // $Id: HapConverter.cpp,v 1.24 2011/03/07 06:08:48 bobgian Exp $
2
3 /*
4 Copyright 2002 Patrick Colacurcio, Peter Beerli, Mary Kuhner,
5 Jon Yamato and Joseph Felsenstein
6
7 This software is distributed free of charge for non-commercial use
8 and is copyrighted. Of course, we do not guarantee that the software
9 works, and are not responsible for any damage you may cause or have.
10 */
11
12 #include <cassert>
13
14 #include "Converter_HapConverter.h"
15 #include "Converter_DataSourceException.h"
16 #include "Converter_ParserUtil.h"
17 #include "random.h"
18 #include "constants.h"
19
20 //------------------------------------------------------------------------------------
21
HapConverter(RegionDS & region,Random & random)22 HapConverter::HapConverter(RegionDS& region, Random& random) :
23 ConverterIf(), m_region(region), m_random(random),
24 m_hapnames(region.GetAllSeqNames())
25 {
26 // m_hapnames should end up with the same order of sequences as
27 // they existed in the user file, don't know if this works!
28
29 } // HapConverter::ctor
30
31 //------------------------------------------------------------------------------------
32
~HapConverter()33 HapConverter::~HapConverter()
34 {
35 } // HapConverter::dtor
36
37 //------------------------------------------------------------------------------------
38
ParsePhaseInfo(ifstream & input) const39 vector<long> HapConverter::ParsePhaseInfo(ifstream& input) const
40 // EWFIX.P3.BUG.351 -- this doesn't handle the case where you have no
41 // phase known/unknown info, but you're just trying to assign
42 // samples to individuals
43 {
44 skipWhiteSpace(input);
45 string theline;
46 getLine(input, theline);
47
48 istringstream linestream(theline);
49 string somestring;
50 vector<long> sites;
51 getNumber(linestream,somestring);
52
53 while(!somestring.empty())
54 {
55 long position = FlagCheck(somestring,"marker position");
56 sites.push_back(position);
57 skipWhiteSpace(linestream);
58 somestring.erase();
59 getNumber(linestream,somestring);
60 }
61
62 return sites;
63
64 } // HapConverter::ParsePhaseInfo
65
66 //------------------------------------------------------------------------------------
67
PopHapNames(long ntopop)68 vector<string> HapConverter::PopHapNames(long ntopop)
69 {
70 vector<string>::iterator start = m_hapnames.begin(),
71 stop = m_hapnames.begin()+ntopop;
72 vector<string> hnames(start,stop);
73 m_hapnames.erase(start,stop);
74
75 return hnames;
76
77 } // HapConverter::PopHapNames
78
79 //------------------------------------------------------------------------------------
80
PopTilDelimiter(istream & input,const char & dlm) const81 string HapConverter::PopTilDelimiter(istream& input, const char& dlm) const
82 {
83 char ch(input.get());
84 string buffer;
85
86 while((ch != EOF) && (ch != dlm))
87 {
88 buffer += ch;
89 ch = input.get();
90 }
91
92 if (ch == EOF) input.putback(ch);
93
94 return buffer;
95
96 } // HapConverter::PopTilDelimiter
97
98 //------------------------------------------------------------------------------------
99
ParseFirstLine(istringstream & firstline,const string & filename,ifstream & filestr)100 IndDSVec HapConverter::ParseFirstLine(istringstream& firstline, const string& filename, ifstream& filestr)
101 {
102 string somestring;
103 IndDSVec individuals;
104
105 if (getNumber(firstline,somestring))
106 {
107 m_nindividuals = FlagCheck(somestring, string("number of individuals"));
108 }
109 else
110 {
111 string errormsg = "The file " + filename + " is incorrectedly ";
112 errormsg += "formated.\nThe first token must be an integer ";
113 errormsg += "equal to the number of individuals in the file.";
114 throw FileFormatError(errormsg);
115 }
116
117 // Its possible that there will be an adjacency key.
118 somestring.erase();
119 if (getWord(firstline, somestring))
120 {
121 if (CaselessStrCmp(somestring,"adjacent"))
122 {
123 long nhaps;
124 // adjacent sequences are haplotypes
125 somestring.erase();
126 if (getNumber(firstline,somestring))
127 nhaps = FlagCheck(somestring,string("number of haplotypes"));
128 else
129 {
130 string errormsg = "The file " + filename + " is ";
131 errormsg += "incorrectly formated.\n";
132 errormsg += "The converter expected to find the";
133 errormsg += " global number of haplotypes for each";
134 errormsg += " individual.";
135 throw FileFormatError(errormsg);
136 }
137
138 // now get phase information for all individuals
139 long nsites = m_region.getNmarkers();
140 vector<long> psites(nsites);
141 somestring.erase();
142 if (getWord(firstline,somestring))
143 {
144 if (CaselessStrCmp(somestring,"all"))
145 {
146 for(long site=0; site < nsites; ++site)
147 psites[site] = site;
148 }
149 else
150 {
151 string errormsg = "The file " + filename + " is ";
152 errormsg += "incorrectly formated.\nThe unknown ";
153 errormsg += "keyword, " + somestring + ", was ";
154 errormsg += "encountered.";
155 throw FileFormatError(errormsg);
156 }
157 }
158 else
159 {
160 psites = ParsePhaseInfo(filestr);
161 }
162
163 unsigned long totalhaps = m_nindividuals*nhaps;
164 if (m_hapnames.size() != totalhaps)
165 {
166 string errormsg = "The first line of " + filename;
167 errormsg += " specifies " + ToString(m_nindividuals);
168 errormsg += " individuals\nwith " + ToString(nhaps);
169 errormsg += " haplotypes each (requiring a total of ";
170 errormsg += ToString(totalhaps) + " haplotypes).\nThe";
171 errormsg += " genetic data provides only ";
172 errormsg += ToString(m_hapnames.size()) + " haplotypes.";
173 throw FileFormatError(errormsg);
174 }
175
176 // now setup the individuals
177 for(int ind = 0; ind < m_nindividuals; ++ind)
178 {
179 string randomname = m_random.Name();
180 IndividualDS individual(randomname);
181 vector<string> hnames(PopHapNames(nhaps));
182 individual.AddHap(hnames);
183 individual.SetUnknownPhase(psites);
184 individuals.push_back(individual);
185 }
186 }
187 else
188 {
189 string errormsg = "The file " + filename + " is ";
190 errormsg += "incorrectly formated.\nThe unknown ";
191 errormsg += "keyword, " + somestring + ", was ";
192 errormsg += "encountered.";
193 throw FileFormatError(errormsg);
194 }
195 }
196
197 return individuals;
198
199 } // HapConverter::ParseFirstLine
200
201 //------------------------------------------------------------------------------------
202
ReadHapInfo(const string & filename)203 IndDSVec HapConverter::ReadHapInfo(const string& filename)
204 {
205 ifstream haplotypefile(filename.c_str(),ios::in);
206
207 if (!haplotypefile)
208 {
209 string errormsg = "Couldn't find the file: " + filename;
210 throw FileFormatError (errormsg);
211 }
212
213 string line;
214
215 if (!getLine(haplotypefile,line))
216 {
217 string errormsg = "The file " + filename + " appears to";
218 errormsg += " be empty.";
219 throw FileFormatError (errormsg);
220 }
221
222 istringstream linestream(line);
223
224 IndDSVec individuals(ParseFirstLine(linestream,filename,haplotypefile));
225
226 if (!individuals.empty()) return individuals;
227
228 // read in the haplotype name delimiter character;
229 line.erase();
230 getLine(haplotypefile,line);
231 const char hapname_dlm(line[0]);
232
233 // EWFIX.P3 -- need to insist that hapname_dlm is not a digit
234
235 // Now start parsing the individuals
236 for(int ind = 0; ind < m_nindividuals; ++ind)
237 {
238 line.erase();
239 if (getLine(haplotypefile,line))
240 {
241 istringstream linestr(line);
242 string somestring;
243 getName(linestr,somestring);
244 IndividualDS individual(somestring);
245
246 somestring.erase();
247 getNumber(linestr,somestring);
248 long int nhapnames = FlagCheck(somestring,string("number of haplotypes"));
249
250 for(int hname = 0; hname < nhapnames; ++hname)
251 {
252 somestring = PopTilDelimiter(linestr,hapname_dlm);
253 StripLeadingSpaces(somestring);
254 StripTrailingSpaces(somestring);
255 individual.AddHap(somestring);
256 }
257
258 vector<long> sites(ParsePhaseInfo(haplotypefile));
259 individual.SetUnknownPhase(sites);
260
261 individuals.push_back(individual);
262 }
263 else
264 {
265 string errormsg = "The haplotypefile is incorrectedly formated.\n";
266 errormsg += "The converter expected to find ";
267 errormsg += ToString(m_nindividuals) + " individuals ";
268 errormsg += "but only found " + indexToKey(ind);
269 throw FileFormatError(errormsg);
270 }
271 }
272
273 return individuals;
274
275 } // HapConverter::ReadHapInfo
276
277 //------------------------------------------------------------------------------------
278
ReplaceIndividualsWith(IndDSVec & individuals)279 void HapConverter::ReplaceIndividualsWith(IndDSVec& individuals)
280 {
281
282 vector<string> popnames(individuals.size());
283 unsigned long whichind;
284 for(whichind = 0; whichind < individuals.size(); ++whichind)
285 {
286 // can't do population assignment in FailToAdd due to identical
287 // sequence names possibly present
288 if (m_region.FailToAdd(individuals[whichind],popnames[whichind]))
289 {
290 string errormsg = "The haplotypes, ";
291 errormsg += individuals[whichind].GetHapNamesForPrint();
292 errormsg += " were not part of the data set for the";
293 errormsg += " region, " + m_region.getName();
294 throw FileFormatError(errormsg);
295 }
296 }
297
298 for(whichind = 0; whichind < individuals.size(); ++whichind)
299 {
300 m_region.AddIndividual(individuals[whichind],popnames[whichind]);
301 }
302
303 } // HapConverter::ReplaceIndividualsWith
304
305 //------------------------------------------------------------------------------------
306
addConvertedLamarcDS(LamarcDS &)307 void HapConverter::addConvertedLamarcDS(LamarcDS&)
308 {
309 // DEBUG debug -- currently a no-op function!
310 // lamarc.mergeTo(m_lamarc);
311 assert(false); // this code never ought to be called
312 } // HapConverter::addConvertedLamarcDS
313
314 //____________________________________________________________________________________
315