1 // $Id: gc_migrate.cpp,v 1.33 2011/03/08 19:22:00 bobgian Exp $
2
3 /*
4 Copyright 2002 Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12
13 #include "gc_data.h"
14 #include "gc_datastore.h"
15 #include "gc_errhandling.h"
16 #include "gc_file.h"
17 #include "gc_file_util.h"
18 #include "gc_infile_err.h"
19 #include "gc_migrate.h"
20 #include "gc_parse_block.h"
21 #include "gc_strings_infile.h"
22
23 #include "wx/log.h"
24 #include "wx/tokenzr.h"
25 #include "wx/txtstrm.h"
26 #include "wx/wfstream.h"
27
28 //------------------------------------------------------------------------------------
29
GCMigrateParser(const GCDataStore & ds)30 GCMigrateParser::GCMigrateParser(const GCDataStore& ds)
31 : GCParser(ds)
32 {
33 }
34
35 GCParse *
Parse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)36 GCMigrateParser::Parse( GCFile & fileRef,
37 gcGeneralDataType dataType,
38 GCInterleaving interleaving)
39 {
40 SetUpStreams(fileRef.GetName());
41 assert( !( dataType.HasAllelic() && dataType.HasNucleic()));
42 if(dataType.HasAllelic()) return AlleleParse(fileRef,dataType,interleaving);
43 if(dataType.HasNucleic()) return NucParse(fileRef,dataType,interleaving);
44 assert(false);
45 return NULL;
46 }
47
48 GCParse *
NucParse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)49 GCMigrateParser::NucParse(GCFile & fileRef, gcGeneralDataType dataType, GCInterleaving interleaving)
50 {
51
52 GCParse & parseData = MakeParse(fileRef,format_MIGRATE,dataType,interleaving);
53
54 gcSpecificDataType dataTypeSpecInFile;
55 size_t numPops;
56 size_t numLoci;
57 wxString comment;
58
59 try
60 {
61 ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numLoci,comment);
62 SetDataTypeFromFile(parseData,dataTypeSpecInFile);
63
64 std::vector<size_t> locusLengths = ParseMigrateLocusLengths();
65 for(size_t i=0;
66 i < locusLengths.size();
67 i++)
68 {
69 AddLocus(parseData,i,locusLengths[i]);
70 }
71
72 for(size_t popIndex = 0;
73 popIndex < numPops;
74 popIndex++)
75 {
76 wxString popComment;
77 std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,locusLengths.size());
78 assert(numSamples.size() == locusLengths.size());
79 AddPop(parseData,popIndex,popComment);
80
81 for(size_t locIndex = 0;
82 locIndex < numLoci;
83 locIndex++)
84 {
85 FillData(parseData,popIndex,locIndex,interleaving,numSamples[locIndex]);
86 }
87 }
88 CheckNoExtraData();
89 return &parseData;
90 }
91 catch(gc_eof& e)
92 {
93 if(CompleteParse(parseData))
94 {
95 return &parseData;
96 }
97 else
98 {
99 delete &parseData;
100 e.setFile(fileRef.GetName());
101 throw;
102 }
103 }
104 catch(gc_infile_err& f)
105 {
106 delete &parseData;
107 f.setFile(fileRef.GetName());
108 f.setRow(m_linesRead);
109 throw;
110 }
111 assert(false);
112 return NULL;
113
114 }
115
116 GCParse *
AlleleParse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)117 GCMigrateParser::AlleleParse( GCFile & fileRef,
118 gcGeneralDataType dataType,
119 GCInterleaving interleaving)
120 {
121 gcSpecificDataType dataTypeSpecInFile;
122 size_t numPops;
123 size_t numSites;
124 wxString delimiter;
125 wxString comment;
126
127 ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numSites,delimiter,comment);
128 GCParse & parseData = MakeParse(fileRef,format_MIGRATE,dataType,interleaving,delimiter);
129 SetDataTypeFromFile(parseData,dataTypeSpecInFile);
130
131 try
132 {
133 AddLocus(parseData,0,numSites);
134 for(size_t popIndex = 0;
135 popIndex < numPops;
136 popIndex++)
137 {
138 wxString popComment;
139 std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,1); // EWFIX.P3 -- constant
140 assert(numSamples.size() == 1);
141 AddPop(parseData,popIndex,popComment);
142 FillData(parseData,popIndex,0,interleaving,numSamples[0]);
143 }
144 CheckNoExtraData();
145 return &parseData;
146 }
147 catch(gc_eof& e)
148 {
149 if(CompleteParse(parseData))
150 {
151 return &parseData;
152 }
153 else
154 {
155 delete &parseData;
156 e.setFile(fileRef.GetName());
157 throw;
158 }
159 }
160 catch(gc_infile_err& f)
161 {
162 delete &parseData;
163 f.setFile(fileRef.GetName());
164 f.setRow(m_linesRead);
165 throw;
166 }
167 assert(false);
168 return NULL;
169 }
170
~GCMigrateParser()171 GCMigrateParser::~GCMigrateParser()
172 {
173 }
174
175 void
ParseMigrateFirstLine(gcSpecificDataType & dataTypeSpecInFile,size_t & numPopsRef,size_t & numLociRef,wxString & comment)176 GCMigrateParser::ParseMigrateFirstLine(
177 gcSpecificDataType & dataTypeSpecInFile,
178 size_t & numPopsRef,
179 size_t & numLociRef,
180 wxString & comment)
181 {
182 wxString firstLine = ReadLine();
183 wxStringTokenizer tokenizer(firstLine);
184
185 dataTypeSpecInFile = sdatatype_NONE_SET;
186 wxString word = tokenizer.GetNextToken();
187 if(!word.IsNumber() && word.Len() == 1)
188 // we're looking for an optional single char token indicating
189 // the data type in this file. If it's a number, then the
190 // token is not here
191 {
192 if(word.IsSameAs("a",false)) dataTypeSpecInFile = sdatatype_KALLELE;
193 if(word.IsSameAs("e",false)) dataTypeSpecInFile = sdatatype_KALLELE;
194 if(word.IsSameAs("m",false)) dataTypeSpecInFile = sdatatype_MICROSAT;
195 if(word.IsSameAs("n",false)) dataTypeSpecInFile = sdatatype_SNP;
196 if(word.IsSameAs("s",false)) dataTypeSpecInFile = sdatatype_DNA;
197 if(dataTypeSpecInFile == sdatatype_NONE_SET)
198 {
199 wxString msg = wxString::Format(gcerr_migrate::firstToken,word.c_str());
200 m_dataStore.GCWarning(msg);
201 }
202 word = tokenizer.GetNextToken();
203 }
204
205 // OK. Now word should be a number indicating the number of populations
206 long longVal;
207 if(!word.ToLong(&longVal))
208 {
209 throw gc_migrate_bad_pop_count(word);
210 }
211 if(longVal <= 0)
212 {
213 throw gc_migrate_bad_pop_count(word);
214 }
215 numPopsRef = (size_t)longVal;
216
217 // The next word should be a number indicating the number of loci
218 word = tokenizer.GetNextToken();
219 if(!word.ToLong(&longVal) || longVal <= 0)
220 {
221 throw gc_migrate_bad_locus_count(word);
222 }
223 numLociRef = (size_t)longVal;
224
225 comment = tokenizer.GetString();
226 }
227
228 void
ParseMigrateFirstLine(gcSpecificDataType & dataTypeSpecInFile,size_t & numPopsRef,size_t & numLociRef,wxString & delimiter,wxString & comment)229 GCMigrateParser::ParseMigrateFirstLine( gcSpecificDataType& dataTypeSpecInFile,
230 size_t & numPopsRef,
231 size_t & numLociRef,
232 wxString & delimiter,
233 wxString & comment)
234 {
235 // this gets us the default values, which is that there is no
236 // delimiter specified.
237 delimiter.Empty();
238 ParseMigrateFirstLine(dataTypeSpecInFile,numPopsRef,numLociRef,comment);
239
240 wxStringTokenizer tokenizer(comment);
241 if(tokenizer.HasMoreTokens())
242 {
243 wxString mayBeDelimiter = tokenizer.GetNextToken();
244 if(IsLegalDelimiter(mayBeDelimiter))
245 {
246 delimiter = mayBeDelimiter;
247 comment = tokenizer.GetString();
248 }
249 }
250 }
251
252 bool
IsLegalDelimiter(wxString delimCandidate)253 GCMigrateParser::IsLegalDelimiter(wxString delimCandidate)
254 {
255 if(delimCandidate.Length() != 1) return false;
256 if(delimCandidate[0] == gcstr_migrate::missingData)
257 {
258 throw gc_migrate_bad_delimiter(delimCandidate);
259 return false;
260 }
261 return true;
262 }
263
264 std::vector<size_t>
ParseMigrateLocusLengths()265 GCMigrateParser::ParseMigrateLocusLengths()
266 {
267 wxString lociLengthLine = ReadLine();
268 wxStringTokenizer tokenizer(lociLengthLine);
269 std::vector<size_t> locusLengths;
270
271 size_t index = 0;
272 while(tokenizer.CountTokens() != 0)
273 {
274 wxString token = tokenizer.GetNextToken();
275 long longVal;
276 if(!token.ToLong(&longVal))
277 {
278 throw gc_migrate_locus_length_not_positive(token);
279 }
280 if(longVal <= 0)
281 {
282 throw gc_migrate_locus_length_not_positive(token);
283 }
284 size_t locusLength = (size_t)longVal;
285 locusLengths.push_back(locusLength);
286 index++;
287 }
288 return locusLengths;
289 }
290
291 std::vector<size_t>
ParseMigratePopulationInfo(wxString & populationName,size_t locusCount)292 GCMigrateParser::ParseMigratePopulationInfo(wxString & populationName, size_t locusCount)
293 {
294 std::vector<size_t> numSamplesForEachLocus;
295
296 wxString line = ReadLine();
297 wxStringTokenizer tokenizer(line);
298 wxString lastToken = wxEmptyString;
299 bool shouldUseLastToken = false;
300
301 try
302 {
303 for(size_t i = 0;
304 i < locusCount ;
305 i++)
306 {
307 lastToken = tokenizer.GetNextToken();
308 long longVal;
309 if(!lastToken.ToLong(&longVal))
310 {
311 throw gc_migrate_missing_sequence_count(lastToken);
312 }
313 if(longVal <= 0)
314 {
315 throw gc_migrate_bad_sequence_count(lastToken);
316 }
317 size_t sequenceCount = (size_t)longVal;
318 numSamplesForEachLocus.push_back(sequenceCount);
319 }
320 }
321 catch (const gc_migrate_missing_sequence_count & e)
322 {
323 if(numSamplesForEachLocus.size() == 1)
324 {
325 for(size_t i=1;
326 i < locusCount;
327 i++)
328 {
329 numSamplesForEachLocus.push_back(numSamplesForEachLocus[0]);
330 }
331 shouldUseLastToken = true;
332 }
333 else
334 {
335 throw gc_migrate_too_few_sequence_lengths(locusCount,line);
336 }
337
338 }
339 assert(numSamplesForEachLocus.size() == locusCount);
340
341 populationName = tokenizer.GetString();
342 if(shouldUseLastToken)
343 {
344 populationName = wxString::Format("%s %s",
345 lastToken.c_str(),
346 populationName.c_str());
347 }
348
349 populationName.Trim(true);
350 populationName.Trim(false);
351 return numSamplesForEachLocus;
352 }
353
354 bool
CompleteParse(GCParse & parseData)355 GCMigrateParser::CompleteParse(GCParse & parseData)
356 {
357 // check we have pops
358 size_t pcount = parseData.GetPopCount();
359 if(pcount < 1) return false;
360
361 // check we have a locus
362 size_t lcount = parseData.GetLociCount();
363 if(lcount < 1) return false;
364
365 // check we have a block for each
366 constBlockVector blocks = parseData.GetBlocks();
367 if(blocks.size() != pcount * lcount) return false;
368
369 // check block has correct number of sequences
370 constBlockVector::const_iterator i;
371 for(i=blocks.begin(); i != blocks.end(); i++)
372 {
373 const GCParseBlock * blockP = *i;
374 if(blockP == NULL) return false;
375 size_t expectedNumSequences = blockP->GetExpectedNumSequences();
376 const GCParseSamples & samples = blockP->GetSamples();
377 if(samples.size() != expectedNumSequences) return false;
378
379 // check block has correct number of sites
380 if(blockP->HasIncompleteSequences()) return false;
381 }
382
383 return true;
384 }
385
386 //____________________________________________________________________________________
387