1 // $Id: gc_migrate.cpp,v 1.33 2011/03/08 19:22:00 bobgian Exp $
2 
3 /*
4   Copyright 2002  Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 
13 #include "gc_data.h"
14 #include "gc_datastore.h"
15 #include "gc_errhandling.h"
16 #include "gc_file.h"
17 #include "gc_file_util.h"
18 #include "gc_infile_err.h"
19 #include "gc_migrate.h"
20 #include "gc_parse_block.h"
21 #include "gc_strings_infile.h"
22 
23 #include "wx/log.h"
24 #include "wx/tokenzr.h"
25 #include "wx/txtstrm.h"
26 #include "wx/wfstream.h"
27 
28 //------------------------------------------------------------------------------------
29 
GCMigrateParser(const GCDataStore & ds)30 GCMigrateParser::GCMigrateParser(const GCDataStore& ds)
31     :   GCParser(ds)
32 {
33 }
34 
35 GCParse *
Parse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)36 GCMigrateParser::Parse( GCFile &            fileRef,
37                         gcGeneralDataType   dataType,
38                         GCInterleaving      interleaving)
39 {
40     SetUpStreams(fileRef.GetName());
41     assert( !( dataType.HasAllelic() && dataType.HasNucleic()));
42     if(dataType.HasAllelic()) return AlleleParse(fileRef,dataType,interleaving);
43     if(dataType.HasNucleic()) return NucParse(fileRef,dataType,interleaving);
44     assert(false);
45     return NULL;
46 }
47 
48 GCParse *
NucParse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)49 GCMigrateParser::NucParse(GCFile & fileRef, gcGeneralDataType dataType, GCInterleaving interleaving)
50 {
51 
52     GCParse & parseData = MakeParse(fileRef,format_MIGRATE,dataType,interleaving);
53 
54     gcSpecificDataType  dataTypeSpecInFile;
55     size_t              numPops;
56     size_t              numLoci;
57     wxString            comment;
58 
59     try
60     {
61         ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numLoci,comment);
62         SetDataTypeFromFile(parseData,dataTypeSpecInFile);
63 
64         std::vector<size_t> locusLengths = ParseMigrateLocusLengths();
65         for(size_t i=0;
66             i < locusLengths.size();
67             i++)
68         {
69             AddLocus(parseData,i,locusLengths[i]);
70         }
71 
72         for(size_t popIndex = 0;
73             popIndex < numPops;
74             popIndex++)
75         {
76             wxString popComment;
77             std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,locusLengths.size());
78             assert(numSamples.size() == locusLengths.size());
79             AddPop(parseData,popIndex,popComment);
80 
81             for(size_t locIndex = 0;
82                 locIndex < numLoci;
83                 locIndex++)
84             {
85                 FillData(parseData,popIndex,locIndex,interleaving,numSamples[locIndex]);
86             }
87         }
88         CheckNoExtraData();
89         return &parseData;
90     }
91     catch(gc_eof& e)
92     {
93         if(CompleteParse(parseData))
94         {
95             return &parseData;
96         }
97         else
98         {
99             delete &parseData;
100             e.setFile(fileRef.GetName());
101             throw;
102         }
103     }
104     catch(gc_infile_err& f)
105     {
106         delete &parseData;
107         f.setFile(fileRef.GetName());
108         f.setRow(m_linesRead);
109         throw;
110     }
111     assert(false);
112     return NULL;
113 
114 }
115 
116 GCParse *
AlleleParse(GCFile & fileRef,gcGeneralDataType dataType,GCInterleaving interleaving)117 GCMigrateParser::AlleleParse(   GCFile &            fileRef,
118                                 gcGeneralDataType   dataType,
119                                 GCInterleaving      interleaving)
120 {
121     gcSpecificDataType  dataTypeSpecInFile;
122     size_t              numPops;
123     size_t              numSites;
124     wxString            delimiter;
125     wxString            comment;
126 
127     ParseMigrateFirstLine(dataTypeSpecInFile,numPops,numSites,delimiter,comment);
128     GCParse & parseData = MakeParse(fileRef,format_MIGRATE,dataType,interleaving,delimiter);
129     SetDataTypeFromFile(parseData,dataTypeSpecInFile);
130 
131     try
132     {
133         AddLocus(parseData,0,numSites);
134         for(size_t popIndex = 0;
135             popIndex < numPops;
136             popIndex++)
137         {
138             wxString popComment;
139             std::vector<size_t> numSamples = ParseMigratePopulationInfo(popComment,1);    // EWFIX.P3 -- constant
140             assert(numSamples.size() == 1);
141             AddPop(parseData,popIndex,popComment);
142             FillData(parseData,popIndex,0,interleaving,numSamples[0]);
143         }
144         CheckNoExtraData();
145         return &parseData;
146     }
147     catch(gc_eof& e)
148     {
149         if(CompleteParse(parseData))
150         {
151             return &parseData;
152         }
153         else
154         {
155             delete &parseData;
156             e.setFile(fileRef.GetName());
157             throw;
158         }
159     }
160     catch(gc_infile_err& f)
161     {
162         delete &parseData;
163         f.setFile(fileRef.GetName());
164         f.setRow(m_linesRead);
165         throw;
166     }
167     assert(false);
168     return NULL;
169 }
170 
~GCMigrateParser()171 GCMigrateParser::~GCMigrateParser()
172 {
173 }
174 
175 void
ParseMigrateFirstLine(gcSpecificDataType & dataTypeSpecInFile,size_t & numPopsRef,size_t & numLociRef,wxString & comment)176 GCMigrateParser::ParseMigrateFirstLine(
177     gcSpecificDataType &        dataTypeSpecInFile,
178     size_t &            numPopsRef,
179     size_t &            numLociRef,
180     wxString &          comment)
181 {
182     wxString firstLine = ReadLine();
183     wxStringTokenizer tokenizer(firstLine);
184 
185     dataTypeSpecInFile = sdatatype_NONE_SET;
186     wxString word = tokenizer.GetNextToken();
187     if(!word.IsNumber() && word.Len() == 1)
188         // we're looking for an optional single char token indicating
189         // the data type in this file. If it's a number, then the
190         // token is not here
191     {
192         if(word.IsSameAs("a",false)) dataTypeSpecInFile = sdatatype_KALLELE;
193         if(word.IsSameAs("e",false)) dataTypeSpecInFile = sdatatype_KALLELE;
194         if(word.IsSameAs("m",false)) dataTypeSpecInFile = sdatatype_MICROSAT;
195         if(word.IsSameAs("n",false)) dataTypeSpecInFile = sdatatype_SNP;
196         if(word.IsSameAs("s",false)) dataTypeSpecInFile = sdatatype_DNA;
197         if(dataTypeSpecInFile == sdatatype_NONE_SET)
198         {
199             wxString msg = wxString::Format(gcerr_migrate::firstToken,word.c_str());
200             m_dataStore.GCWarning(msg);
201         }
202         word = tokenizer.GetNextToken();
203     }
204 
205     // OK. Now word should be a number indicating the number of populations
206     long longVal;
207     if(!word.ToLong(&longVal))
208     {
209         throw gc_migrate_bad_pop_count(word);
210     }
211     if(longVal <= 0)
212     {
213         throw gc_migrate_bad_pop_count(word);
214     }
215     numPopsRef = (size_t)longVal;
216 
217     // The next word should be a number indicating the number of loci
218     word = tokenizer.GetNextToken();
219     if(!word.ToLong(&longVal) || longVal <= 0)
220     {
221         throw gc_migrate_bad_locus_count(word);
222     }
223     numLociRef = (size_t)longVal;
224 
225     comment = tokenizer.GetString();
226 }
227 
228 void
ParseMigrateFirstLine(gcSpecificDataType & dataTypeSpecInFile,size_t & numPopsRef,size_t & numLociRef,wxString & delimiter,wxString & comment)229 GCMigrateParser::ParseMigrateFirstLine( gcSpecificDataType& dataTypeSpecInFile,
230                                         size_t &            numPopsRef,
231                                         size_t &            numLociRef,
232                                         wxString &          delimiter,
233                                         wxString &          comment)
234 {
235     // this gets us the default values, which is that there is no
236     // delimiter specified.
237     delimiter.Empty();
238     ParseMigrateFirstLine(dataTypeSpecInFile,numPopsRef,numLociRef,comment);
239 
240     wxStringTokenizer tokenizer(comment);
241     if(tokenizer.HasMoreTokens())
242     {
243         wxString mayBeDelimiter = tokenizer.GetNextToken();
244         if(IsLegalDelimiter(mayBeDelimiter))
245         {
246             delimiter = mayBeDelimiter;
247             comment = tokenizer.GetString();
248         }
249     }
250 }
251 
252 bool
IsLegalDelimiter(wxString delimCandidate)253 GCMigrateParser::IsLegalDelimiter(wxString delimCandidate)
254 {
255     if(delimCandidate.Length() != 1) return false;
256     if(delimCandidate[0] == gcstr_migrate::missingData)
257     {
258         throw gc_migrate_bad_delimiter(delimCandidate);
259         return false;
260     }
261     return true;
262 }
263 
264 std::vector<size_t>
ParseMigrateLocusLengths()265 GCMigrateParser::ParseMigrateLocusLengths()
266 {
267     wxString lociLengthLine = ReadLine();
268     wxStringTokenizer tokenizer(lociLengthLine);
269     std::vector<size_t> locusLengths;
270 
271     size_t index = 0;
272     while(tokenizer.CountTokens() != 0)
273     {
274         wxString token = tokenizer.GetNextToken();
275         long longVal;
276         if(!token.ToLong(&longVal))
277         {
278             throw gc_migrate_locus_length_not_positive(token);
279         }
280         if(longVal <= 0)
281         {
282             throw gc_migrate_locus_length_not_positive(token);
283         }
284         size_t locusLength = (size_t)longVal;
285         locusLengths.push_back(locusLength);
286         index++;
287     }
288     return locusLengths;
289 }
290 
291 std::vector<size_t>
ParseMigratePopulationInfo(wxString & populationName,size_t locusCount)292 GCMigrateParser::ParseMigratePopulationInfo(wxString & populationName, size_t locusCount)
293 {
294     std::vector<size_t> numSamplesForEachLocus;
295 
296     wxString line = ReadLine();
297     wxStringTokenizer tokenizer(line);
298     wxString lastToken = wxEmptyString;
299     bool shouldUseLastToken = false;
300 
301     try
302     {
303         for(size_t i = 0;
304             i < locusCount ;
305             i++)
306         {
307             lastToken = tokenizer.GetNextToken();
308             long longVal;
309             if(!lastToken.ToLong(&longVal))
310             {
311                 throw gc_migrate_missing_sequence_count(lastToken);
312             }
313             if(longVal <= 0)
314             {
315                 throw gc_migrate_bad_sequence_count(lastToken);
316             }
317             size_t sequenceCount = (size_t)longVal;
318             numSamplesForEachLocus.push_back(sequenceCount);
319         }
320     }
321     catch (const gc_migrate_missing_sequence_count & e)
322     {
323         if(numSamplesForEachLocus.size() == 1)
324         {
325             for(size_t i=1;
326                 i < locusCount;
327                 i++)
328             {
329                 numSamplesForEachLocus.push_back(numSamplesForEachLocus[0]);
330             }
331             shouldUseLastToken = true;
332         }
333         else
334         {
335             throw gc_migrate_too_few_sequence_lengths(locusCount,line);
336         }
337 
338     }
339     assert(numSamplesForEachLocus.size() == locusCount);
340 
341     populationName = tokenizer.GetString();
342     if(shouldUseLastToken)
343     {
344         populationName = wxString::Format("%s %s",
345                                           lastToken.c_str(),
346                                           populationName.c_str());
347     }
348 
349     populationName.Trim(true);
350     populationName.Trim(false);
351     return numSamplesForEachLocus;
352 }
353 
354 bool
CompleteParse(GCParse & parseData)355 GCMigrateParser::CompleteParse(GCParse & parseData)
356 {
357     // check we have pops
358     size_t pcount = parseData.GetPopCount();
359     if(pcount < 1) return false;
360 
361     // check we have a locus
362     size_t lcount = parseData.GetLociCount();
363     if(lcount < 1) return false;
364 
365     // check we have a block for each
366     constBlockVector blocks = parseData.GetBlocks();
367     if(blocks.size() != pcount * lcount) return false;
368 
369     // check block has correct number of sequences
370     constBlockVector::const_iterator i;
371     for(i=blocks.begin(); i != blocks.end(); i++)
372     {
373         const GCParseBlock * blockP = *i;
374         if(blockP == NULL) return false;
375         size_t expectedNumSequences = blockP->GetExpectedNumSequences();
376         const GCParseSamples & samples = blockP->GetSamples();
377         if(samples.size() != expectedNumSequences) return false;
378 
379         // check block has correct number of sites
380         if(blockP->HasIncompleteSequences()) return false;
381     }
382 
383     return true;
384 }
385 
386 //____________________________________________________________________________________
387