1 // $Id: gc_parse.cpp,v 1.23 2011/03/08 19:22:00 bobgian Exp $
2 
3 /*
4   Copyright 2002  Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 
13 #include "gc_data.h"
14 #include "gc_default.h"
15 #include "gc_file.h"
16 #include "gc_file_util.h"
17 #include "gc_infile_err.h"
18 #include "gc_parse.h"
19 #include "gc_parse_block.h"
20 #include "gc_parse_locus.h"
21 #include "gc_parse_pop.h"
22 #include "gc_parse_sample.h"
23 #include "gc_strings.h"
24 #include "gc_types.h"
25 #include "wx/log.h"
26 
27 //------------------------------------------------------------------------------------
28 
GCParse(GCFile & file,GCFileFormat format,gcGeneralDataType dtype,GCInterleaving interleaving,wxString delim)29 GCParse::GCParse(   GCFile &            file,
30                     GCFileFormat        format,
31                     gcGeneralDataType   dtype,
32                     GCInterleaving      interleaving,
33                     wxString            delim)
34     :
35     m_filePointer(&file),
36     m_format(format),
37     m_dataType(dtype),
38     m_interleaving(interleaving),
39     m_delimiter(delim),
40     m_multiLineSeenInFile(false),
41     m_hasSpacesInNames(false)
42 {
43 }
44 
~GCParse()45 GCParse::~GCParse()
46 {
47     for(GCParseLoci::iterator i=m_loci.begin(); i != m_loci.end(); i++)
48     {
49         delete *i;
50     }
51     for(GCParsePops::iterator i=m_pops.begin(); i != m_pops.end(); i++)
52     {
53         delete *i;
54     }
55     for(GCParseBlocks::iterator i=m_blocks.begin(); i != m_blocks.end(); i++)
56     {
57         delete *i;
58     }
59 }
60 
61 wxString
GetName() const62 GCParse::GetName() const
63 {
64     assert(m_filePointer != NULL);
65     return wxString::Format(gcstr::parseSettingsForFile,
66                             m_filePointer->GetShortName().c_str(),
67                             GetSettings().c_str());
68 }
69 
70 wxString
GetSettings() const71 GCParse::GetSettings() const
72 {
73     wxString desc =
74         wxString::Format(gcstr::parseSettings,
75                          GetFormatString().c_str(),
76                          GetDataTypeString().c_str(),
77                          GetInterleavingString().c_str());
78 
79     return desc;
80 }
81 
82 gcGeneralDataType
GetDataType() const83 GCParse::GetDataType() const
84 {
85     return m_dataType;
86 }
87 
88 GCFileFormat
GetFormat() const89 GCParse::GetFormat() const
90 {
91     return m_format;
92 }
93 
94 bool
GetHasSpacesInNames() const95 GCParse::GetHasSpacesInNames() const
96 {
97     return m_hasSpacesInNames;
98 }
99 
100 GCInterleaving
GetInterleaving() const101 GCParse::GetInterleaving() const
102 {
103     return m_interleaving;
104 }
105 
106 wxString
GetDelimiter() const107 GCParse::GetDelimiter() const
108 {
109     return m_delimiter;
110 }
111 
112 bool
GetMultiLineSeenInFile() const113 GCParse::GetMultiLineSeenInFile() const
114 {
115     return m_multiLineSeenInFile;
116 }
117 
118 void
SetDataTypeFromFile(gcSpecificDataType dtype)119 GCParse::SetDataTypeFromFile(gcSpecificDataType dtype)
120 {
121     if(dtype != sdatatype_NONE_SET)
122     {
123         gcGeneralDataType::iterator i = m_dataType.find(dtype);
124         if(i == m_dataType.end())
125         {
126             throw gc_parse_data_type_spec_mismatch(dtype,GetDataType());
127         }
128         m_dataType.clear();
129         m_dataType.insert(dtype);
130     }
131 }
132 
133 const GCParseLocus &
GetParseLocus(size_t locusIndex) const134 GCParse::GetParseLocus(size_t locusIndex) const
135 {
136     assert(locusIndex < m_loci.size());
137     return *(m_loci[locusIndex]);
138 }
139 
140 GCParseLocus &
GetParseLocus(size_t locusIndex)141 GCParse::GetParseLocus(size_t locusIndex)
142 {
143     assert(locusIndex < m_loci.size());
144     return *(m_loci[locusIndex]);
145 }
146 
147 const GCParsePop &
GetParsePop(size_t popIndex) const148 GCParse::GetParsePop(size_t popIndex) const
149 {
150     assert(popIndex < m_pops.size());
151     return *(m_pops[popIndex]);
152 }
153 
154 constBlockVector
GetBlocks() const155 GCParse::GetBlocks() const
156 {
157     constBlockVector retVal;
158     for(GCParseBlocks::const_iterator i = m_blocks.begin();
159         i != m_blocks.end(); i ++)
160     {
161         const GCParseBlock * blockP = *i;
162         retVal.push_back(blockP);
163     }
164     return retVal;
165 }
166 
167 const GCParseBlock &
GetBlock(size_t popId,size_t locusId) const168 GCParse::GetBlock(size_t popId, size_t locusId) const
169 {
170     // rather wasteful, but correct
171     for(GCParseBlocks::const_iterator i = m_blocks.begin();
172         i != m_blocks.end(); i ++)
173     {
174         const GCParseBlock & block = **i;
175         size_t blockPopId = block.GetPopRef().GetIndexInParse();
176         size_t blockLocId = block.GetLocusRef().GetIndexInParse();
177         if((popId == blockPopId) && (locusId == blockLocId))
178         {
179             return block;
180         }
181     }
182     wxString msg = wxString::Format(gcerr::noBlockForPopLocus,(int)popId,(int)locusId);
183     gc_implementation_error e(msg.c_str());
184     throw e;
185 }
186 
187 const GCFile &
GetFileRef() const188 GCParse::GetFileRef() const
189 {
190     return *m_filePointer;
191 }
192 
193 size_t
GetPopCount() const194 GCParse::GetPopCount() const
195 {
196     return m_pops.size();
197 }
198 
199 size_t
GetLociCount() const200 GCParse::GetLociCount() const
201 {
202     return m_loci.size();
203 }
204 
205 void
DebugDump(wxString prefix) const206 GCParse::DebugDump(wxString prefix) const
207 {
208     wxLogDebug("%sGCParse:%s",prefix.c_str(),GetSettings().c_str());    // EWDUMPOK
209     wxLogDebug("%sPopulations:",(prefix+gcstr::indent).c_str());    // EWDUMPOK
210     for(size_t i = 0; i < m_pops.size() ; i++)
211     {
212         const GCParsePop popRef = GetParsePop(i);
213         wxLogDebug("%s%5d:\"%s\"",  // EWDUMPOK
214                    (prefix+gcstr::indent+gcstr::indent).c_str(),
215                    (int)(popRef.GetIndexInParse()),
216                    (popRef.GetName()).c_str());
217     }
218     wxLogDebug("%sLoci:",(prefix+gcstr::indent).c_str());   // EWDUMPOK
219     for(size_t i = 0; i < m_loci.size() ; i++)
220     {
221         const GCParseLocus locRef = GetParseLocus(i);
222         wxLogDebug("%s%5d:%5d markers of type %s",    // EWDUMPOK
223                    (prefix+gcstr::indent+gcstr::indent).c_str(),
224                    (int)(locRef.GetIndexInParse()),
225                    (int)(locRef.GetNumMarkers()),
226                    (ToWxString(locRef.GetDataType())).c_str());
227     }
228     wxLogDebug("%sBlocks:",(prefix+gcstr::indent).c_str()); // EWDUMPOK
229     for(size_t i = 0; i < m_blocks.size() ; i++)
230     {
231         const GCParseBlock & blockRef = *(m_blocks[i]);
232         blockRef.DebugDump(prefix+gcstr::indent+gcstr::indent);
233     }
234 }
235 
236 gcIdSet
IdsOfAllBlocks() const237 GCParse::IdsOfAllBlocks() const
238 {
239     gcIdSet blockIds;
240     for(size_t i=0; i < m_blocks.size(); i++)
241     {
242         const GCParseBlock & block = *(m_blocks[i]);
243         blockIds.insert(block.GetId());
244     }
245     return blockIds;
246 }
247 
248 wxString
GetFormatString() const249 GCParse::GetFormatString() const
250 {
251     return wxString::Format(gcstr::parseFormat,ToWxString(GetFormat()).c_str());
252 }
253 
254 wxString
GetDataTypeString() const255 GCParse::GetDataTypeString() const
256 {
257     return wxString::Format(gcstr::parseDataType,ToWxString(GetDataType()).c_str());
258 }
259 
260 wxString
GetInterleavingString() const261 GCParse::GetInterleavingString() const
262 {
263     GCInterleaving il = GetInterleaving();
264     if(il == interleaving_MOOT) il = interleaving_SEQUENTIAL; // EWFIX.P3 -- make blank ??
265     return wxString::Format(gcstr::parseInterleaving,ToWxString(il).c_str());
266 }
267 
268 gcPhaseInfo *
GetDefaultPhaseRecords() const269 GCParse::GetDefaultPhaseRecords() const
270 {
271     gcPhaseInfo * phaseRecords = new gcPhaseInfo();
272 
273     const wxString & fileName = GetFileRef().GetName();
274     for(GCParseBlocks::const_iterator i = m_blocks.begin(); i != m_blocks.end(); i++)
275     {
276         const GCParseBlock * pbP = *i;
277         const GCParseSamples & samples = pbP->GetSamples();
278         for(GCParseSamples::const_iterator j=samples.begin(); j != samples.end(); j++)
279         {
280             const GCParseSample * sampP = *j;
281             if(sampP->GetSequencesPerLabel() > 1)
282             {
283                 gcPhaseRecord rec
284                     = gcPhaseRecord::MakeAllelicPhaseRecord(fileName,
285                                                             sampP->GetLine(),
286                                                             sampP->GetLabel(),
287                                                             sampP->GetSequencesPerLabel());
288                 phaseRecords->AddRecord(rec);
289             }
290         }
291     }
292     return phaseRecords;
293 }
294 
295 gcPhaseInfo *
GetPhaseRecordsForAdjacency(size_t adj) const296 GCParse::GetPhaseRecordsForAdjacency(size_t adj) const
297 {
298     gcPhaseInfo * phaseRecords = new gcPhaseInfo();
299 
300     const wxString & fileName = GetFileRef().GetName();
301     for(GCParseBlocks::const_iterator i = m_blocks.begin(); i != m_blocks.end(); i++)
302     {
303         const GCParseBlock * pbP = *i;
304         const GCParseSamples & samples = pbP->GetSamples();
305         wxArrayString holdingArray;
306         const GCParseSample * sampP = NULL;
307         for(GCParseSamples::const_iterator j=samples.begin(); j != samples.end(); j++)
308         {
309             sampP = *j;
310             if(sampP->GetSequencesPerLabel() > 1)
311             {
312                 delete phaseRecords;
313                 throw gc_adjacent_phase_resolution_for_multisample_input(GetFileRef().GetName());
314             }
315             else
316             {
317                 holdingArray.Add(sampP->GetLabel());
318                 if(holdingArray.Count() == adj)
319                 {
320                     gcPhaseRecord rec
321                         = gcPhaseRecord::MakeAdjacentPhaseRecord(fileName,sampP->GetLine(),holdingArray);
322                     phaseRecords->AddRecord(rec);
323                     holdingArray.Empty();
324                 }
325             }
326         }
327         if(! holdingArray.IsEmpty())
328         {
329             assert(sampP != NULL);
330             size_t lineNum = sampP->GetLine();
331             wxString fname = GetFileRef().GetName();
332             size_t numSamples = samples.size();
333             delete phaseRecords;
334             throw gc_individual_sample_adj_mismatch(lineNum,fname,numSamples,adj);
335         }
336     }
337     return phaseRecords;
338 }
339 
340 void
SetCannotBeMsat()341 GCParse::SetCannotBeMsat()
342 {
343     m_dataType.Disallow(sdatatype_MICROSAT);
344 }
345 
346 void
SetHasSpacesInNames()347 GCParse::SetHasSpacesInNames()
348 {
349     m_hasSpacesInNames = true;
350 }
351 
GCParseVec()352 GCParseVec::GCParseVec()
353     :
354     std::vector<GCParse*>()
355 {
356 }
357 
~GCParseVec()358 GCParseVec::~GCParseVec()
359 {
360 }
361 
362 void
NukeContents()363 GCParseVec::NukeContents()
364 {
365     for(iterator i=begin(); i != end(); i++)
366     {
367         delete *i;
368     }
369 }
370 
371 bool
MungeParses(GCParseVec::iterator i1,GCParseVec::iterator i2)372 GCParseVec::MungeParses(GCParseVec::iterator i1, GCParseVec::iterator i2)
373 {
374     GCParse & p1 = **(i1);
375     GCParse & p2 = **(i2);
376     if(p1.GetFormat() != p2.GetFormat()) return false;
377     if(p1.GetDataType() != p2.GetDataType()) return false;
378 
379     if(p1.GetMultiLineSeenInFile()) return false;
380     if(p2.GetMultiLineSeenInFile()) return false;
381 
382     p1.m_interleaving = interleaving_MOOT;
383     return true;
384 }
385 
386 bool
MungeParses()387 GCParseVec::MungeParses()
388 {
389     bool mungedAnything = false;
390     std::vector<GCParse*>::iterator outerIter = begin();
391     while(outerIter != end())
392     {
393         std::vector<GCParse*>::iterator innerIter = outerIter;
394         innerIter++;
395         while(innerIter != end())
396         {
397             if(MungeParses(outerIter,innerIter))
398             {
399                 mungedAnything = true;
400                 delete *innerIter;
401                 erase(innerIter);
402             }
403             else
404             {
405                 innerIter++;
406             }
407         }
408         outerIter++;
409     }
410     return mungedAnything;
411 }
412 
413 //____________________________________________________________________________________
414