1 // $Id: gc_parse.cpp,v 1.23 2011/03/08 19:22:00 bobgian Exp $
2
3 /*
4 Copyright 2002 Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12
13 #include "gc_data.h"
14 #include "gc_default.h"
15 #include "gc_file.h"
16 #include "gc_file_util.h"
17 #include "gc_infile_err.h"
18 #include "gc_parse.h"
19 #include "gc_parse_block.h"
20 #include "gc_parse_locus.h"
21 #include "gc_parse_pop.h"
22 #include "gc_parse_sample.h"
23 #include "gc_strings.h"
24 #include "gc_types.h"
25 #include "wx/log.h"
26
27 //------------------------------------------------------------------------------------
28
GCParse(GCFile & file,GCFileFormat format,gcGeneralDataType dtype,GCInterleaving interleaving,wxString delim)29 GCParse::GCParse( GCFile & file,
30 GCFileFormat format,
31 gcGeneralDataType dtype,
32 GCInterleaving interleaving,
33 wxString delim)
34 :
35 m_filePointer(&file),
36 m_format(format),
37 m_dataType(dtype),
38 m_interleaving(interleaving),
39 m_delimiter(delim),
40 m_multiLineSeenInFile(false),
41 m_hasSpacesInNames(false)
42 {
43 }
44
~GCParse()45 GCParse::~GCParse()
46 {
47 for(GCParseLoci::iterator i=m_loci.begin(); i != m_loci.end(); i++)
48 {
49 delete *i;
50 }
51 for(GCParsePops::iterator i=m_pops.begin(); i != m_pops.end(); i++)
52 {
53 delete *i;
54 }
55 for(GCParseBlocks::iterator i=m_blocks.begin(); i != m_blocks.end(); i++)
56 {
57 delete *i;
58 }
59 }
60
61 wxString
GetName() const62 GCParse::GetName() const
63 {
64 assert(m_filePointer != NULL);
65 return wxString::Format(gcstr::parseSettingsForFile,
66 m_filePointer->GetShortName().c_str(),
67 GetSettings().c_str());
68 }
69
70 wxString
GetSettings() const71 GCParse::GetSettings() const
72 {
73 wxString desc =
74 wxString::Format(gcstr::parseSettings,
75 GetFormatString().c_str(),
76 GetDataTypeString().c_str(),
77 GetInterleavingString().c_str());
78
79 return desc;
80 }
81
82 gcGeneralDataType
GetDataType() const83 GCParse::GetDataType() const
84 {
85 return m_dataType;
86 }
87
88 GCFileFormat
GetFormat() const89 GCParse::GetFormat() const
90 {
91 return m_format;
92 }
93
94 bool
GetHasSpacesInNames() const95 GCParse::GetHasSpacesInNames() const
96 {
97 return m_hasSpacesInNames;
98 }
99
100 GCInterleaving
GetInterleaving() const101 GCParse::GetInterleaving() const
102 {
103 return m_interleaving;
104 }
105
106 wxString
GetDelimiter() const107 GCParse::GetDelimiter() const
108 {
109 return m_delimiter;
110 }
111
112 bool
GetMultiLineSeenInFile() const113 GCParse::GetMultiLineSeenInFile() const
114 {
115 return m_multiLineSeenInFile;
116 }
117
118 void
SetDataTypeFromFile(gcSpecificDataType dtype)119 GCParse::SetDataTypeFromFile(gcSpecificDataType dtype)
120 {
121 if(dtype != sdatatype_NONE_SET)
122 {
123 gcGeneralDataType::iterator i = m_dataType.find(dtype);
124 if(i == m_dataType.end())
125 {
126 throw gc_parse_data_type_spec_mismatch(dtype,GetDataType());
127 }
128 m_dataType.clear();
129 m_dataType.insert(dtype);
130 }
131 }
132
133 const GCParseLocus &
GetParseLocus(size_t locusIndex) const134 GCParse::GetParseLocus(size_t locusIndex) const
135 {
136 assert(locusIndex < m_loci.size());
137 return *(m_loci[locusIndex]);
138 }
139
140 GCParseLocus &
GetParseLocus(size_t locusIndex)141 GCParse::GetParseLocus(size_t locusIndex)
142 {
143 assert(locusIndex < m_loci.size());
144 return *(m_loci[locusIndex]);
145 }
146
147 const GCParsePop &
GetParsePop(size_t popIndex) const148 GCParse::GetParsePop(size_t popIndex) const
149 {
150 assert(popIndex < m_pops.size());
151 return *(m_pops[popIndex]);
152 }
153
154 constBlockVector
GetBlocks() const155 GCParse::GetBlocks() const
156 {
157 constBlockVector retVal;
158 for(GCParseBlocks::const_iterator i = m_blocks.begin();
159 i != m_blocks.end(); i ++)
160 {
161 const GCParseBlock * blockP = *i;
162 retVal.push_back(blockP);
163 }
164 return retVal;
165 }
166
167 const GCParseBlock &
GetBlock(size_t popId,size_t locusId) const168 GCParse::GetBlock(size_t popId, size_t locusId) const
169 {
170 // rather wasteful, but correct
171 for(GCParseBlocks::const_iterator i = m_blocks.begin();
172 i != m_blocks.end(); i ++)
173 {
174 const GCParseBlock & block = **i;
175 size_t blockPopId = block.GetPopRef().GetIndexInParse();
176 size_t blockLocId = block.GetLocusRef().GetIndexInParse();
177 if((popId == blockPopId) && (locusId == blockLocId))
178 {
179 return block;
180 }
181 }
182 wxString msg = wxString::Format(gcerr::noBlockForPopLocus,(int)popId,(int)locusId);
183 gc_implementation_error e(msg.c_str());
184 throw e;
185 }
186
187 const GCFile &
GetFileRef() const188 GCParse::GetFileRef() const
189 {
190 return *m_filePointer;
191 }
192
193 size_t
GetPopCount() const194 GCParse::GetPopCount() const
195 {
196 return m_pops.size();
197 }
198
199 size_t
GetLociCount() const200 GCParse::GetLociCount() const
201 {
202 return m_loci.size();
203 }
204
205 void
DebugDump(wxString prefix) const206 GCParse::DebugDump(wxString prefix) const
207 {
208 wxLogDebug("%sGCParse:%s",prefix.c_str(),GetSettings().c_str()); // EWDUMPOK
209 wxLogDebug("%sPopulations:",(prefix+gcstr::indent).c_str()); // EWDUMPOK
210 for(size_t i = 0; i < m_pops.size() ; i++)
211 {
212 const GCParsePop popRef = GetParsePop(i);
213 wxLogDebug("%s%5d:\"%s\"", // EWDUMPOK
214 (prefix+gcstr::indent+gcstr::indent).c_str(),
215 (int)(popRef.GetIndexInParse()),
216 (popRef.GetName()).c_str());
217 }
218 wxLogDebug("%sLoci:",(prefix+gcstr::indent).c_str()); // EWDUMPOK
219 for(size_t i = 0; i < m_loci.size() ; i++)
220 {
221 const GCParseLocus locRef = GetParseLocus(i);
222 wxLogDebug("%s%5d:%5d markers of type %s", // EWDUMPOK
223 (prefix+gcstr::indent+gcstr::indent).c_str(),
224 (int)(locRef.GetIndexInParse()),
225 (int)(locRef.GetNumMarkers()),
226 (ToWxString(locRef.GetDataType())).c_str());
227 }
228 wxLogDebug("%sBlocks:",(prefix+gcstr::indent).c_str()); // EWDUMPOK
229 for(size_t i = 0; i < m_blocks.size() ; i++)
230 {
231 const GCParseBlock & blockRef = *(m_blocks[i]);
232 blockRef.DebugDump(prefix+gcstr::indent+gcstr::indent);
233 }
234 }
235
236 gcIdSet
IdsOfAllBlocks() const237 GCParse::IdsOfAllBlocks() const
238 {
239 gcIdSet blockIds;
240 for(size_t i=0; i < m_blocks.size(); i++)
241 {
242 const GCParseBlock & block = *(m_blocks[i]);
243 blockIds.insert(block.GetId());
244 }
245 return blockIds;
246 }
247
248 wxString
GetFormatString() const249 GCParse::GetFormatString() const
250 {
251 return wxString::Format(gcstr::parseFormat,ToWxString(GetFormat()).c_str());
252 }
253
254 wxString
GetDataTypeString() const255 GCParse::GetDataTypeString() const
256 {
257 return wxString::Format(gcstr::parseDataType,ToWxString(GetDataType()).c_str());
258 }
259
260 wxString
GetInterleavingString() const261 GCParse::GetInterleavingString() const
262 {
263 GCInterleaving il = GetInterleaving();
264 if(il == interleaving_MOOT) il = interleaving_SEQUENTIAL; // EWFIX.P3 -- make blank ??
265 return wxString::Format(gcstr::parseInterleaving,ToWxString(il).c_str());
266 }
267
268 gcPhaseInfo *
GetDefaultPhaseRecords() const269 GCParse::GetDefaultPhaseRecords() const
270 {
271 gcPhaseInfo * phaseRecords = new gcPhaseInfo();
272
273 const wxString & fileName = GetFileRef().GetName();
274 for(GCParseBlocks::const_iterator i = m_blocks.begin(); i != m_blocks.end(); i++)
275 {
276 const GCParseBlock * pbP = *i;
277 const GCParseSamples & samples = pbP->GetSamples();
278 for(GCParseSamples::const_iterator j=samples.begin(); j != samples.end(); j++)
279 {
280 const GCParseSample * sampP = *j;
281 if(sampP->GetSequencesPerLabel() > 1)
282 {
283 gcPhaseRecord rec
284 = gcPhaseRecord::MakeAllelicPhaseRecord(fileName,
285 sampP->GetLine(),
286 sampP->GetLabel(),
287 sampP->GetSequencesPerLabel());
288 phaseRecords->AddRecord(rec);
289 }
290 }
291 }
292 return phaseRecords;
293 }
294
295 gcPhaseInfo *
GetPhaseRecordsForAdjacency(size_t adj) const296 GCParse::GetPhaseRecordsForAdjacency(size_t adj) const
297 {
298 gcPhaseInfo * phaseRecords = new gcPhaseInfo();
299
300 const wxString & fileName = GetFileRef().GetName();
301 for(GCParseBlocks::const_iterator i = m_blocks.begin(); i != m_blocks.end(); i++)
302 {
303 const GCParseBlock * pbP = *i;
304 const GCParseSamples & samples = pbP->GetSamples();
305 wxArrayString holdingArray;
306 const GCParseSample * sampP = NULL;
307 for(GCParseSamples::const_iterator j=samples.begin(); j != samples.end(); j++)
308 {
309 sampP = *j;
310 if(sampP->GetSequencesPerLabel() > 1)
311 {
312 delete phaseRecords;
313 throw gc_adjacent_phase_resolution_for_multisample_input(GetFileRef().GetName());
314 }
315 else
316 {
317 holdingArray.Add(sampP->GetLabel());
318 if(holdingArray.Count() == adj)
319 {
320 gcPhaseRecord rec
321 = gcPhaseRecord::MakeAdjacentPhaseRecord(fileName,sampP->GetLine(),holdingArray);
322 phaseRecords->AddRecord(rec);
323 holdingArray.Empty();
324 }
325 }
326 }
327 if(! holdingArray.IsEmpty())
328 {
329 assert(sampP != NULL);
330 size_t lineNum = sampP->GetLine();
331 wxString fname = GetFileRef().GetName();
332 size_t numSamples = samples.size();
333 delete phaseRecords;
334 throw gc_individual_sample_adj_mismatch(lineNum,fname,numSamples,adj);
335 }
336 }
337 return phaseRecords;
338 }
339
340 void
SetCannotBeMsat()341 GCParse::SetCannotBeMsat()
342 {
343 m_dataType.Disallow(sdatatype_MICROSAT);
344 }
345
346 void
SetHasSpacesInNames()347 GCParse::SetHasSpacesInNames()
348 {
349 m_hasSpacesInNames = true;
350 }
351
GCParseVec()352 GCParseVec::GCParseVec()
353 :
354 std::vector<GCParse*>()
355 {
356 }
357
~GCParseVec()358 GCParseVec::~GCParseVec()
359 {
360 }
361
362 void
NukeContents()363 GCParseVec::NukeContents()
364 {
365 for(iterator i=begin(); i != end(); i++)
366 {
367 delete *i;
368 }
369 }
370
371 bool
MungeParses(GCParseVec::iterator i1,GCParseVec::iterator i2)372 GCParseVec::MungeParses(GCParseVec::iterator i1, GCParseVec::iterator i2)
373 {
374 GCParse & p1 = **(i1);
375 GCParse & p2 = **(i2);
376 if(p1.GetFormat() != p2.GetFormat()) return false;
377 if(p1.GetDataType() != p2.GetDataType()) return false;
378
379 if(p1.GetMultiLineSeenInFile()) return false;
380 if(p2.GetMultiLineSeenInFile()) return false;
381
382 p1.m_interleaving = interleaving_MOOT;
383 return true;
384 }
385
386 bool
MungeParses()387 GCParseVec::MungeParses()
388 {
389 bool mungedAnything = false;
390 std::vector<GCParse*>::iterator outerIter = begin();
391 while(outerIter != end())
392 {
393 std::vector<GCParse*>::iterator innerIter = outerIter;
394 innerIter++;
395 while(innerIter != end())
396 {
397 if(MungeParses(outerIter,innerIter))
398 {
399 mungedAnything = true;
400 delete *innerIter;
401 erase(innerIter);
402 }
403 else
404 {
405 innerIter++;
406 }
407 }
408 outerIter++;
409 }
410 return mungedAnything;
411 }
412
413 //____________________________________________________________________________________
414