1 // $Id: gc_datastore_files.cpp,v 1.36 2011/06/22 18:22:22 jmcgill Exp $
2
3 /*
4 Copyright 2002 Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5
6 This software is distributed free of charge for non-commercial use
7 and is copyrighted. Of course, we do not guarantee that the software
8 works, and are not responsible for any damage you may cause or have.
9 */
10
11 #include <cassert>
12
13 #include "gc_creation_info.h"
14 #include "gc_data.h"
15 #include "gc_datastore.h"
16 #include "gc_file.h"
17 #include "gc_infile_err.h"
18 #include "gc_loci_match.h"
19 #include "gc_locus_err.h"
20 #include "gc_parse_block.h"
21 #include "gc_parse_locus.h"
22 #include "gc_parse_sample.h"
23 #include "gc_pop_match.h"
24 #include "gc_strings.h"
25 #include "gc_structures_err.h"
26
27 #include "wx/file.h"
28 #include "wx/filename.h"
29 #include "wx/log.h"
30
31 //------------------------------------------------------------------------------------
32
33 GCFile &
AddDataFile(wxString fullPathFileName,GCFileFormat format,gcGeneralDataType dataType,GCInterleaving interleaving)34 GCDataStore::AddDataFile(wxString fullPathFileName,
35 GCFileFormat format,
36 gcGeneralDataType dataType,
37 GCInterleaving interleaving)
38 // version to use when reading from batch command file
39 {
40 // test for readability
41 if(! ::wxFileExists(fullPathFileName))
42 {
43 throw missing_file_error(fullPathFileName.c_str());
44 }
45
46 GCFile * file = new GCFile(*this,fullPathFileName);
47 GCParseVec * parseVecP = new GCParseVec();
48
49 try
50 {
51 GCParse * parse = OneParse(*file,format,dataType,interleaving);
52 parseVecP->push_back(parse);
53 }
54 catch(const gc_ex& e)
55 {
56 parseVecP->NukeContents();
57 delete parseVecP;
58 delete file;
59 throw;
60 }
61
62 file->SetParses(parseVecP);
63 m_structures.AddFile(*file);
64
65 m_dataFiles.insert(file);
66 wxLogVerbose(gcverbose::addedFile,fullPathFileName.c_str());
67 return *file;
68 }
69
70 GCFile &
AddDataFile(wxString fullPathFileName)71 GCDataStore::AddDataFile(wxString fullPathFileName)
72 {
73 // test for readability
74 if(! ::wxFileExists(fullPathFileName))
75 {
76 throw missing_file_error (fullPathFileName.c_str());
77 }
78
79 GCFile * file = new GCFile(*this,fullPathFileName);
80
81 GCParseVec * parseVecP = AllParsesForFile(*file);
82 assert(parseVecP != NULL);
83 if (parseVecP->size() == 0)
84 {
85 delete parseVecP;
86 delete file;
87 throw unparsable_file_error(fullPathFileName.c_str());
88 }
89
90 file->SetParses(parseVecP);
91 m_structures.AddFile(*file);
92 m_dataFiles.insert(file);
93 wxLogVerbose(gcverbose::addedFile,fullPathFileName.c_str());
94
95 try
96 {
97 if(file->GetParseCount() == 1)
98 {
99 GCPopMatcher p(popmatch_DEFAULT);
100 GCLocusMatcher l(locmatch_DEFAULT);
101 SetParseChoice(file->GetParse(0),p,l);
102 }
103 }
104 catch (const duplicate_file_error& e)
105 {
106 GCWarning(e.what());
107 }
108 catch (const missing_file_error& e)
109 {
110 GCWarning(e.what());
111 }
112 catch (const unparsable_file_error& e)
113 {
114 GCWarning(e.what());
115 }
116
117 return *file;
118 }
119
120 const GCParseBlock *
GetParseBlock(size_t blockId) const121 GCDataStore::GetParseBlock(size_t blockId) const
122 {
123 for(dataFileSet::const_iterator iter=m_dataFiles.begin();
124 iter != m_dataFiles.end(); iter++)
125 {
126 const GCFile & fileRef = **iter;
127 for(size_t index=0; index < fileRef.GetParseCount(); index++)
128 {
129 const GCParse & parseRef = fileRef.GetParse(index);
130 constBlockVector blocks = parseRef.GetBlocks();
131 for(constBlockVector::const_iterator biter = blocks.begin();
132 biter != blocks.end(); biter++)
133 {
134 const GCParseBlock * blockP = *biter;
135 if(blockId == blockP->GetId())
136 {
137 return blockP;
138 }
139 }
140 }
141
142 }
143 assert(false);
144 return NULL;
145 }
146
147 constBlockVector
GetBlocks(size_t popId,size_t locusId) const148 GCDataStore::GetBlocks(size_t popId, size_t locusId) const
149 {
150 gcIdSet ids = m_structures.GetBlockIds(popId,locusId);
151 constBlockVector retVal;
152
153 for(dataFileSet::const_iterator iter=m_dataFiles.begin();
154 iter != m_dataFiles.end(); iter++)
155 {
156 const GCFile & fileRef = **iter;
157 for(size_t index=0; index < fileRef.GetParseCount(); index++)
158 {
159 const GCParse & parseRef = fileRef.GetParse(index);
160 constBlockVector blocks = parseRef.GetBlocks();
161 for(constBlockVector::const_iterator biter = blocks.begin();
162 biter != blocks.end(); biter++)
163 {
164 const GCParseBlock * blockP = *biter;
165 size_t blockId = blockP->GetId();
166 if(ids.find(blockId) != ids.end())
167 {
168 retVal.push_back(blockP);
169 }
170 }
171 }
172
173 }
174 return retVal;
175 }
176
177 constBlockVector
GetBlocksForLocus(size_t locusId) const178 GCDataStore::GetBlocksForLocus(size_t locusId) const
179 {
180 gcIdSet ids = m_structures.GetBlocksForLocus(locusId);
181 constBlockVector retVal;
182
183 for(dataFileSet::const_iterator iter=m_dataFiles.begin();
184 iter != m_dataFiles.end(); iter++)
185 {
186 const GCFile & fileRef = **iter;
187 for(size_t index=0; index < fileRef.GetParseCount(); index++)
188 {
189 const GCParse & parseRef = fileRef.GetParse(index);
190 constBlockVector blocks = parseRef.GetBlocks();
191 for(constBlockVector::const_iterator biter = blocks.begin();
192 biter != blocks.end(); biter++)
193 {
194 const GCParseBlock * blockP = *biter;
195 size_t blockId = blockP->GetId();
196 if(ids.find(blockId) != ids.end())
197 {
198 retVal.push_back(blockP);
199 }
200 }
201 }
202
203 }
204 return retVal;
205 }
206 size_t
GetDataFileCount() const207 GCDataStore::GetDataFileCount() const
208 {
209 return m_dataFiles.size();
210 }
211
212 GCFile &
GetDataFile(size_t fileIndex)213 GCDataStore::GetDataFile(size_t fileIndex)
214 {
215 for(dataFileSet::iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
216 {
217 GCFile & dataFile = **i;
218 if(dataFile.GetId() == fileIndex)
219 {
220 return dataFile;
221 }
222 }
223 wxString msg = wxString::Format(gcerr::missingFileId,(int)fileIndex);
224 throw gc_implementation_error (msg.c_str());
225 }
226
227 const GCFile &
GetDataFile(size_t fileIndex) const228 GCDataStore::GetDataFile(size_t fileIndex) const
229 {
230 for(dataFileSet::const_iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
231 {
232 const GCFile & dataFile = **i;
233 if(dataFile.GetId() == fileIndex)
234 {
235 return dataFile;
236 }
237 }
238 wxString msg = wxString::Format(gcerr::missingFileId,(int)fileIndex);
239 throw gc_implementation_error (msg.c_str());
240 }
241
242 const GCParse &
GetParse(size_t parseIndex) const243 GCDataStore::GetParse(size_t parseIndex) const
244 {
245 for(dataFileSet::const_iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
246 {
247 const GCFile & dataFile = **i;
248 size_t numParses = dataFile.GetParseCount();
249 for(size_t p=0; p < numParses; p++)
250 {
251 const GCParse & parseRef = dataFile.GetParse(p);
252 if(parseRef.GetId() == parseIndex)
253 {
254 return parseRef;
255 }
256 }
257 }
258 wxString msg = wxString::Format(gcerr::missingParseId,(int)parseIndex);
259 throw gc_implementation_error(msg.c_str());
260 }
261
262 const dataFileSet &
GetDataFiles() const263 GCDataStore::GetDataFiles() const
264 {
265 return m_dataFiles;
266 }
267
268 size_t
GetSelectedDataFileCount() const269 GCDataStore::GetSelectedDataFileCount() const
270 {
271 return m_structures.SelectedFileCount();
272 }
273
274 bool
HasNoDataFiles() const275 GCDataStore::HasNoDataFiles() const
276 {
277 return m_dataFiles.empty();
278 }
279
280 bool
HasUnparsedFiles() const281 GCDataStore::HasUnparsedFiles() const
282 {
283 return m_structures.HasUnparsedFiles();
284 }
285
286 void
RemoveDataFile(GCFile & fileRef)287 GCDataStore::RemoveDataFile(GCFile & fileRef)
288 {
289 dataFileSet::iterator matching = m_dataFiles.find(&fileRef);
290 if(matching != m_dataFiles.end())
291 {
292 wxLogVerbose(gcverbose::removedFile,fileRef.GetName().c_str());
293 gcIdSet blocks = fileRef.IdsOfAllBlocks();
294 m_structures.RemoveBlocks(blocks);
295 m_dataFiles.erase(matching);
296 delete &fileRef;
297 // EWFIX.P3 -- need an ignore method?? causes UNDO/REDO problem
298 }
299 }
300
301 void
RemoveFiles(bool selectedFilesOnly)302 GCDataStore::RemoveFiles(bool selectedFilesOnly)
303 {
304 for(dataFileSet::iterator iter=m_dataFiles.begin();
305 iter != m_dataFiles.end(); iter++)
306 {
307 GCFile & fileRef = **iter;
308 size_t fileId = fileRef.GetId();
309 if(!selectedFilesOnly || m_structures.GetFileSelection(fileId))
310 {
311 m_structures.RemoveFile(fileId);
312 RemoveDataFile(fileRef);
313 }
314
315 }
316 }
317
318 void
SelectAllFiles()319 GCDataStore::SelectAllFiles()
320 {
321 m_structures.AllFileSelectionsTo(true);
322 }
323
324 void
UnselectAllFiles()325 GCDataStore::UnselectAllFiles()
326 {
327 m_structures.AllFileSelectionsTo(false);
328 }
329
330 bool
GetSelected(const GCFile & fileRef) const331 GCDataStore::GetSelected(const GCFile & fileRef) const
332 {
333 return m_structures.GetFileSelection(fileRef.GetId());
334 }
335
336 void
SetSelected(const GCFile & fileRef,bool selected)337 GCDataStore::SetSelected(const GCFile & fileRef, bool selected)
338 {
339 m_structures.SetFileSelection(fileRef.GetId(),selected);
340 }
341
342 locVector
GetLociFor(const GCParse & parseRef,const GCLocusMatcher & locMatch)343 GCDataStore::GetLociFor(const GCParse & parseRef, const GCLocusMatcher & locMatch)
344 {
345 locVector loci;
346
347 for(size_t i=0; i < parseRef.GetLociCount(); i++)
348 {
349 GCLocusSpec spec = locMatch.GetLocSpec(i,parseRef);
350 try
351 {
352 gcLocus & loc = m_structures.GetLocus(spec.GetLocusName());
353 loci.push_back(&loc);
354 }
355 catch(const gc_missing_locus& e)
356 {
357 loc_match locmatchType = locMatch.GetLocMatchType();
358 if(locmatchType == locmatch_DEFAULT || locmatchType == locmatch_LINKED)
359 // EWFIX.BUG674 -- take out single region
360 {
361 size_t lineNumber = parseRef.GetParseLocus(i).GetLineNumber();
362 wxString fileName = parseRef.GetFileRef().GetName();
363 gcCreationInfo creationInfo = gcCreationInfo::MakeDataFileCreationInfo(lineNumber,fileName);
364 try
365 {
366 gcRegion & region = m_structures.GetRegion(spec.GetRegionName());
367 gcLocus & loc = m_structures.MakeLocus(region,spec.GetLocusName(),spec.GetBlessedLocus(),creationInfo);
368 gcGeneralDataType dType = parseRef.GetDataType();
369 if(dType.size()== 1)
370 {
371 loc.SetDataType(*dType.begin());
372 }
373 loci.push_back(&loc);
374 }
375 catch(const missing_region& r)
376 {
377 gcRegion & region = m_structures.MakeRegion(spec.GetRegionName(),spec.GetBlessedRegion());
378 gcLocus & loc = m_structures.MakeLocus(region,spec.GetLocusName(),spec.GetBlessedLocus(),creationInfo);
379 gcGeneralDataType dType = parseRef.GetDataType();
380 if(dType.size() == 1)
381 {
382 loc.SetDataType(*dType.begin());
383 }
384 loci.push_back(&loc);
385 }
386 }
387 else
388 {
389 throw;
390 }
391 }
392 }
393
394 return loci;
395 }
396
397 popVector
GetPopsFor(const GCParse & parseRef,const GCPopMatcher & popMatch)398 GCDataStore::GetPopsFor(const GCParse & parseRef, const GCPopMatcher & popMatch)
399 {
400 popVector pops;
401
402 for(size_t i=0; i < parseRef.GetPopCount(); i++)
403 {
404 GCPopSpec spec = popMatch.GetPopSpec(i,parseRef);
405 try
406 {
407 gcPopulation & pop = m_structures.GetPop(spec.GetName());
408 pops.push_back(&pop);
409 }
410 catch(const gc_missing_population& e)
411 {
412 if(popMatch.GetPopMatchType() == popmatch_DEFAULT)
413 {
414 gcPopulation & pop = m_structures.MakePop(spec.GetName(),spec.GetBlessed());
415 pops.push_back(&pop);
416 }
417 else
418 {
419 throw;
420 }
421 }
422 }
423
424 return pops;
425 }
426
427 bool
CanAssignParseLocus(const GCParseLocus & pLocus,const gcLocus & locus) const428 GCDataStore::CanAssignParseLocus(const GCParseLocus & pLocus, const gcLocus & locus) const
429 {
430 // EWFIX.P3 -- refactor with AssignParseLocus
431 gcSpecificDataType locusType = locus.GetDataType();
432 gcGeneralDataType parseType = pLocus.GetDataType();
433
434 if(locusType != sdatatype_NONE_SET)
435 {
436 if(parseType.find(locusType) == parseType.end())
437 {
438 return false;
439 }
440 }
441 if(locus.HasNumMarkers())
442 {
443 if(locus.GetNumMarkers() != pLocus.GetNumMarkers())
444 {
445 return false;
446 }
447 }
448 return true;
449 }
450
451 void
AssignParseLocus(const GCParseLocus & pLocus,gcLocus & locus)452 GCDataStore::AssignParseLocus(const GCParseLocus & pLocus, gcLocus & locus)
453 {
454 gcSpecificDataType locusType = locus.GetDataType();
455 gcGeneralDataType parseType = pLocus.GetDataType();
456
457 if(locusType != sdatatype_NONE_SET)
458 {
459 if(parseType.find(locusType) == parseType.end())
460 {
461 throw gc_locus_types_mismatch(pLocus.GetName(),locus.GetName(),ToWxString(parseType),ToWxString(locusType));
462 }
463 }
464 else
465 {
466 if(parseType.size() == 1)
467 {
468 locus.SetDataType(*(parseType.begin()));
469 }
470 }
471
472 // number of markers
473 if(locus.HasNumMarkers())
474 {
475 if(locus.GetNumMarkers() != pLocus.GetNumMarkers())
476 {
477 throw gc_locus_site_count_mismatch(pLocus.GetLongName(),locus.GetLongName(),pLocus.GetNumMarkers(),locus.GetNumMarkers());
478 }
479 }
480 else
481 {
482 locus.SetNumMarkers(pLocus.GetNumMarkers());
483 }
484 }
485
486 void
AssignPop(const GCParsePop & pPop,gcPopulation & pop)487 GCDataStore::AssignPop(const GCParsePop & pPop, gcPopulation & pop)
488 {
489 // EWFIX.P5 LATER -- right now all pops are mergeable
490 }
491
492 void
SetParseChoice(const GCParse & parseRef,const GCPopMatcher & popMatch,const GCLocusMatcher & locMatch)493 GCDataStore::SetParseChoice(const GCParse & parseRef,
494 const GCPopMatcher & popMatch,
495 const GCLocusMatcher & locMatch)
496 {
497 // remove assignments to old parse
498 const GCFile & fileRef = parseRef.GetFileRef();
499 if(m_structures.HasParse(fileRef))
500 {
501 const GCParse & oldParse = m_structures.GetParse(fileRef);
502 m_structures.UnsetParse(oldParse);
503 UnsetParseChoice(m_structures.GetParse(fileRef));
504 }
505
506 // check we can assign to these pops
507 popVector pops = GetPopsFor(parseRef,popMatch);
508 assert (pops.size() == parseRef.GetPopCount());
509 for(size_t i=0; i < pops.size(); i++)
510 {
511 const GCParsePop & parsePop = parseRef.GetParsePop(i);
512 gcPopulation & pop = *(pops[i]);
513 AssignPop(parsePop,pop);
514 }
515
516 locVector locs = GetLociFor(parseRef,locMatch);
517 assert (locs.size() == parseRef.GetLociCount());
518
519 for(size_t lIndex = 0; lIndex < locs.size(); lIndex++)
520 {
521 const GCParseLocus & parseLocus = parseRef.GetParseLocus(lIndex);
522 gcLocus & locus = *(locs[lIndex]);
523 AssignParseLocus(parseLocus,locus);
524
525 for(size_t pIndex = 0; pIndex < pops.size(); pIndex++)
526 {
527 const GCParseBlock & block = parseRef.GetBlock(pIndex,lIndex);
528 size_t blockId = block.GetId();
529 m_structures.AssignBlock(blockId,pops[pIndex]->GetId(),locs[lIndex]->GetId());
530 }
531 }
532
533 // if we make it this far, update the parse in structures
534 m_structures.SetParse(parseRef);
535
536 // and document the pop and locus matchers in file info
537 GetStructures().SetPopMatcher(fileRef,popMatch);
538 GetStructures().SetLocusMatcher(fileRef,locMatch);
539
540 }
541
542 void
UnsetParseChoice(const GCParse & parseRef)543 GCDataStore::UnsetParseChoice(const GCParse & parseRef)
544 {
545 //
546 constBlockVector blocks = parseRef.GetBlocks();
547 for(constBlockVector::iterator i=blocks.begin(); i != blocks.end(); i++)
548 {
549 const GCParseBlock & block = *(*i);
550 m_structures.RemoveBlockAssignment(block.GetId());
551 }
552 m_structures.UnsetParse(parseRef);
553 }
554
555 const GCParse &
GetParse(const GCFile & file) const556 GCDataStore::GetParse(const GCFile & file) const
557 {
558 return m_structures.GetParse(file);
559 }
560
561 const GCParse &
GetParse(const GCFile & file,size_t index) const562 GCDataStore::GetParse(const GCFile & file, size_t index) const
563 {
564 return file.GetParse(index);
565 }
566
567 bool
HasParse(const GCFile & file) const568 GCDataStore::HasParse(const GCFile & file) const
569 {
570 return m_structures.HasParse(file);
571 }
572
573 gcGeneralDataType
GetLegalLocusTypes(size_t locusId) const574 GCDataStore::GetLegalLocusTypes(size_t locusId) const
575 {
576 gcGeneralDataType allowedTypes = gcdata::allDataTypes();
577 constBlockVector blocks = GetBlocksForLocus(locusId);
578
579 for(constBlockVector::const_iterator i = blocks.begin(); i != blocks.end(); i++)
580 {
581 const GCParseBlock & pb = **i;
582 const GCParse & parse = pb.GetParse();
583 gcGeneralDataType thisType = parse.GetDataType();
584 allowedTypes.Intersect(thisType);
585 }
586 return allowedTypes;
587 }
588
589 bool
FileInducesHaps(size_t fileId) const590 GCDataStore::FileInducesHaps(size_t fileId) const
591 {
592 const GCFile & fileRef = GetDataFile(fileId);
593
594 for(size_t index=0; index < fileRef.GetParseCount(); index++)
595 {
596 const GCParse & parseRef = fileRef.GetParse(index);
597 constBlockVector blocks = parseRef.GetBlocks();
598 for(constBlockVector::const_iterator biter = blocks.begin();
599 biter != blocks.end(); biter++)
600 {
601 const GCParseBlock & pb = **biter;
602 const GCParseSamples & samples = pb.GetSamples();
603 for(size_t j = 0; j < samples.size(); j++)
604 {
605 const GCParseSample & s = *(samples[j]);
606 if(s.GetSequencesPerLabel() > 1)
607 {
608 return true;
609 }
610 }
611 }
612 }
613 return false;
614 }
615
616 //____________________________________________________________________________________
617