1 // $Id: gc_datastore_files.cpp,v 1.36 2011/06/22 18:22:22 jmcgill Exp $
2 
3 /*
4   Copyright 2002  Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 
13 #include "gc_creation_info.h"
14 #include "gc_data.h"
15 #include "gc_datastore.h"
16 #include "gc_file.h"
17 #include "gc_infile_err.h"
18 #include "gc_loci_match.h"
19 #include "gc_locus_err.h"
20 #include "gc_parse_block.h"
21 #include "gc_parse_locus.h"
22 #include "gc_parse_sample.h"
23 #include "gc_pop_match.h"
24 #include "gc_strings.h"
25 #include "gc_structures_err.h"
26 
27 #include "wx/file.h"
28 #include "wx/filename.h"
29 #include "wx/log.h"
30 
31 //------------------------------------------------------------------------------------
32 
33 GCFile &
AddDataFile(wxString fullPathFileName,GCFileFormat format,gcGeneralDataType dataType,GCInterleaving interleaving)34 GCDataStore::AddDataFile(wxString           fullPathFileName,
35                          GCFileFormat    format,
36                          gcGeneralDataType   dataType,
37                          GCInterleaving  interleaving)
38 // version to use when reading from batch command file
39 {
40     // test for readability
41     if(! ::wxFileExists(fullPathFileName))
42     {
43         throw missing_file_error(fullPathFileName.c_str());
44     }
45 
46     GCFile * file = new GCFile(*this,fullPathFileName);
47     GCParseVec * parseVecP = new GCParseVec();
48 
49     try
50     {
51         GCParse * parse = OneParse(*file,format,dataType,interleaving);
52         parseVecP->push_back(parse);
53     }
54     catch(const gc_ex& e)
55     {
56         parseVecP->NukeContents();
57         delete parseVecP;
58         delete file;
59         throw;
60     }
61 
62     file->SetParses(parseVecP);
63     m_structures.AddFile(*file);
64 
65     m_dataFiles.insert(file);
66     wxLogVerbose(gcverbose::addedFile,fullPathFileName.c_str());
67     return *file;
68 }
69 
70 GCFile &
AddDataFile(wxString fullPathFileName)71 GCDataStore::AddDataFile(wxString fullPathFileName)
72 {
73     // test for readability
74     if(! ::wxFileExists(fullPathFileName))
75     {
76         throw missing_file_error (fullPathFileName.c_str());
77     }
78 
79     GCFile * file = new GCFile(*this,fullPathFileName);
80 
81     GCParseVec * parseVecP = AllParsesForFile(*file);
82     assert(parseVecP != NULL);
83     if (parseVecP->size() == 0)
84     {
85         delete parseVecP;
86         delete file;
87         throw unparsable_file_error(fullPathFileName.c_str());
88     }
89 
90     file->SetParses(parseVecP);
91     m_structures.AddFile(*file);
92     m_dataFiles.insert(file);
93     wxLogVerbose(gcverbose::addedFile,fullPathFileName.c_str());
94 
95     try
96     {
97         if(file->GetParseCount() == 1)
98         {
99             GCPopMatcher p(popmatch_DEFAULT);
100             GCLocusMatcher l(locmatch_DEFAULT);
101             SetParseChoice(file->GetParse(0),p,l);
102         }
103     }
104     catch (const duplicate_file_error& e)
105     {
106         GCWarning(e.what());
107     }
108     catch (const missing_file_error& e)
109     {
110         GCWarning(e.what());
111     }
112     catch (const unparsable_file_error& e)
113     {
114         GCWarning(e.what());
115     }
116 
117     return *file;
118 }
119 
120 const GCParseBlock *
GetParseBlock(size_t blockId) const121 GCDataStore::GetParseBlock(size_t blockId) const
122 {
123     for(dataFileSet::const_iterator iter=m_dataFiles.begin();
124         iter != m_dataFiles.end(); iter++)
125     {
126         const GCFile & fileRef = **iter;
127         for(size_t index=0; index < fileRef.GetParseCount(); index++)
128         {
129             const GCParse & parseRef = fileRef.GetParse(index);
130             constBlockVector blocks = parseRef.GetBlocks();
131             for(constBlockVector::const_iterator biter = blocks.begin();
132                 biter != blocks.end(); biter++)
133             {
134                 const GCParseBlock * blockP = *biter;
135                 if(blockId == blockP->GetId())
136                 {
137                     return blockP;
138                 }
139             }
140         }
141 
142     }
143     assert(false);
144     return NULL;
145 }
146 
147 constBlockVector
GetBlocks(size_t popId,size_t locusId) const148 GCDataStore::GetBlocks(size_t popId, size_t locusId) const
149 {
150     gcIdSet ids = m_structures.GetBlockIds(popId,locusId);
151     constBlockVector retVal;
152 
153     for(dataFileSet::const_iterator iter=m_dataFiles.begin();
154         iter != m_dataFiles.end(); iter++)
155     {
156         const GCFile & fileRef = **iter;
157         for(size_t index=0; index < fileRef.GetParseCount(); index++)
158         {
159             const GCParse & parseRef = fileRef.GetParse(index);
160             constBlockVector blocks = parseRef.GetBlocks();
161             for(constBlockVector::const_iterator biter = blocks.begin();
162                 biter != blocks.end(); biter++)
163             {
164                 const GCParseBlock * blockP = *biter;
165                 size_t blockId = blockP->GetId();
166                 if(ids.find(blockId) != ids.end())
167                 {
168                     retVal.push_back(blockP);
169                 }
170             }
171         }
172 
173     }
174     return retVal;
175 }
176 
177 constBlockVector
GetBlocksForLocus(size_t locusId) const178 GCDataStore::GetBlocksForLocus(size_t locusId) const
179 {
180     gcIdSet ids = m_structures.GetBlocksForLocus(locusId);
181     constBlockVector retVal;
182 
183     for(dataFileSet::const_iterator iter=m_dataFiles.begin();
184         iter != m_dataFiles.end(); iter++)
185     {
186         const GCFile & fileRef = **iter;
187         for(size_t index=0; index < fileRef.GetParseCount(); index++)
188         {
189             const GCParse & parseRef = fileRef.GetParse(index);
190             constBlockVector blocks = parseRef.GetBlocks();
191             for(constBlockVector::const_iterator biter = blocks.begin();
192                 biter != blocks.end(); biter++)
193             {
194                 const GCParseBlock * blockP = *biter;
195                 size_t blockId = blockP->GetId();
196                 if(ids.find(blockId) != ids.end())
197                 {
198                     retVal.push_back(blockP);
199                 }
200             }
201         }
202 
203     }
204     return retVal;
205 }
206 size_t
GetDataFileCount() const207 GCDataStore::GetDataFileCount() const
208 {
209     return m_dataFiles.size();
210 }
211 
212 GCFile &
GetDataFile(size_t fileIndex)213 GCDataStore::GetDataFile(size_t fileIndex)
214 {
215     for(dataFileSet::iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
216     {
217         GCFile & dataFile = **i;
218         if(dataFile.GetId() == fileIndex)
219         {
220             return dataFile;
221         }
222     }
223     wxString msg = wxString::Format(gcerr::missingFileId,(int)fileIndex);
224     throw gc_implementation_error (msg.c_str());
225 }
226 
227 const GCFile &
GetDataFile(size_t fileIndex) const228 GCDataStore::GetDataFile(size_t fileIndex) const
229 {
230     for(dataFileSet::const_iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
231     {
232         const GCFile & dataFile = **i;
233         if(dataFile.GetId() == fileIndex)
234         {
235             return dataFile;
236         }
237     }
238     wxString msg = wxString::Format(gcerr::missingFileId,(int)fileIndex);
239     throw gc_implementation_error (msg.c_str());
240 }
241 
242 const GCParse &
GetParse(size_t parseIndex) const243 GCDataStore::GetParse(size_t parseIndex) const
244 {
245     for(dataFileSet::const_iterator i = m_dataFiles.begin(); i != m_dataFiles.end(); i++)
246     {
247         const GCFile & dataFile = **i;
248         size_t numParses = dataFile.GetParseCount();
249         for(size_t p=0; p < numParses; p++)
250         {
251             const GCParse & parseRef = dataFile.GetParse(p);
252             if(parseRef.GetId() == parseIndex)
253             {
254                 return parseRef;
255             }
256         }
257     }
258     wxString msg = wxString::Format(gcerr::missingParseId,(int)parseIndex);
259     throw gc_implementation_error(msg.c_str());
260 }
261 
262 const dataFileSet &
GetDataFiles() const263 GCDataStore::GetDataFiles() const
264 {
265     return m_dataFiles;
266 }
267 
268 size_t
GetSelectedDataFileCount() const269 GCDataStore::GetSelectedDataFileCount() const
270 {
271     return m_structures.SelectedFileCount();
272 }
273 
274 bool
HasNoDataFiles() const275 GCDataStore::HasNoDataFiles() const
276 {
277     return m_dataFiles.empty();
278 }
279 
280 bool
HasUnparsedFiles() const281 GCDataStore::HasUnparsedFiles() const
282 {
283     return m_structures.HasUnparsedFiles();
284 }
285 
286 void
RemoveDataFile(GCFile & fileRef)287 GCDataStore::RemoveDataFile(GCFile & fileRef)
288 {
289     dataFileSet::iterator matching = m_dataFiles.find(&fileRef);
290     if(matching != m_dataFiles.end())
291     {
292         wxLogVerbose(gcverbose::removedFile,fileRef.GetName().c_str());
293         gcIdSet blocks = fileRef.IdsOfAllBlocks();
294         m_structures.RemoveBlocks(blocks);
295         m_dataFiles.erase(matching);
296         delete &fileRef;
297         // EWFIX.P3 -- need an ignore method?? causes UNDO/REDO problem
298     }
299 }
300 
301 void
RemoveFiles(bool selectedFilesOnly)302 GCDataStore::RemoveFiles(bool selectedFilesOnly)
303 {
304     for(dataFileSet::iterator iter=m_dataFiles.begin();
305         iter != m_dataFiles.end(); iter++)
306     {
307         GCFile & fileRef = **iter;
308         size_t fileId = fileRef.GetId();
309         if(!selectedFilesOnly || m_structures.GetFileSelection(fileId))
310         {
311             m_structures.RemoveFile(fileId);
312             RemoveDataFile(fileRef);
313         }
314 
315     }
316 }
317 
318 void
SelectAllFiles()319 GCDataStore::SelectAllFiles()
320 {
321     m_structures.AllFileSelectionsTo(true);
322 }
323 
324 void
UnselectAllFiles()325 GCDataStore::UnselectAllFiles()
326 {
327     m_structures.AllFileSelectionsTo(false);
328 }
329 
330 bool
GetSelected(const GCFile & fileRef) const331 GCDataStore::GetSelected(const GCFile & fileRef) const
332 {
333     return m_structures.GetFileSelection(fileRef.GetId());
334 }
335 
336 void
SetSelected(const GCFile & fileRef,bool selected)337 GCDataStore::SetSelected(const GCFile & fileRef, bool selected)
338 {
339     m_structures.SetFileSelection(fileRef.GetId(),selected);
340 }
341 
342 locVector
GetLociFor(const GCParse & parseRef,const GCLocusMatcher & locMatch)343 GCDataStore::GetLociFor(const GCParse & parseRef, const GCLocusMatcher & locMatch)
344 {
345     locVector loci;
346 
347     for(size_t i=0; i < parseRef.GetLociCount(); i++)
348     {
349         GCLocusSpec spec = locMatch.GetLocSpec(i,parseRef);
350         try
351         {
352             gcLocus & loc = m_structures.GetLocus(spec.GetLocusName());
353             loci.push_back(&loc);
354         }
355         catch(const gc_missing_locus& e)
356         {
357             loc_match locmatchType = locMatch.GetLocMatchType();
358             if(locmatchType == locmatch_DEFAULT || locmatchType == locmatch_LINKED)
359                 // EWFIX.BUG674 -- take out single region
360             {
361                 size_t lineNumber = parseRef.GetParseLocus(i).GetLineNumber();
362                 wxString fileName = parseRef.GetFileRef().GetName();
363                 gcCreationInfo creationInfo = gcCreationInfo::MakeDataFileCreationInfo(lineNumber,fileName);
364                 try
365                 {
366                     gcRegion & region = m_structures.GetRegion(spec.GetRegionName());
367                     gcLocus & loc = m_structures.MakeLocus(region,spec.GetLocusName(),spec.GetBlessedLocus(),creationInfo);
368                     gcGeneralDataType dType = parseRef.GetDataType();
369                     if(dType.size()== 1)
370                     {
371                         loc.SetDataType(*dType.begin());
372                     }
373                     loci.push_back(&loc);
374                 }
375                 catch(const missing_region& r)
376                 {
377                     gcRegion & region = m_structures.MakeRegion(spec.GetRegionName(),spec.GetBlessedRegion());
378                     gcLocus & loc = m_structures.MakeLocus(region,spec.GetLocusName(),spec.GetBlessedLocus(),creationInfo);
379                     gcGeneralDataType dType = parseRef.GetDataType();
380                     if(dType.size() == 1)
381                     {
382                         loc.SetDataType(*dType.begin());
383                     }
384                     loci.push_back(&loc);
385                 }
386             }
387             else
388             {
389                 throw;
390             }
391         }
392     }
393 
394     return loci;
395 }
396 
397 popVector
GetPopsFor(const GCParse & parseRef,const GCPopMatcher & popMatch)398 GCDataStore::GetPopsFor(const GCParse & parseRef, const GCPopMatcher & popMatch)
399 {
400     popVector pops;
401 
402     for(size_t i=0; i < parseRef.GetPopCount(); i++)
403     {
404         GCPopSpec spec = popMatch.GetPopSpec(i,parseRef);
405         try
406         {
407             gcPopulation & pop = m_structures.GetPop(spec.GetName());
408             pops.push_back(&pop);
409         }
410         catch(const gc_missing_population& e)
411         {
412             if(popMatch.GetPopMatchType() == popmatch_DEFAULT)
413             {
414                 gcPopulation & pop = m_structures.MakePop(spec.GetName(),spec.GetBlessed());
415                 pops.push_back(&pop);
416             }
417             else
418             {
419                 throw;
420             }
421         }
422     }
423 
424     return pops;
425 }
426 
427 bool
CanAssignParseLocus(const GCParseLocus & pLocus,const gcLocus & locus) const428 GCDataStore::CanAssignParseLocus(const GCParseLocus & pLocus, const gcLocus & locus) const
429 {
430     // EWFIX.P3 -- refactor with AssignParseLocus
431     gcSpecificDataType  locusType   = locus.GetDataType();
432     gcGeneralDataType   parseType   = pLocus.GetDataType();
433 
434     if(locusType != sdatatype_NONE_SET)
435     {
436         if(parseType.find(locusType) == parseType.end())
437         {
438             return false;
439         }
440     }
441     if(locus.HasNumMarkers())
442     {
443         if(locus.GetNumMarkers() != pLocus.GetNumMarkers())
444         {
445             return false;
446         }
447     }
448     return true;
449 }
450 
451 void
AssignParseLocus(const GCParseLocus & pLocus,gcLocus & locus)452 GCDataStore::AssignParseLocus(const GCParseLocus & pLocus, gcLocus & locus)
453 {
454     gcSpecificDataType  locusType   = locus.GetDataType();
455     gcGeneralDataType   parseType   = pLocus.GetDataType();
456 
457     if(locusType != sdatatype_NONE_SET)
458     {
459         if(parseType.find(locusType) == parseType.end())
460         {
461             throw gc_locus_types_mismatch(pLocus.GetName(),locus.GetName(),ToWxString(parseType),ToWxString(locusType));
462         }
463     }
464     else
465     {
466         if(parseType.size() == 1)
467         {
468             locus.SetDataType(*(parseType.begin()));
469         }
470     }
471 
472     // number of markers
473     if(locus.HasNumMarkers())
474     {
475         if(locus.GetNumMarkers() != pLocus.GetNumMarkers())
476         {
477             throw gc_locus_site_count_mismatch(pLocus.GetLongName(),locus.GetLongName(),pLocus.GetNumMarkers(),locus.GetNumMarkers());
478         }
479     }
480     else
481     {
482         locus.SetNumMarkers(pLocus.GetNumMarkers());
483     }
484 }
485 
486 void
AssignPop(const GCParsePop & pPop,gcPopulation & pop)487 GCDataStore::AssignPop(const GCParsePop & pPop, gcPopulation & pop)
488 {
489     // EWFIX.P5 LATER -- right now all pops are mergeable
490 }
491 
492 void
SetParseChoice(const GCParse & parseRef,const GCPopMatcher & popMatch,const GCLocusMatcher & locMatch)493 GCDataStore::SetParseChoice(const GCParse &         parseRef,
494                             const GCPopMatcher &    popMatch,
495                             const GCLocusMatcher &  locMatch)
496 {
497     // remove assignments to old parse
498     const GCFile & fileRef = parseRef.GetFileRef();
499     if(m_structures.HasParse(fileRef))
500     {
501         const GCParse & oldParse = m_structures.GetParse(fileRef);
502         m_structures.UnsetParse(oldParse);
503         UnsetParseChoice(m_structures.GetParse(fileRef));
504     }
505 
506     // check we can assign to these pops
507     popVector pops = GetPopsFor(parseRef,popMatch);
508     assert (pops.size() == parseRef.GetPopCount());
509     for(size_t i=0; i < pops.size(); i++)
510     {
511         const GCParsePop & parsePop = parseRef.GetParsePop(i);
512         gcPopulation & pop = *(pops[i]);
513         AssignPop(parsePop,pop);
514     }
515 
516     locVector locs = GetLociFor(parseRef,locMatch);
517     assert (locs.size() == parseRef.GetLociCount());
518 
519     for(size_t lIndex = 0; lIndex < locs.size(); lIndex++)
520     {
521         const GCParseLocus & parseLocus = parseRef.GetParseLocus(lIndex);
522         gcLocus & locus = *(locs[lIndex]);
523         AssignParseLocus(parseLocus,locus);
524 
525         for(size_t pIndex = 0; pIndex < pops.size(); pIndex++)
526         {
527             const GCParseBlock & block = parseRef.GetBlock(pIndex,lIndex);
528             size_t blockId = block.GetId();
529             m_structures.AssignBlock(blockId,pops[pIndex]->GetId(),locs[lIndex]->GetId());
530         }
531     }
532 
533     // if we make it this far, update the parse in structures
534     m_structures.SetParse(parseRef);
535 
536     // and document the pop and locus matchers in file info
537     GetStructures().SetPopMatcher(fileRef,popMatch);
538     GetStructures().SetLocusMatcher(fileRef,locMatch);
539 
540 }
541 
542 void
UnsetParseChoice(const GCParse & parseRef)543 GCDataStore::UnsetParseChoice(const GCParse & parseRef)
544 {
545     //
546     constBlockVector blocks = parseRef.GetBlocks();
547     for(constBlockVector::iterator i=blocks.begin(); i != blocks.end(); i++)
548     {
549         const GCParseBlock & block = *(*i);
550         m_structures.RemoveBlockAssignment(block.GetId());
551     }
552     m_structures.UnsetParse(parseRef);
553 }
554 
555 const GCParse &
GetParse(const GCFile & file) const556 GCDataStore::GetParse(const GCFile & file) const
557 {
558     return m_structures.GetParse(file);
559 }
560 
561 const GCParse &
GetParse(const GCFile & file,size_t index) const562 GCDataStore::GetParse(const GCFile & file, size_t index) const
563 {
564     return file.GetParse(index);
565 }
566 
567 bool
HasParse(const GCFile & file) const568 GCDataStore::HasParse(const GCFile & file) const
569 {
570     return m_structures.HasParse(file);
571 }
572 
573 gcGeneralDataType
GetLegalLocusTypes(size_t locusId) const574 GCDataStore::GetLegalLocusTypes(size_t locusId) const
575 {
576     gcGeneralDataType allowedTypes = gcdata::allDataTypes();
577     constBlockVector blocks = GetBlocksForLocus(locusId);
578 
579     for(constBlockVector::const_iterator i = blocks.begin(); i != blocks.end(); i++)
580     {
581         const GCParseBlock & pb = **i;
582         const GCParse & parse = pb.GetParse();
583         gcGeneralDataType thisType = parse.GetDataType();
584         allowedTypes.Intersect(thisType);
585     }
586     return allowedTypes;
587 }
588 
589 bool
FileInducesHaps(size_t fileId) const590 GCDataStore::FileInducesHaps(size_t fileId) const
591 {
592     const GCFile & fileRef = GetDataFile(fileId);
593 
594     for(size_t index=0; index < fileRef.GetParseCount(); index++)
595     {
596         const GCParse & parseRef = fileRef.GetParse(index);
597         constBlockVector blocks = parseRef.GetBlocks();
598         for(constBlockVector::const_iterator biter = blocks.begin();
599             biter != blocks.end(); biter++)
600         {
601             const GCParseBlock & pb = **biter;
602             const GCParseSamples & samples = pb.GetSamples();
603             for(size_t j = 0; j < samples.size(); j++)
604             {
605                 const GCParseSample & s = *(samples[j]);
606                 if(s.GetSequencesPerLabel() > 1)
607                 {
608                     return true;
609                 }
610             }
611         }
612     }
613     return false;
614 }
615 
616 //____________________________________________________________________________________
617