1 // $Id: gc_datastore_writebatch.cpp,v 1.10 2011/06/22 18:22:22 jmcgill Exp $
2 
3 /*
4   Copyright 2002  Mary Kuhner, Jon Yamato, and Joseph Felsenstein
5 
6   This software is distributed free of charge for non-commercial use
7   and is copyrighted.  Of course, we do not guarantee that the software
8   works, and are not responsible for any damage you may cause or have.
9 */
10 
11 #include <cassert>
12 
13 #include "cnv_strings.h"
14 #include "gc_data.h"
15 #include "gc_datastore.h"
16 #include "gc_parse_block.h"
17 #include "gc_phase_info.h"
18 #include "gc_strings.h"
19 #include "tinyxml.h"
20 #include "wx/datetime.h"
21 
22 //------------------------------------------------------------------------------------
23 
24 TiXmlElement *
CmdExportIndividuals() const25 GCDataStore::CmdExportIndividuals() const
26 {
27     const stringToRecord & indRecords = m_phaseInfo.GetIndividualRecords();
28     if(indRecords.empty())
29     {
30         return NULL;
31     }
32     TiXmlElement * indsElem = new TiXmlElement(cnvstr::TAG_INDIVIDUALS.c_str());
33 
34     for(stringToRecord::const_iterator i=indRecords.begin(); i != indRecords.end(); i++)
35     {
36         const wxString & iName = (*i).first;
37         const gcPhaseRecord & rec = (*i).second;
38 
39         TiXmlElement * iElem = new TiXmlElement(cnvstr::TAG_INDIVIDUAL.c_str());
40         indsElem->LinkEndChild(iElem);
41 
42         TiXmlElement * name = new TiXmlElement(cnvstr::TAG_NAME.c_str());
43         iElem->LinkEndChild(name);
44         TiXmlText * iNameText = new TiXmlText(iName.c_str());
45         name->LinkEndChild(iNameText);
46 
47         const gcIdSet & phenoIds = rec.GetPhenotypeIds();
48         for(gcIdSet::const_iterator pIter = phenoIds.begin(); pIter != phenoIds.end(); pIter++)
49         {
50             const gcPhenotype & pheno = GetStructures().GetPhenotype(*pIter);
51             if(pheno.HasExplicitName())
52             {
53                 TiXmlElement * phenoRefE = new TiXmlElement(cnvstr::TAG_HAS_PHENOTYPE.c_str());
54                 iElem->LinkEndChild(phenoRefE);
55                 TiXmlText * phenoNameText = new TiXmlText(pheno.GetName().c_str());
56                 phenoRefE->LinkEndChild(phenoNameText);
57             }
58             else
59             {
60                 iElem->LinkEndChild(CmdExportGenoReso(pheno));
61             }
62         }
63 
64         const wxArrayString & sampleNames = rec.GetSamples();
65         for(size_t index=0; index < sampleNames.Count(); index++)
66         {
67             const wxString & sampleName = sampleNames[index];
68             TiXmlElement * sampleE = new TiXmlElement(cnvstr::TAG_SAMPLE.c_str());
69             iElem->LinkEndChild(sampleE);
70             TiXmlElement * nameE = new TiXmlElement(cnvstr::TAG_NAME.c_str());
71             sampleE->LinkEndChild(nameE);
72             TiXmlText * nameText = new TiXmlText(sampleName);
73             nameE->LinkEndChild(nameText);
74         }
75     }
76 
77     return indsElem;
78 }
79 
80 TiXmlElement *
CmdExportInfile(const GCFile & fileRef) const81 GCDataStore::CmdExportInfile(const GCFile& fileRef) const
82 {
83     TiXmlElement * fileE = new TiXmlElement(cnvstr::TAG_INFILE.c_str());
84 
85     // file attributes
86     GCFileFormat fform = fileRef.GetFormat();
87     fileE->SetAttribute(cnvstr::ATTR_FORMAT.c_str(),ToWxString(fform).c_str());
88 
89     gcGeneralDataType dtype;
90 
91     if(GetStructures().HasParse(fileRef))
92     {
93         const GCParse & parseRef = GetStructures().GetParse(fileRef);
94         constBlockVector blocks = parseRef.GetBlocks();
95         if(!blocks.empty())
96         {
97             const GCParseBlock * blockP = blocks[0];
98             assert(blockP != NULL);
99             size_t locusId = GetStructures().GetLocusForBlock(blockP->GetId());
100             const gcLocus & locusRef = GetStructures().GetLocus(locusId);
101             dtype = locusRef.GetDataType();
102         }
103         else
104         {
105             dtype = parseRef.GetDataType();
106         }
107     }
108 
109     if(dtype.size() != 1)
110     {
111         gcGeneralDataType dtype = fileRef.GetGeneralDataType();
112     }
113 
114     if(dtype.size() != 1)
115         // put comment in to tell user what to do
116     {
117         TiXmlComment * comment = new TiXmlComment();
118         comment->SetValue(gcstr::instructionsMultipleDataTypes.c_str());
119         fileE->LinkEndChild(comment);
120 
121     }
122     fileE->SetAttribute(cnvstr::ATTR_DATATYPE.c_str(),ToWxString(dtype).c_str());
123 
124     GCInterleaving inter = fileRef.GetInterleaving();
125     if(inter == interleaving_MOOT)
126         // sequences fit on one line, so assume
127         // it's sequential
128     {
129         inter = interleaving_SEQUENTIAL;
130     }
131     fileE->SetAttribute(cnvstr::ATTR_SEQUENCEALIGNMENT.c_str(),ToWxString(inter));
132 
133     // name
134     TiXmlElement * name = new TiXmlElement(cnvstr::TAG_NAME.c_str());
135     TiXmlText * fname = new TiXmlText(fileRef.GetName().c_str());
136     name->LinkEndChild(fname);
137     fileE->LinkEndChild(name);
138 
139     const GCLocusMatcher & locMatch = GetStructures().GetLocusMatcher(fileRef);
140     loc_match locMatchType = locMatch.GetLocMatchType();
141 
142     TiXmlElement * lMatch = new TiXmlElement(cnvstr::TAG_SEGMENTS_MATCHING.c_str());
143     fileE->LinkEndChild(lMatch);
144     lMatch->SetAttribute(cnvstr::ATTR_TYPE.c_str(),ToWxString(locMatchType).c_str());
145 
146     if(locMatchType != locmatch_DEFAULT)
147     {
148         if(locMatchType == locmatch_SINGLE)
149         {
150             const wxArrayString & locNames = locMatch.GetLociNames();
151             TiXmlText * lname = new TiXmlText(locNames[0].c_str());
152             lMatch->LinkEndChild(lname);
153         }
154 
155         if(locMatchType == locmatch_VECTOR)
156         {
157             const wxArrayString & locNames = locMatch.GetLociNames();
158             for(size_t i=0; i < locNames.Count(); i++)
159             {
160                 TiXmlElement * locName = new TiXmlElement(cnvstr::TAG_SEGMENT_NAME.c_str());
161                 lMatch->LinkEndChild(locName);
162                 TiXmlText * lname = new TiXmlText(locNames[i].c_str());
163                 locName->LinkEndChild(lname);
164             }
165         }
166     }
167 
168     const GCPopMatcher & popMatch = GetStructures().GetPopMatcher(fileRef);
169     pop_match popMatchType = popMatch.GetPopMatchType();
170 
171     TiXmlElement * pMatch = new TiXmlElement(cnvstr::TAG_POP_MATCHING.c_str());
172     fileE->LinkEndChild(pMatch);
173     pMatch->SetAttribute(cnvstr::ATTR_TYPE.c_str(),ToWxString(popMatchType).c_str());
174 
175     if(popMatchType != popmatch_DEFAULT)
176     {
177 
178         if(popMatchType == popmatch_SINGLE)
179         {
180             const wxArrayString & popNames = popMatch.GetPopNames();
181             TiXmlText * pname = new TiXmlText(popNames[0].c_str());
182             pMatch->LinkEndChild(pname);
183         }
184 
185         if(popMatchType == popmatch_VECTOR)
186         {
187             const wxArrayString & popNames = popMatch.GetPopNames();
188             for(size_t i=0; i < popNames.Count(); i++)
189             {
190                 TiXmlElement * popName = new TiXmlElement(cnvstr::TAG_POP_NAME.c_str());
191                 pMatch->LinkEndChild(popName);
192                 TiXmlText * pname = new TiXmlText(popNames[i].c_str());
193                 popName->LinkEndChild(pname);
194             }
195         }
196     }
197 
198     if(GetStructures().HasHapFileAdjacent(fileRef.GetId()))
199     {
200         size_t numAdj = GetStructures().GetHapFileAdjacent(fileRef.GetId());
201         TiXmlElement * adj = new TiXmlElement(cnvstr::TAG_INDIVIDUALS_FROM_SAMPLES.c_str());
202         adj->SetAttribute(cnvstr::ATTR_TYPE.c_str(),cnvstr::ATTR_VAL_BYADJACENCY.c_str());
203         fileE->LinkEndChild(adj);
204         TiXmlText * adjVal = new TiXmlText(wxString::Format("%d",(int)numAdj).c_str());
205         adj->LinkEndChild(adjVal);
206     }
207 
208     return fileE;
209 }
210 
211 TiXmlElement *
CmdExportLocus(const gcLocus & locusRef) const212 GCDataStore::CmdExportLocus(const gcLocus& locusRef) const
213 {
214     TiXmlElement * locE = new TiXmlElement(cnvstr::TAG_SEGMENT.c_str());
215     TiXmlText * name = new TiXmlText(locusRef.GetName().c_str());
216     locE->LinkEndChild(name);
217     return locE;
218 }
219 
220 TiXmlElement *
CmdExportPhenotype(const gcPhenotype & pheno) const221 GCDataStore::CmdExportPhenotype(const gcPhenotype & pheno) const
222 {
223     TiXmlElement * phenoE = new TiXmlElement(cnvstr::TAG_PHENOTYPE.c_str());
224 
225     // name
226     TiXmlElement * name = new TiXmlElement(cnvstr::TAG_NAME.c_str());
227     phenoE->LinkEndChild(name);
228     TiXmlText * nameText = new TiXmlText(pheno.GetName());
229     name->LinkEndChild(nameText);
230 
231     // geno reso
232     phenoE->LinkEndChild(CmdExportGenoReso(pheno));
233     return phenoE;
234 }
235 
236 TiXmlElement *
CmdExportGenoReso(const gcPhenotype & pheno) const237 GCDataStore::CmdExportGenoReso(const gcPhenotype & pheno) const
238 {
239     // geno reso
240     TiXmlElement * genoE = new TiXmlElement(cnvstr::TAG_GENO_RESOLUTIONS.c_str());
241 
242     assert(pheno.HasTraitId());
243     const gcTraitInfo & trait = GetStructures().GetTrait(pheno.GetTraitId());
244     TiXmlElement * tName = new TiXmlElement(cnvstr::TAG_TRAIT_NAME.c_str());
245     genoE->LinkEndChild(tName);
246     TiXmlText * tNameText = new TiXmlText(trait.GetName());
247     tName->LinkEndChild(tNameText);
248 
249     const std::vector<gcHapProbability> & hapProbs = pheno.GetHapProbabilities();
250     assert(!hapProbs.empty());
251     for(size_t i=0; i < hapProbs.size(); i++)
252     {
253         const gcHapProbability & hprob = hapProbs[i];
254         assert(hprob.HasPenetrance());
255 
256         TiXmlElement * hapE = new TiXmlElement(cnvstr::TAG_HAPLOTYPES.c_str());
257         genoE->LinkEndChild(hapE);
258 
259         TiXmlElement * pen = new TiXmlElement(cnvstr::TAG_PENETRANCE.c_str());
260         hapE->LinkEndChild(pen);
261         TiXmlText * penText = new TiXmlText(wxString::Format("%f",hprob.GetPenetrance()));
262         pen->LinkEndChild(penText);
263 
264         wxString alleleString = " ";
265         const gcIdVec & alleleIds = hprob.GetAlleleIds();
266         for(gcIdVec::const_iterator iter = alleleIds.begin(); iter != alleleIds.end(); iter++)
267         {
268             alleleString += GetStructures().GetAllele(*iter).GetName();
269             alleleString += " ";
270         }
271         TiXmlElement * alleles = new TiXmlElement(cnvstr::TAG_ALLELES.c_str());
272         hapE->LinkEndChild(alleles);
273         TiXmlText * allelesText = new TiXmlText(alleleString);
274         alleles->LinkEndChild(allelesText);
275     }
276 
277     return genoE;
278 }
279 
280 TiXmlElement *
CmdExportPop(const gcPopulation & popRef) const281 GCDataStore::CmdExportPop(const gcPopulation & popRef) const
282 {
283     TiXmlElement * popE = new TiXmlElement(cnvstr::TAG_POPULATION.c_str());
284     TiXmlText * name = new TiXmlText(popRef.GetName().c_str());
285     popE->LinkEndChild(name);
286     return popE;
287 }
288 
289 TiXmlElement *
CmdExportTrait(const gcTraitInfo & traitRef) const290 GCDataStore::CmdExportTrait(const gcTraitInfo & traitRef) const
291 {
292     TiXmlElement * traitE = new TiXmlElement(cnvstr::TAG_TRAIT_INFO.c_str());
293 
294     // one name
295     TiXmlElement * traitName = new TiXmlElement(cnvstr::TAG_NAME.c_str());
296     traitE->LinkEndChild(traitName);
297     TiXmlText * nameText = new TiXmlText(traitRef.GetName().c_str());
298     traitName->LinkEndChild(nameText);
299 
300     // many alleles
301     const gcIdSet & alleleIds = traitRef.GetAlleleIds();
302     for(gcIdSet::const_iterator i = alleleIds.begin(); i != alleleIds.end(); i++)
303     {
304         const gcTraitAllele & alleleRef = GetStructures().GetAllele(*i);
305         TiXmlElement * alleleElem = new TiXmlElement(cnvstr::TAG_ALLELE.c_str());
306         traitE->LinkEndChild(alleleElem);
307         TiXmlText * alleleText = new TiXmlText(alleleRef.GetName().c_str());
308         alleleElem->LinkEndChild(alleleText);
309     }
310 
311     return traitE;
312 }
313 
314 TiXmlElement *
CmdExportRegion(const gcRegion & regRef) const315 GCDataStore::CmdExportRegion(const gcRegion & regRef) const
316 {
317     TiXmlElement * regE = new TiXmlElement(cnvstr::TAG_REGION.c_str());
318 
319     // name
320     TiXmlElement * name = new TiXmlElement(cnvstr::TAG_NAME.c_str());
321     TiXmlText * rname = new TiXmlText(regRef.GetName().c_str());
322     name->LinkEndChild(rname);
323     regE->LinkEndChild(name);
324 
325     // effective pop size
326     if(regRef.HasEffectivePopulationSize())
327     {
328         TiXmlElement * effPop = new TiXmlElement(cnvstr::TAG_EFFECTIVE_POPSIZE.c_str());
329         TiXmlText * esize = new TiXmlText(wxString::Format("%f",regRef.GetEffectivePopulationSize()).c_str());
330         effPop->LinkEndChild(esize);
331         regE->LinkEndChild(effPop);
332     }
333 
334     // spacing -- EWFIX -- may drop down to next level
335     TiXmlElement * spacing = new TiXmlElement(cnvstr::TAG_SEGMENTS.c_str());
336     regE->LinkEndChild(spacing);
337     gcIdVec segIds = GetStructures().GetLocusIdsForRegionByMapPosition(regRef.GetId());
338     for(gcIdVec::iterator i=segIds.begin(); i != segIds.end(); i++)
339     {
340         size_t locusId = *i;
341         const gcLocus & locusRef = GetStructures().GetLocus(locusId);
342         spacing->LinkEndChild(CmdExportSegment(locusRef));
343     }
344 
345     // trait location
346     const GCTraitInfoSet & traits = regRef.GetTraitInfoSet();
347     for(GCTraitInfoSet::const_iterator i=traits.begin(); i != traits.end(); i++)
348     {
349         const gcTraitInfo & traitInfo = GetStructures().GetTrait(*i);
350         TiXmlElement * traitElem = new TiXmlElement(cnvstr::TAG_TRAIT_LOCATION.c_str());
351         regE->LinkEndChild(traitElem);
352         TiXmlElement * traitName = new TiXmlElement(cnvstr::TAG_TRAIT_NAME.c_str());
353         traitElem->LinkEndChild(traitName);
354         TiXmlText * traitNameText = new TiXmlText(traitInfo.GetName().c_str());
355         traitName->LinkEndChild(traitNameText);
356     }
357 
358     return regE;
359 }
360 
361 TiXmlElement *
CmdExportSegment(const gcLocus & locusRef) const362 GCDataStore::CmdExportSegment(const gcLocus& locusRef) const
363 {
364     TiXmlElement * locusE = new TiXmlElement(cnvstr::TAG_SEGMENT.c_str());
365 
366     locusE->SetAttribute(cnvstr::ATTR_DATATYPE.c_str(),locusRef.GetDataTypeString().c_str());
367     if(locusRef.HasLinkedUserValue())
368     {
369         locusE->SetAttribute(cnvstr::ATTR_PROXIMITY.c_str(),locusRef.GetLinkedUserValueString().c_str());
370     }
371 
372     TiXmlElement * nameE = new TiXmlElement(cnvstr::TAG_NAME.c_str());
373     TiXmlText * nameText = new TiXmlText(locusRef.GetName().c_str());
374     nameE->LinkEndChild(nameText);
375     locusE->LinkEndChild(nameE);
376 
377     if(locusRef.HasNumMarkers())
378     {
379         TiXmlElement * markersE = new TiXmlElement(cnvstr::TAG_MARKERS.c_str());
380         TiXmlText * markersText = new TiXmlText(locusRef.GetNumMarkersString().c_str());
381         markersE->LinkEndChild(markersText);
382         locusE->LinkEndChild(markersE);
383     }
384 
385     if(locusRef.HasOffset())
386     {
387         TiXmlElement * offsetE = new TiXmlElement(cnvstr::TAG_FIRST_POSITION_SCANNED.c_str());
388         TiXmlText * offsetText = new TiXmlText(locusRef.GetOffsetString().c_str());
389         offsetE->LinkEndChild(offsetText);
390         locusE->LinkEndChild(offsetE);
391     }
392 
393     if(locusRef.HasMapPosition())
394     {
395         TiXmlElement * mapE = new TiXmlElement(cnvstr::TAG_MAP_POSITION.c_str());
396         TiXmlText * mapText = new TiXmlText(locusRef.GetMapPositionString().c_str());
397         mapE->LinkEndChild(mapText);
398         locusE->LinkEndChild(mapE);
399     }
400 
401     if(locusRef.HasTotalLength())
402     {
403         TiXmlElement * lengthE = new TiXmlElement(cnvstr::TAG_SCANNED_LENGTH.c_str());
404         TiXmlText * lengthText = new TiXmlText(locusRef.GetTotalLengthString().c_str());
405         lengthE->LinkEndChild(lengthText);
406         locusE->LinkEndChild(lengthE);
407     }
408 
409     if(locusRef.HasLocations())
410     {
411         TiXmlElement * locationsE = new TiXmlElement(cnvstr::TAG_SCANNED_DATA_POSITIONS.c_str());
412         TiXmlText * locationsText = new TiXmlText(locusRef.GetLocationsAsString().c_str());
413         locationsE->LinkEndChild(locationsText);
414         locusE->LinkEndChild(locationsE);
415     }
416 
417     if(locusRef.HasUnphasedMarkers())
418     {
419         TiXmlElement * unphasedE = new TiXmlElement(cnvstr::TAG_UNRESOLVED_MARKERS.c_str());
420         TiXmlText * unphasedText = new TiXmlText(locusRef.GetUnphasedMarkersAsString().c_str());
421         unphasedE->LinkEndChild(unphasedText);
422         locusE->LinkEndChild(unphasedE);
423     }
424 
425     return locusE;
426 }
427 
428 TiXmlDocument *
ExportBatch() const429 GCDataStore::ExportBatch() const
430 {
431     TiXmlDocument * docP = new TiXmlDocument();
432     TiXmlDeclaration * decl = new TiXmlDeclaration( "1.0", "", "" );
433     docP->LinkEndChild( decl );
434 
435     TiXmlComment * comment = new TiXmlComment();
436     wxDateTime now = wxDateTime::Now();
437     comment->SetValue(wxString::Format(gcstr::batchOutComment,now.Format().c_str()).c_str());
438     docP->LinkEndChild(comment);
439 
440     TiXmlElement * top = new TiXmlElement( cnvstr::TAG_CONVERTER_CMD.c_str() );
441     docP->LinkEndChild( top );
442     top->SetAttribute(cnvstr::ATTR_VERSION.c_str(),VERSION);
443 
444     constObjVector traits = GetStructures().GetConstTraits();
445     if(!traits.empty())
446     {
447         TiXmlElement * traitsElem = new TiXmlElement( cnvstr::TAG_TRAITS.c_str() );
448         top->LinkEndChild( traitsElem );
449 
450         for(constObjVector::const_iterator iter = traits.begin(); iter != traits.end(); iter++)
451         {
452             const gcTraitInfo * traitP = dynamic_cast<const gcTraitInfo*>(*iter);
453             assert (traitP != NULL);
454             traitsElem->LinkEndChild(CmdExportTrait(*traitP));
455         }
456 
457         const gcPhenoMap & phenos = GetStructures().GetPhenotypeMap();
458         for(gcPhenoMap::const_iterator i = phenos.begin(); i != phenos.end(); i++)
459         {
460             const gcPhenotype & phenoRef = (*i).second;
461             if(phenoRef.HasExplicitName())
462             {
463                 traitsElem->LinkEndChild(CmdExportPhenotype(phenoRef));
464             }
465         }
466 
467     }
468 
469     constObjVector regs = GetStructures().GetConstDisplayableRegions();
470     if(!regs.empty())
471     {
472         TiXmlElement * regsElem = new TiXmlElement( cnvstr::TAG_REGIONS.c_str() );
473         top->LinkEndChild( regsElem );
474         for(constObjVector::const_iterator iter = regs.begin(); iter != regs.end(); iter++)
475         {
476             const gcRegion * regP = dynamic_cast<const gcRegion*>(*iter);
477             assert (regP != NULL);
478             regsElem->LinkEndChild(CmdExportRegion(*regP));
479         }
480     }
481 
482     constObjVector pops = GetStructures().GetConstDisplayablePops();
483     if(!pops.empty())
484     {
485         TiXmlElement * popsElem = new TiXmlElement( cnvstr::TAG_POPULATIONS.c_str() );
486         top->LinkEndChild( popsElem );
487         for(constObjVector::const_iterator iter = pops.begin(); iter != pops.end(); iter++)
488         {
489             const gcPopulation * popP = dynamic_cast<const gcPopulation*>(*iter);
490             assert (popP != NULL);
491             popsElem->LinkEndChild(CmdExportPop(*popP));
492         }
493     }
494 
495     TiXmlElement * individualsElem = CmdExportIndividuals();
496     if(individualsElem != NULL)
497     {
498         top->LinkEndChild( individualsElem );
499     }
500 
501     const dataFileSet & files = GetDataFiles();
502     if(!files.empty())
503     {
504         TiXmlElement * infElem = new TiXmlElement( cnvstr::TAG_INFILES.c_str() );
505         top->LinkEndChild( infElem );
506         for(dataFileSet::const_iterator iter= files.begin(); iter != files.end(); iter++)
507         {
508             const GCFile & fileRef = *(*iter);
509             infElem->LinkEndChild(CmdExportInfile(fileRef));
510         }
511     }
512 
513     if(!(m_outfileName.IsEmpty()))
514     {
515         TiXmlElement * outf = new TiXmlElement( cnvstr::TAG_OUTFILE.c_str() );
516         top->LinkEndChild( outf );
517         TiXmlText * outName = new TiXmlText(m_outfileName.c_str());
518         outf->LinkEndChild(outName);
519     }
520 
521     if(!(m_commentString.IsEmpty()))
522     {
523         TiXmlElement * commentElem = new TiXmlElement( cnvstr::TAG_ADDCOMMENT.c_str() );
524         top->LinkEndChild( commentElem );
525         TiXmlText * commentText = new TiXmlText(m_commentString.c_str());
526         commentElem->LinkEndChild(commentText);
527     }
528 
529     return docP;
530 
531 }
532 
533 void
WriteBatchFile(TiXmlDocument * docP,wxString fileName)534 GCDataStore::WriteBatchFile(TiXmlDocument * docP, wxString fileName)
535 {
536     docP->SaveFile( fileName.c_str());
537 }
538 
539 //____________________________________________________________________________________
540