1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 
35 #ifndef APPS_MASON2_VCF_MATERIALIZATION_H_
36 #define APPS_MASON2_VCF_MATERIALIZATION_H_
37 
38 #include <stdexcept>
39 
40 #include <seqan/sequence.h>
41 #include <seqan/seq_io.h>
42 #include <seqan/vcf_io.h>
43 
44 #include "genomic_variants.h"
45 #include "methylation_levels.h"
46 
47 // ============================================================================
48 // Forwards
49 // ============================================================================
50 
51 // ============================================================================
52 // Tags, Classes, Enums
53 // ============================================================================
54 
55 // ----------------------------------------------------------------------------
56 // Class VcfMaterializer
57 // ----------------------------------------------------------------------------
58 
59 // Allows the contig- and haplotype-wise construction of haplotypes stored in VCF files.
60 
61 class VcfMaterializer
62 {
63 public:
64     // The random number generator to use for methylation simulation, if any.
65     TRng & rng;
66     // Options for the methylation simulation.
67     MethylationLevelSimulatorOptions const * methOptions;
68 
69     // The PositionMap is built for each contig to map between large variants, small variants, and original coordinate
70     // system.
71     PositionMap posMap;
72 
73     // ------------------------------------------------------------------------
74     // Paths
75     // ------------------------------------------------------------------------
76 
77     // Path to reference file.
78     seqan::CharString fastaFileName;
79     // Path to VCF file.
80     seqan::CharString vcfFileName;
81     // Path to methylation FASTA file.
82     seqan::CharString methFastaFileName;
83 
84     // ------------------------------------------------------------------------
85     // State for position in reference
86     // ------------------------------------------------------------------------
87 
88     // The index of the current contig.
89     int currRID;
90     // The index of the current haplotype of the contig.
91     int nextHaplotype;
92     // The number of haplotypes (set after call to init()).
93     int numHaplotypes;
94     // The variants for the current contig.
95     Variants contigVariants;
96     // Current contig reference sequence.
97     seqan::Dna5String contigSeq;
98     // Current methylation levels.
99     MethylationLevels currentLevels;
100 
101     // ------------------------------------------------------------------------
102     // File Input
103     // ------------------------------------------------------------------------
104 
105     // The FAI Index to load the reference sequence from.
106     seqan::FaiIndex faiIndex;
107     // The FAI Index to load the methylation sequences from.
108     seqan::FaiIndex methFaiIndex;
109     // The VCF stream to load from and the VCF heade.r
110     seqan::VcfFileIn vcfFileIn;
111     seqan::VcfHeader vcfHeader;
112     // The current VCF record.  rID == INVALID_REFID if invalid, used for termination.
113     seqan::VcfRecord vcfRecord;
114 
VcfMaterializer(TRng & rng)115     VcfMaterializer(TRng & rng) : rng(rng), currRID(-1), nextHaplotype(0), numHaplotypes(0)
116     {}
117 
118     // If you give methFastaFileName, then you also have to set methOptions.
119     //
120     // The methylation simulation assumes that there is an methylation options object.
121     VcfMaterializer(TRng & rng,
122                     char const * fastaFileName,
123                     char const * vcfFileName,
124                     char const * methFastaFileName = "",
125                     MethylationLevelSimulatorOptions const * methOptions = 0) :
rng(rng)126             rng(rng), methOptions(methOptions), fastaFileName(fastaFileName), vcfFileName(vcfFileName),
127             methFastaFileName(methFastaFileName), currRID(-1), nextHaplotype(0), numHaplotypes(0)
128     {}
129 
130     // Call to open all files.
131     //
132     // Throws: MasonIOException
133     void init();
134 
135     // Materialize next contig.
136     //
137     // Write sequence to seq, reference id to rID, haplotype to haplotype.  Returns true if the materialization could be
138     // done and false if there are no more contigs to materialize.
139     //
140     // Call init() before calling materializeNext().
141     //
142     // Throws: MasonIOException
143     bool materializeNext(seqan::Dna5String & seq,
144                          std::vector<SmallVarInfo> & varInfos,
145                          std::vector<std::pair<int, int> > & breakpoints,
146                          int & rID, int & haplotype);
147 
148     // Similar to the one above but loads methylation levels into levels.  Can only work if methFastFileName is not
149     // empty.
150     bool materializeNext(seqan::Dna5String & seq,
151                          MethylationLevels & levels,
152                          std::vector<SmallVarInfo> & varInfos,
153                          std::vector<std::pair<int, int> > & breakpoints,
154                          int & rID, int & haplotype);
155 
156 private:
157 
158     bool _materializeNext(seqan::Dna5String & seq, MethylationLevels * levels,
159                           std::vector<SmallVarInfo> & varInfos,
160                           std::vector<std::pair<int, int> > & breakpoints,
161                           int & rID, int & haplotype);
162 
163     // Load variants of next contig into variants.
164     int _loadVariantsForContig(Variants & variants, int rID);
165 
166     // Append VCF record to variants.
167     void _appendToVariants(Variants & variants, seqan::VcfRecord const & vcfRecord);
168 
169     // Append chunk of 6 BND records to variants.
170     void _appendToVariantsBnd(Variants & variants, std::vector<seqan::VcfRecord> const & vcfRecords);
171 
172     // Load the levels for the contig with the given rID.
173     void _loadLevels(int rID);
174 };
175 
176 // ============================================================================
177 // Metafunctions
178 // ============================================================================
179 
180 // ============================================================================
181 // Functions
182 // ============================================================================
183 
184 #endif  // #ifndef APPS_MASON2_VCF_MATERIALIZATION_H_
185