1 // ========================================================================== 2 // Mason - A Read Simulator 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de> 33 // ========================================================================== 34 35 #ifndef APPS_MASON2_VCF_MATERIALIZATION_H_ 36 #define APPS_MASON2_VCF_MATERIALIZATION_H_ 37 38 #include <stdexcept> 39 40 #include <seqan/sequence.h> 41 #include <seqan/seq_io.h> 42 #include <seqan/vcf_io.h> 43 44 #include "genomic_variants.h" 45 #include "methylation_levels.h" 46 47 // ============================================================================ 48 // Forwards 49 // ============================================================================ 50 51 // ============================================================================ 52 // Tags, Classes, Enums 53 // ============================================================================ 54 55 // ---------------------------------------------------------------------------- 56 // Class VcfMaterializer 57 // ---------------------------------------------------------------------------- 58 59 // Allows the contig- and haplotype-wise construction of haplotypes stored in VCF files. 60 61 class VcfMaterializer 62 { 63 public: 64 // The random number generator to use for methylation simulation, if any. 65 TRng & rng; 66 // Options for the methylation simulation. 67 MethylationLevelSimulatorOptions const * methOptions; 68 69 // The PositionMap is built for each contig to map between large variants, small variants, and original coordinate 70 // system. 71 PositionMap posMap; 72 73 // ------------------------------------------------------------------------ 74 // Paths 75 // ------------------------------------------------------------------------ 76 77 // Path to reference file. 78 seqan::CharString fastaFileName; 79 // Path to VCF file. 80 seqan::CharString vcfFileName; 81 // Path to methylation FASTA file. 82 seqan::CharString methFastaFileName; 83 84 // ------------------------------------------------------------------------ 85 // State for position in reference 86 // ------------------------------------------------------------------------ 87 88 // The index of the current contig. 89 int currRID; 90 // The index of the current haplotype of the contig. 91 int nextHaplotype; 92 // The number of haplotypes (set after call to init()). 93 int numHaplotypes; 94 // The variants for the current contig. 95 Variants contigVariants; 96 // Current contig reference sequence. 97 seqan::Dna5String contigSeq; 98 // Current methylation levels. 99 MethylationLevels currentLevels; 100 101 // ------------------------------------------------------------------------ 102 // File Input 103 // ------------------------------------------------------------------------ 104 105 // The FAI Index to load the reference sequence from. 106 seqan::FaiIndex faiIndex; 107 // The FAI Index to load the methylation sequences from. 108 seqan::FaiIndex methFaiIndex; 109 // The VCF stream to load from and the VCF heade.r 110 seqan::VcfFileIn vcfFileIn; 111 seqan::VcfHeader vcfHeader; 112 // The current VCF record. rID == INVALID_REFID if invalid, used for termination. 113 seqan::VcfRecord vcfRecord; 114 VcfMaterializer(TRng & rng)115 VcfMaterializer(TRng & rng) : rng(rng), currRID(-1), nextHaplotype(0), numHaplotypes(0) 116 {} 117 118 // If you give methFastaFileName, then you also have to set methOptions. 119 // 120 // The methylation simulation assumes that there is an methylation options object. 121 VcfMaterializer(TRng & rng, 122 char const * fastaFileName, 123 char const * vcfFileName, 124 char const * methFastaFileName = "", 125 MethylationLevelSimulatorOptions const * methOptions = 0) : rng(rng)126 rng(rng), methOptions(methOptions), fastaFileName(fastaFileName), vcfFileName(vcfFileName), 127 methFastaFileName(methFastaFileName), currRID(-1), nextHaplotype(0), numHaplotypes(0) 128 {} 129 130 // Call to open all files. 131 // 132 // Throws: MasonIOException 133 void init(); 134 135 // Materialize next contig. 136 // 137 // Write sequence to seq, reference id to rID, haplotype to haplotype. Returns true if the materialization could be 138 // done and false if there are no more contigs to materialize. 139 // 140 // Call init() before calling materializeNext(). 141 // 142 // Throws: MasonIOException 143 bool materializeNext(seqan::Dna5String & seq, 144 std::vector<SmallVarInfo> & varInfos, 145 std::vector<std::pair<int, int> > & breakpoints, 146 int & rID, int & haplotype); 147 148 // Similar to the one above but loads methylation levels into levels. Can only work if methFastFileName is not 149 // empty. 150 bool materializeNext(seqan::Dna5String & seq, 151 MethylationLevels & levels, 152 std::vector<SmallVarInfo> & varInfos, 153 std::vector<std::pair<int, int> > & breakpoints, 154 int & rID, int & haplotype); 155 156 private: 157 158 bool _materializeNext(seqan::Dna5String & seq, MethylationLevels * levels, 159 std::vector<SmallVarInfo> & varInfos, 160 std::vector<std::pair<int, int> > & breakpoints, 161 int & rID, int & haplotype); 162 163 // Load variants of next contig into variants. 164 int _loadVariantsForContig(Variants & variants, int rID); 165 166 // Append VCF record to variants. 167 void _appendToVariants(Variants & variants, seqan::VcfRecord const & vcfRecord); 168 169 // Append chunk of 6 BND records to variants. 170 void _appendToVariantsBnd(Variants & variants, std::vector<seqan::VcfRecord> const & vcfRecords); 171 172 // Load the levels for the contig with the given rID. 173 void _loadLevels(int rID); 174 }; 175 176 // ============================================================================ 177 // Metafunctions 178 // ============================================================================ 179 180 // ============================================================================ 181 // Functions 182 // ============================================================================ 183 184 #endif // #ifndef APPS_MASON2_VCF_MATERIALIZATION_H_ 185