1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Data structures for storing genomic variants and routines to work on them.
35 //
36 // Variants are split into SNPs, small indels, and large structural variants.
37 // There is a data structure for each class of variant.  Doing such a
38 // separation allows for almost simple translation between the reference
39 // coordinate system and the one for the genome including all variants.
40 //
41 // There are routines provided for
42 // ==========================================================================
43 
44 // TODO(holtgrew): Could put the coordinate system into types. This would add static checking on coordinate system.
45 
46 #ifndef APPS_MASON2_GENOMIC_VARIANTS_H_
47 #define APPS_MASON2_GENOMIC_VARIANTS_H_
48 
49 #include <seqan/align.h>
50 #include <seqan/misc/interval_tree.h>
51 
52 #include "methylation_levels.h"
53 
54 // ============================================================================
55 // Forwards
56 // ============================================================================
57 
58 typedef seqan::JournalEntries<seqan::JournalEntry<unsigned, int>, seqan::SortedArray> TJournalEntries;
59 
60 // ============================================================================
61 // Tags, Classes, Enums
62 // ============================================================================
63 
64 // --------------------------------------------------------------------------
65 // Class SnpRecord
66 // --------------------------------------------------------------------------
67 
68 // Represents a SNP in one haplotype.
69 //
70 // All coordinates are given as source coordinates.
71 
72 struct SnpRecord
73 {
74     // Reference id and position on the reference.
75     int rId;
76     int pos;
77 
78     // The haplotype that this variation belongs to.
79     int haplotype;
80 
81     // The target nucleotide.
82     seqan::Dna5 to;
83 
SnpRecordSnpRecord84     SnpRecord() : rId(-1), pos(-1), haplotype(-1), to('\0')
85     {}
86 
SnpRecordSnpRecord87     SnpRecord(int haplotype, int rId, int pos, char to) :
88             rId(rId), pos(pos), haplotype(haplotype), to(to)
89     {}
90 
91     bool operator<(SnpRecord const & other) const
92     {
93         if (rId < other.rId || (rId == other.rId && pos < other.pos) ||
94             (rId == other.rId && pos == other.pos && haplotype < other.haplotype))
95             return true;
96         return false;
97     }
98 
getPosSnpRecord99     std::pair<int, int> getPos() const
100     {
101         return std::make_pair(rId, pos);
102     }
103 };
104 
105 std::ostream & operator<<(std::ostream & out, SnpRecord const & record);
106 
107 // --------------------------------------------------------------------------
108 // Class SmallIndelRecord
109 // --------------------------------------------------------------------------
110 
111 // Represents a small indel.
112 //
113 // All coordinates are given as source coordinates.
114 
115 struct SmallIndelRecord
116 {
117     // Reference id and position on the reference.
118     int rId;
119     int pos;
120 
121     // The haplotype that this variation belongs to.
122     int haplotype;
123 
124     // The size of the indel, negative numbers for deletions, positive numbers for insertions.
125     int size;
126 
127     // The inserted sequence if any.
128     seqan::CharString seq;
129 
SmallIndelRecordSmallIndelRecord130     SmallIndelRecord() : rId(-1), pos(-1), haplotype(-1), size(0)
131     {}
132 
SmallIndelRecordSmallIndelRecord133     SmallIndelRecord(int haplotype, int rId, int pos, int size, seqan::CharString const & seq) :
134             rId(rId), pos(pos), haplotype(haplotype), size(size), seq(seq)
135     {}
136 
137     bool operator<(SmallIndelRecord const & other) const
138     {
139         return getPos() < other.getPos();
140     }
141 
getPosSmallIndelRecord142     std::pair<int, int> getPos() const
143     {
144         return std::make_pair(rId, pos);
145     }
146 };
147 
148 std::ostream & operator<<(std::ostream & out, SmallIndelRecord const & record);
149 
150 // --------------------------------------------------------------------------
151 // Class SmallVarInfo
152 // --------------------------------------------------------------------------
153 
154 // Information about small variants.
155 
156 struct SmallVarInfo
157 {
158     enum Kind { SNP, INS, DEL };
159 
160     Kind kind;
161     int pos;
162     int count;
163 
SmallVarInfoSmallVarInfo164     SmallVarInfo() : kind(SNP), pos(-1), count(0) {}
SmallVarInfoSmallVarInfo165     SmallVarInfo(Kind kind, int pos, int count) : kind(kind), pos(pos), count(count) {}
166 
167     bool operator<(SmallVarInfo const & other) const
168     { return (pos < other.pos || (pos == other.pos && kind < other.kind)); }
169 };
170 
171 // --------------------------------------------------------------------------
172 // Class StructuralVariantRecord
173 // --------------------------------------------------------------------------
174 
175 // Store structural variant information.
176 //
177 // All coordinates are given as source coordinates.
178 
179 struct StructuralVariantRecord
180 {
181     enum Kind
182     {
183         INVALID,
184         INDEL,
185         INVERSION,
186         TRANSLOCATION,
187         DUPLICATION
188     };
189 
190     // The kind of the SV.
191     Kind kind;
192 
193     // Reference id and position on the reference.
194     int rId;
195     int pos;
196 
197     // The haplotype that this variation belongs to.
198     int haplotype;
199 
200     // In case of indel, negative numbers give deletions, positionve numbers give insertions.  In case of inversions,
201     // the length of the inverted sequence.  In case of translocation the length of the translocated sequence, and in
202     // the case the length of the duplicated sequence.
203     int size;
204 
205     // Target reference id and position on this reference.  Currently, we only have SVs on the same chromosome.  Used in
206     // case of translocation and duplication.
207     int targetRId;
208     int targetPos;
209 
210     // The inserted sequence if any.
211     seqan::CharString seq;
212 
StructuralVariantRecordStructuralVariantRecord213     StructuralVariantRecord() :
214             kind(INVALID), rId(-1), pos(-1), haplotype(-1), size(0), targetRId(-1), targetPos(-1)
215     {}
216 
217     StructuralVariantRecord(Kind kind, int haplotype, int rId, int pos, int size,
218                             int targetRId = -1, int targetPos = -1) :
kindStructuralVariantRecord219             kind(kind), rId(rId), pos(pos), haplotype(haplotype), size(size), targetRId(targetRId),
220             targetPos(targetPos)
221     {}
222 
223     bool operator<(StructuralVariantRecord const & other) const
224     {
225         return getPos() < other.getPos();
226     }
227 
getPosStructuralVariantRecord228     std::pair<int, int> getPos() const
229     {
230         return std::make_pair(rId, pos);
231     }
232 
233     // Returns true if query is within one base of the breakend.
234     bool nearBreakend(int query) const;
235 
236     // Returns end position of SV.
237     //
238     // Return max value of int in case it is marked as invalid.
239     int endPosition() const;
240 
241     // Return true if this SV overlaps with other within one bp.
overlapsWithStructuralVariantRecord242     bool overlapsWith(StructuralVariantRecord const & other) const
243     {
244         return (other.pos <= endPosition()) && (pos <= other.endPosition());
245     }
246 };
247 
248 std::ostream & operator<<(std::ostream & out, StructuralVariantRecord const & record);
249 
250 // --------------------------------------------------------------------------
251 // Class Variants
252 // --------------------------------------------------------------------------
253 
254 // Contains the simulated variants.
255 
256 struct Variants
257 {
258     // SNP records.
259     seqan::String<SnpRecord> snps;
260 
261     // Small indel records.
262     seqan::String<SmallIndelRecord> smallIndels;
263 
264     // Structural variation record.
265     seqan::String<StructuralVariantRecord> svRecords;
266 
267     // Names of the snps, smallIndels, SVs, as in the VCF file.
268     seqan::StringSet<seqan::CharString> snpIDs;
269     seqan::StringSet<seqan::CharString> smallIndelIDs;
270     seqan::StringSet<seqan::CharString> svIDs;
271 
clearVariants272     void clear()
273     {
274         seqan::clear(snps);
275         seqan::clear(smallIndels);
276         seqan::clear(svRecords);
277 
278         seqan::clear(snpIDs);
279         seqan::clear(smallIndelIDs);
280         seqan::clear(svIDs);
281     }
282 
283     enum IndexKind
284     {
285         SNP,
286         SMALL_INDEL,
287         SV
288     };
289 
290     // Return the name of a variant from its index.
getVariantNameVariants291     seqan::CharString getVariantName(int idx) const
292     {
293         std::pair<IndexKind, int> res = resolveIdx(idx);
294         if (res.first == SNP)
295         {
296             if (res.second >= (int)length(snpIDs))
297                 return ".";
298             return snpIDs[res.second];
299         }
300         else if (res.first == SMALL_INDEL)
301         {
302             if (res.second >= (int)length(smallIndelIDs))
303                 return ".";
304             return smallIndelIDs[res.second];
305         }
306         else
307         {
308             if (res.second >= (int)length(svIDs))
309                 return ".";
310             return svIDs[res.second];
311         }
312     }
313 
314     // Translate number of variant with type to ids.
posToIdxVariants315     int posToIdx(IndexKind kind, int pos) const
316     {
317         switch (kind)
318         {
319             case SNP:
320                 return pos;
321             case SMALL_INDEL:
322                 return pos + length(snps);
323             case SV:
324                 return pos + length(snps) + length(smallIndels);
325             default:
326                 SEQAN_FAIL("Cannot reach here.");
327         }
328 
329         SEQAN_FAIL("Invalid kind!");
330         return -1;
331     }
332 
333     // Each variant has a numeric index.  Indices go up to (length(snps) + length(smallIndels) + length(svRecords)) - 1
334     // and are "stacked".
resolveIdxVariants335     std::pair<IndexKind, int> resolveIdx(int idx) const
336     {
337         if (idx < (int)length(snps))
338             return std::make_pair(SNP, idx);
339         else if (idx < (int)(length(snps) + (int)length(smallIndels)))
340             return std::make_pair(SMALL_INDEL, (int)(idx - length(snps)));
341         else if (idx < (int)(length(snps) + (int)(length(smallIndels) + length(svRecords))))
342             return std::make_pair(SV, (int)(idx - length(snps) - length(smallIndels)));
343         else
344             SEQAN_FAIL("Invalid idx!");
345         return std::make_pair(SNP, -1);
346     }
347 };
348 
349 // --------------------------------------------------------------------------
350 // Class GenomicInterval
351 // --------------------------------------------------------------------------
352 
353 // We annotate intervals (in an interval tree) from the genome with structural variants with intervals in the sequence
354 // with small variants.  These intervals are stored as GenomicInterval objects.
355 
356 struct GenomicInterval
357 {
358     // The kind of the interval.  Can be inserted, inverted, duplicated, or anything else (normal = translocation,
359     // untouched.
360     enum Kind
361     {
362         NORMAL,
363         INSERTED,
364         INVERTED,
365         DUPLICATED
366     };
367 
368     // Begin and end position on sequence with SVs.
369     int svBeginPos;
370     int svEndPos;
371 
372     // Begin and end position on sequence without SVs.  Set to [-1, -1) in case of insertion.
373     int smallVarBeginPos;
374     int smallVarEndPos;
375 
376     // The strand of the interval '+'/'-'.
377     char strand;
378 
379     // The kind of the interval.
380     Kind kind;
381 
382     explicit
383     GenomicInterval(int svBeginPos = -1, int svEndPos = -1,
384                     int smallVarBeginPos = -1, int smallVarEndPos = -1,
385                     char strand = '.', Kind kind = NORMAL) :
svBeginPosGenomicInterval386             svBeginPos(svBeginPos), svEndPos(svEndPos), smallVarBeginPos(smallVarBeginPos),
387             smallVarEndPos(smallVarEndPos), strand(strand), kind(kind)
388     {}
389 
390     bool operator==(GenomicInterval const & other) const
391     {
392         return (svBeginPos == other.svBeginPos) &&
393                 (svEndPos == other.svEndPos) &&
394                 (smallVarBeginPos == other.smallVarBeginPos) &&
395                 (smallVarEndPos == other.smallVarEndPos) &&
396                 (strand == other.strand);
397     }
398 
399     bool operator!=(GenomicInterval const & other) const
400     {
401         return !(*this == other);
402     }
403 };
404 
cmpIntervalSmallVarPos(GenomicInterval const & lhs,GenomicInterval const & rhs)405 inline bool cmpIntervalSmallVarPos(GenomicInterval const & lhs,
406                                    GenomicInterval const & rhs)
407 {
408     return std::make_pair(lhs.smallVarBeginPos, lhs.smallVarEndPos) < std::make_pair(rhs.smallVarBeginPos, rhs.smallVarEndPos);
409 }
410 
411 // --------------------------------------------------------------------------
412 // Class PositionMap
413 // --------------------------------------------------------------------------
414 
415 // Used for translating between the sequence with SVs, small variants, and original sequence.
416 
417 class PositionMap
418 {
419 public:
420     typedef int TValue;
421     typedef GenomicInterval TCargo;
422     typedef seqan::IntervalAndCargo<TValue, TCargo> TInterval;
423     typedef seqan::IntervalTree<TValue, TCargo> TIntervalTree;
424 
425     typedef seqan::String<seqan::GapAnchor<int> > TGapAnchors;
426     typedef seqan::Gaps<seqan::Nothing, seqan::AnchorGaps<TGapAnchors> > TGaps;
427 
428     // TODO(holtgrew): We need a function *in this class* that builds the large variants data strutures for better encapsulation!
429 
430     // Gap anchors for gaps for translating between original and small variant coordinate system.
431     TGapAnchors refGapAnchors, smallVarGapAnchors;
432     // The mapping from the genome with large variants to the one with small variants.
433     TIntervalTree svIntervalTree;
434     // The mapping from the genome with small variants to the one with large variants.
435     TIntervalTree svIntervalTreeSTL;  // small-to-large
436     // The breakpoints (as [(point, idx)] on the sequence with variants.
437     std::set<std::pair<int, int> > svBreakpoints;
438 
439     // Returns true if the interval on the sequence with structural variants overlaps with a breakpoint.
440     bool overlapsWithBreakpoint(int svBeginPos, int svEndPos) const;
441 
442     // Returns the GenomicInterval on the sequence with small variants for the given position on the sequence with SVs.
443     GenomicInterval getGenomicInterval(int svPos) const;
444 
445     // Returns the GenomicInterval on the sequence using a position on the small var reference.
446     GenomicInterval getGenomicIntervalSmallVarPos(int smallVarPos) const;
447 
448     // Translates an interval on the sequence with small variants to an interval on the original sequence.  The
449     // translation is done in such a way that when the begin position is in an insertion, it is projected to the right
450     // of the gap and if the end position is in an insertion, it is projected to the left of the gap.
451     std::pair<int, int> toOriginalInterval(int smallVarBeginPos, int smallVarEndPos) const;
452 
453     // Translates an interval on the sequence with SVs to an interval on the sequence with small variants.
454     //
455     // Returns (-1, -1) if the interval lies in an insertion.
456     //
457     // Returns (a, b), a > b if on the reverse strand
458     //
459     // The interval must not overlap with a breakpoint.
460     std::pair<int, int> toSmallVarInterval(int svBeginPos, int svEndPos) const;
461 
462     // Translate the interval on the original sequence into coordinates with small variants.
463     std::pair<int, int> originalToSmallVarInterval(int beginPos, int endPos) const;
464 
465     // Translate the interval on the reference with small variants to the one with large variants.
466     std::pair<int, int> smallVarToLargeVarInterval(int beginPos, int endPos) const;
467 
468     // Reset the PositionMap with the length of the original sequence.
469     void reinit(TJournalEntries const & journal);
470 };
471 
472 // --------------------------------------------------------------------------
473 // Class VariantMaterializer
474 // --------------------------------------------------------------------------
475 
476 // Materialize variants stored in a Variants object.
477 //
478 // Note that the class assumes that all variants come from the same contig and haplotype.
479 
480 // TODO(holtgrew): Rename to ContigMaterializer.
481 
482 class VariantMaterializer
483 {
484 public:
485     // The random number generator to use for methylation levels.
486     TRng * rng;
487     // The Variants to materialize for.
488     Variants const * variants;
489     // Options for the methylation level simulator.  Methylation simulation is required for fixing methylation levels.
490     MethylationLevelSimulatorOptions const * methSimOptions;
491 
492     // Verbosity.
493     int verbosity;
494 
VariantMaterializer()495     VariantMaterializer() : rng(), variants(), methSimOptions(), verbosity(1)
496     {}
497 
VariantMaterializer(TRng & rng,Variants const & variants)498     VariantMaterializer(TRng & rng, Variants const & variants) :
499             rng(&rng), variants(&variants), methSimOptions(), verbosity(1)
500     {}
501 
VariantMaterializer(TRng & rng,Variants const & variants,MethylationLevelSimulatorOptions const & methSimOptions)502     VariantMaterializer(TRng & rng, Variants const & variants, MethylationLevelSimulatorOptions const & methSimOptions) :
503             rng(&rng), variants(&variants), methSimOptions(&methSimOptions), verbosity(1)
504     {}
505 
506     // Materialize the variants from the haplotype with the given id in *variants to result given the reference sequence refSeq.
507     //
508     // Breakpoints is a vector of (point, id) where point is a point on the materialized contig with variants and id is
509     // an integer index into variants.  See Variants::resolveIdx() for more information.
run(seqan::Dna5String & resultSeq,PositionMap & posMap,std::vector<SmallVarInfo> & smallVars,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const & refSeq,int haplotypeId)510     int run(seqan::Dna5String & resultSeq,
511             PositionMap & posMap,
512             std::vector<SmallVarInfo> & smallVars,
513             std::vector<std::pair<int, int> > & breakpoints,
514             seqan::Dna5String const & refSeq,
515             int haplotypeId)
516     {
517         return _runImpl(&resultSeq, &posMap, 0, smallVars, breakpoints, &refSeq, 0, haplotypeId);
518     }
519 
520     // Same as the run() above, but including reference levels.
run(seqan::Dna5String & resultSeq,PositionMap & posMap,MethylationLevels & resultLvls,std::vector<SmallVarInfo> & smallVars,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const & refSeq,MethylationLevels const & refLvls,int haplotypeId)521     int run(seqan::Dna5String & resultSeq,
522             PositionMap & posMap,
523             MethylationLevels & resultLvls,
524             std::vector<SmallVarInfo> & smallVars,
525             std::vector<std::pair<int, int> > & breakpoints,
526             seqan::Dna5String const & refSeq,
527             MethylationLevels const & refLvls,
528             int haplotypeId)
529     {
530         return _runImpl(&resultSeq, &posMap, &resultLvls, smallVars, breakpoints, &refSeq, &refLvls, haplotypeId);
531     }
532 
533     // Implementation of the materialization, uses pointers instead of references for deciding whether materializing
534     // levels or not.
535     int _runImpl(seqan::Dna5String * resultSeq,
536                  PositionMap * posMap,
537                  MethylationLevels * resultLvls,
538                  std::vector<SmallVarInfo> & smallVars,
539                  std::vector<std::pair<int, int> > & breakpoints,
540                  seqan::Dna5String const * ref,
541                  MethylationLevels const * refLvls,
542                  int haplotypeId);
543 
544     // Materialization of the small variants.
545     //
546     // Levels passed as NULL if not given.
547     int _materializeSmallVariants(seqan::Dna5String & seq,
548                                   TJournalEntries & journal,
549                                   MethylationLevels * levelsSmallVariants,
550                                   std::vector<SmallVarInfo> & smallVarInfos,
551                                   seqan::Dna5String const & contig,
552                                   Variants const & variants,
553                                   MethylationLevels const * levels,
554                                   int hId);
555 
556     // Materialization of the large variants.
557     //
558     // Levels passed as NULL if not given.
559     int _materializeLargeVariants(seqan::Dna5String & seq,
560                                   MethylationLevels * levelsLargeVariants,
561                                   std::vector<SmallVarInfo> & varInfos,
562                                   std::vector<std::pair<int, int> > & breakpoints,
563                                   PositionMap & positionMap,
564                                   TJournalEntries const & journal,
565                                   seqan::Dna5String const & contig,
566                                   std::vector<SmallVarInfo> const & smallVarInfos,
567                                   Variants const & variants,
568                                   MethylationLevels const * levels,
569                                   int hId);
570 };
571 
572 // ============================================================================
573 // Metafunctions
574 // ============================================================================
575 
576 // ============================================================================
577 // Functions
578 // ============================================================================
579 
580 #endif  // #ifndef APPS_MASON2_GENOMIC_VARIANTS_H_
581