1 // ==========================================================================
2 // Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Data structures for storing genomic variants and routines to work on them.
35 //
36 // Variants are split into SNPs, small indels, and large structural variants.
37 // There is a data structure for each class of variant. Doing such a
38 // separation allows for almost simple translation between the reference
39 // coordinate system and the one for the genome including all variants.
40 //
41 // There are routines provided for
42 // ==========================================================================
43
44 // TODO(holtgrew): Could put the coordinate system into types. This would add static checking on coordinate system.
45
46 #ifndef APPS_MASON2_GENOMIC_VARIANTS_H_
47 #define APPS_MASON2_GENOMIC_VARIANTS_H_
48
49 #include <seqan/align.h>
50 #include <seqan/misc/interval_tree.h>
51
52 #include "methylation_levels.h"
53
54 // ============================================================================
55 // Forwards
56 // ============================================================================
57
58 typedef seqan::JournalEntries<seqan::JournalEntry<unsigned, int>, seqan::SortedArray> TJournalEntries;
59
60 // ============================================================================
61 // Tags, Classes, Enums
62 // ============================================================================
63
64 // --------------------------------------------------------------------------
65 // Class SnpRecord
66 // --------------------------------------------------------------------------
67
68 // Represents a SNP in one haplotype.
69 //
70 // All coordinates are given as source coordinates.
71
72 struct SnpRecord
73 {
74 // Reference id and position on the reference.
75 int rId;
76 int pos;
77
78 // The haplotype that this variation belongs to.
79 int haplotype;
80
81 // The target nucleotide.
82 seqan::Dna5 to;
83
SnpRecordSnpRecord84 SnpRecord() : rId(-1), pos(-1), haplotype(-1), to('\0')
85 {}
86
SnpRecordSnpRecord87 SnpRecord(int haplotype, int rId, int pos, char to) :
88 rId(rId), pos(pos), haplotype(haplotype), to(to)
89 {}
90
91 bool operator<(SnpRecord const & other) const
92 {
93 if (rId < other.rId || (rId == other.rId && pos < other.pos) ||
94 (rId == other.rId && pos == other.pos && haplotype < other.haplotype))
95 return true;
96 return false;
97 }
98
getPosSnpRecord99 std::pair<int, int> getPos() const
100 {
101 return std::make_pair(rId, pos);
102 }
103 };
104
105 std::ostream & operator<<(std::ostream & out, SnpRecord const & record);
106
107 // --------------------------------------------------------------------------
108 // Class SmallIndelRecord
109 // --------------------------------------------------------------------------
110
111 // Represents a small indel.
112 //
113 // All coordinates are given as source coordinates.
114
115 struct SmallIndelRecord
116 {
117 // Reference id and position on the reference.
118 int rId;
119 int pos;
120
121 // The haplotype that this variation belongs to.
122 int haplotype;
123
124 // The size of the indel, negative numbers for deletions, positive numbers for insertions.
125 int size;
126
127 // The inserted sequence if any.
128 seqan::CharString seq;
129
SmallIndelRecordSmallIndelRecord130 SmallIndelRecord() : rId(-1), pos(-1), haplotype(-1), size(0)
131 {}
132
SmallIndelRecordSmallIndelRecord133 SmallIndelRecord(int haplotype, int rId, int pos, int size, seqan::CharString const & seq) :
134 rId(rId), pos(pos), haplotype(haplotype), size(size), seq(seq)
135 {}
136
137 bool operator<(SmallIndelRecord const & other) const
138 {
139 return getPos() < other.getPos();
140 }
141
getPosSmallIndelRecord142 std::pair<int, int> getPos() const
143 {
144 return std::make_pair(rId, pos);
145 }
146 };
147
148 std::ostream & operator<<(std::ostream & out, SmallIndelRecord const & record);
149
150 // --------------------------------------------------------------------------
151 // Class SmallVarInfo
152 // --------------------------------------------------------------------------
153
154 // Information about small variants.
155
156 struct SmallVarInfo
157 {
158 enum Kind { SNP, INS, DEL };
159
160 Kind kind;
161 int pos;
162 int count;
163
SmallVarInfoSmallVarInfo164 SmallVarInfo() : kind(SNP), pos(-1), count(0) {}
SmallVarInfoSmallVarInfo165 SmallVarInfo(Kind kind, int pos, int count) : kind(kind), pos(pos), count(count) {}
166
167 bool operator<(SmallVarInfo const & other) const
168 { return (pos < other.pos || (pos == other.pos && kind < other.kind)); }
169 };
170
171 // --------------------------------------------------------------------------
172 // Class StructuralVariantRecord
173 // --------------------------------------------------------------------------
174
175 // Store structural variant information.
176 //
177 // All coordinates are given as source coordinates.
178
179 struct StructuralVariantRecord
180 {
181 enum Kind
182 {
183 INVALID,
184 INDEL,
185 INVERSION,
186 TRANSLOCATION,
187 DUPLICATION
188 };
189
190 // The kind of the SV.
191 Kind kind;
192
193 // Reference id and position on the reference.
194 int rId;
195 int pos;
196
197 // The haplotype that this variation belongs to.
198 int haplotype;
199
200 // In case of indel, negative numbers give deletions, positionve numbers give insertions. In case of inversions,
201 // the length of the inverted sequence. In case of translocation the length of the translocated sequence, and in
202 // the case the length of the duplicated sequence.
203 int size;
204
205 // Target reference id and position on this reference. Currently, we only have SVs on the same chromosome. Used in
206 // case of translocation and duplication.
207 int targetRId;
208 int targetPos;
209
210 // The inserted sequence if any.
211 seqan::CharString seq;
212
StructuralVariantRecordStructuralVariantRecord213 StructuralVariantRecord() :
214 kind(INVALID), rId(-1), pos(-1), haplotype(-1), size(0), targetRId(-1), targetPos(-1)
215 {}
216
217 StructuralVariantRecord(Kind kind, int haplotype, int rId, int pos, int size,
218 int targetRId = -1, int targetPos = -1) :
kindStructuralVariantRecord219 kind(kind), rId(rId), pos(pos), haplotype(haplotype), size(size), targetRId(targetRId),
220 targetPos(targetPos)
221 {}
222
223 bool operator<(StructuralVariantRecord const & other) const
224 {
225 return getPos() < other.getPos();
226 }
227
getPosStructuralVariantRecord228 std::pair<int, int> getPos() const
229 {
230 return std::make_pair(rId, pos);
231 }
232
233 // Returns true if query is within one base of the breakend.
234 bool nearBreakend(int query) const;
235
236 // Returns end position of SV.
237 //
238 // Return max value of int in case it is marked as invalid.
239 int endPosition() const;
240
241 // Return true if this SV overlaps with other within one bp.
overlapsWithStructuralVariantRecord242 bool overlapsWith(StructuralVariantRecord const & other) const
243 {
244 return (other.pos <= endPosition()) && (pos <= other.endPosition());
245 }
246 };
247
248 std::ostream & operator<<(std::ostream & out, StructuralVariantRecord const & record);
249
250 // --------------------------------------------------------------------------
251 // Class Variants
252 // --------------------------------------------------------------------------
253
254 // Contains the simulated variants.
255
256 struct Variants
257 {
258 // SNP records.
259 seqan::String<SnpRecord> snps;
260
261 // Small indel records.
262 seqan::String<SmallIndelRecord> smallIndels;
263
264 // Structural variation record.
265 seqan::String<StructuralVariantRecord> svRecords;
266
267 // Names of the snps, smallIndels, SVs, as in the VCF file.
268 seqan::StringSet<seqan::CharString> snpIDs;
269 seqan::StringSet<seqan::CharString> smallIndelIDs;
270 seqan::StringSet<seqan::CharString> svIDs;
271
clearVariants272 void clear()
273 {
274 seqan::clear(snps);
275 seqan::clear(smallIndels);
276 seqan::clear(svRecords);
277
278 seqan::clear(snpIDs);
279 seqan::clear(smallIndelIDs);
280 seqan::clear(svIDs);
281 }
282
283 enum IndexKind
284 {
285 SNP,
286 SMALL_INDEL,
287 SV
288 };
289
290 // Return the name of a variant from its index.
getVariantNameVariants291 seqan::CharString getVariantName(int idx) const
292 {
293 std::pair<IndexKind, int> res = resolveIdx(idx);
294 if (res.first == SNP)
295 {
296 if (res.second >= (int)length(snpIDs))
297 return ".";
298 return snpIDs[res.second];
299 }
300 else if (res.first == SMALL_INDEL)
301 {
302 if (res.second >= (int)length(smallIndelIDs))
303 return ".";
304 return smallIndelIDs[res.second];
305 }
306 else
307 {
308 if (res.second >= (int)length(svIDs))
309 return ".";
310 return svIDs[res.second];
311 }
312 }
313
314 // Translate number of variant with type to ids.
posToIdxVariants315 int posToIdx(IndexKind kind, int pos) const
316 {
317 switch (kind)
318 {
319 case SNP:
320 return pos;
321 case SMALL_INDEL:
322 return pos + length(snps);
323 case SV:
324 return pos + length(snps) + length(smallIndels);
325 default:
326 SEQAN_FAIL("Cannot reach here.");
327 }
328
329 SEQAN_FAIL("Invalid kind!");
330 return -1;
331 }
332
333 // Each variant has a numeric index. Indices go up to (length(snps) + length(smallIndels) + length(svRecords)) - 1
334 // and are "stacked".
resolveIdxVariants335 std::pair<IndexKind, int> resolveIdx(int idx) const
336 {
337 if (idx < (int)length(snps))
338 return std::make_pair(SNP, idx);
339 else if (idx < (int)(length(snps) + (int)length(smallIndels)))
340 return std::make_pair(SMALL_INDEL, (int)(idx - length(snps)));
341 else if (idx < (int)(length(snps) + (int)(length(smallIndels) + length(svRecords))))
342 return std::make_pair(SV, (int)(idx - length(snps) - length(smallIndels)));
343 else
344 SEQAN_FAIL("Invalid idx!");
345 return std::make_pair(SNP, -1);
346 }
347 };
348
349 // --------------------------------------------------------------------------
350 // Class GenomicInterval
351 // --------------------------------------------------------------------------
352
353 // We annotate intervals (in an interval tree) from the genome with structural variants with intervals in the sequence
354 // with small variants. These intervals are stored as GenomicInterval objects.
355
356 struct GenomicInterval
357 {
358 // The kind of the interval. Can be inserted, inverted, duplicated, or anything else (normal = translocation,
359 // untouched.
360 enum Kind
361 {
362 NORMAL,
363 INSERTED,
364 INVERTED,
365 DUPLICATED
366 };
367
368 // Begin and end position on sequence with SVs.
369 int svBeginPos;
370 int svEndPos;
371
372 // Begin and end position on sequence without SVs. Set to [-1, -1) in case of insertion.
373 int smallVarBeginPos;
374 int smallVarEndPos;
375
376 // The strand of the interval '+'/'-'.
377 char strand;
378
379 // The kind of the interval.
380 Kind kind;
381
382 explicit
383 GenomicInterval(int svBeginPos = -1, int svEndPos = -1,
384 int smallVarBeginPos = -1, int smallVarEndPos = -1,
385 char strand = '.', Kind kind = NORMAL) :
svBeginPosGenomicInterval386 svBeginPos(svBeginPos), svEndPos(svEndPos), smallVarBeginPos(smallVarBeginPos),
387 smallVarEndPos(smallVarEndPos), strand(strand), kind(kind)
388 {}
389
390 bool operator==(GenomicInterval const & other) const
391 {
392 return (svBeginPos == other.svBeginPos) &&
393 (svEndPos == other.svEndPos) &&
394 (smallVarBeginPos == other.smallVarBeginPos) &&
395 (smallVarEndPos == other.smallVarEndPos) &&
396 (strand == other.strand);
397 }
398
399 bool operator!=(GenomicInterval const & other) const
400 {
401 return !(*this == other);
402 }
403 };
404
cmpIntervalSmallVarPos(GenomicInterval const & lhs,GenomicInterval const & rhs)405 inline bool cmpIntervalSmallVarPos(GenomicInterval const & lhs,
406 GenomicInterval const & rhs)
407 {
408 return std::make_pair(lhs.smallVarBeginPos, lhs.smallVarEndPos) < std::make_pair(rhs.smallVarBeginPos, rhs.smallVarEndPos);
409 }
410
411 // --------------------------------------------------------------------------
412 // Class PositionMap
413 // --------------------------------------------------------------------------
414
415 // Used for translating between the sequence with SVs, small variants, and original sequence.
416
417 class PositionMap
418 {
419 public:
420 typedef int TValue;
421 typedef GenomicInterval TCargo;
422 typedef seqan::IntervalAndCargo<TValue, TCargo> TInterval;
423 typedef seqan::IntervalTree<TValue, TCargo> TIntervalTree;
424
425 typedef seqan::String<seqan::GapAnchor<int> > TGapAnchors;
426 typedef seqan::Gaps<seqan::Nothing, seqan::AnchorGaps<TGapAnchors> > TGaps;
427
428 // TODO(holtgrew): We need a function *in this class* that builds the large variants data strutures for better encapsulation!
429
430 // Gap anchors for gaps for translating between original and small variant coordinate system.
431 TGapAnchors refGapAnchors, smallVarGapAnchors;
432 // The mapping from the genome with large variants to the one with small variants.
433 TIntervalTree svIntervalTree;
434 // The mapping from the genome with small variants to the one with large variants.
435 TIntervalTree svIntervalTreeSTL; // small-to-large
436 // The breakpoints (as [(point, idx)] on the sequence with variants.
437 std::set<std::pair<int, int> > svBreakpoints;
438
439 // Returns true if the interval on the sequence with structural variants overlaps with a breakpoint.
440 bool overlapsWithBreakpoint(int svBeginPos, int svEndPos) const;
441
442 // Returns the GenomicInterval on the sequence with small variants for the given position on the sequence with SVs.
443 GenomicInterval getGenomicInterval(int svPos) const;
444
445 // Returns the GenomicInterval on the sequence using a position on the small var reference.
446 GenomicInterval getGenomicIntervalSmallVarPos(int smallVarPos) const;
447
448 // Translates an interval on the sequence with small variants to an interval on the original sequence. The
449 // translation is done in such a way that when the begin position is in an insertion, it is projected to the right
450 // of the gap and if the end position is in an insertion, it is projected to the left of the gap.
451 std::pair<int, int> toOriginalInterval(int smallVarBeginPos, int smallVarEndPos) const;
452
453 // Translates an interval on the sequence with SVs to an interval on the sequence with small variants.
454 //
455 // Returns (-1, -1) if the interval lies in an insertion.
456 //
457 // Returns (a, b), a > b if on the reverse strand
458 //
459 // The interval must not overlap with a breakpoint.
460 std::pair<int, int> toSmallVarInterval(int svBeginPos, int svEndPos) const;
461
462 // Translate the interval on the original sequence into coordinates with small variants.
463 std::pair<int, int> originalToSmallVarInterval(int beginPos, int endPos) const;
464
465 // Translate the interval on the reference with small variants to the one with large variants.
466 std::pair<int, int> smallVarToLargeVarInterval(int beginPos, int endPos) const;
467
468 // Reset the PositionMap with the length of the original sequence.
469 void reinit(TJournalEntries const & journal);
470 };
471
472 // --------------------------------------------------------------------------
473 // Class VariantMaterializer
474 // --------------------------------------------------------------------------
475
476 // Materialize variants stored in a Variants object.
477 //
478 // Note that the class assumes that all variants come from the same contig and haplotype.
479
480 // TODO(holtgrew): Rename to ContigMaterializer.
481
482 class VariantMaterializer
483 {
484 public:
485 // The random number generator to use for methylation levels.
486 TRng * rng;
487 // The Variants to materialize for.
488 Variants const * variants;
489 // Options for the methylation level simulator. Methylation simulation is required for fixing methylation levels.
490 MethylationLevelSimulatorOptions const * methSimOptions;
491
492 // Verbosity.
493 int verbosity;
494
VariantMaterializer()495 VariantMaterializer() : rng(), variants(), methSimOptions(), verbosity(1)
496 {}
497
VariantMaterializer(TRng & rng,Variants const & variants)498 VariantMaterializer(TRng & rng, Variants const & variants) :
499 rng(&rng), variants(&variants), methSimOptions(), verbosity(1)
500 {}
501
VariantMaterializer(TRng & rng,Variants const & variants,MethylationLevelSimulatorOptions const & methSimOptions)502 VariantMaterializer(TRng & rng, Variants const & variants, MethylationLevelSimulatorOptions const & methSimOptions) :
503 rng(&rng), variants(&variants), methSimOptions(&methSimOptions), verbosity(1)
504 {}
505
506 // Materialize the variants from the haplotype with the given id in *variants to result given the reference sequence refSeq.
507 //
508 // Breakpoints is a vector of (point, id) where point is a point on the materialized contig with variants and id is
509 // an integer index into variants. See Variants::resolveIdx() for more information.
run(seqan::Dna5String & resultSeq,PositionMap & posMap,std::vector<SmallVarInfo> & smallVars,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const & refSeq,int haplotypeId)510 int run(seqan::Dna5String & resultSeq,
511 PositionMap & posMap,
512 std::vector<SmallVarInfo> & smallVars,
513 std::vector<std::pair<int, int> > & breakpoints,
514 seqan::Dna5String const & refSeq,
515 int haplotypeId)
516 {
517 return _runImpl(&resultSeq, &posMap, 0, smallVars, breakpoints, &refSeq, 0, haplotypeId);
518 }
519
520 // Same as the run() above, but including reference levels.
run(seqan::Dna5String & resultSeq,PositionMap & posMap,MethylationLevels & resultLvls,std::vector<SmallVarInfo> & smallVars,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const & refSeq,MethylationLevels const & refLvls,int haplotypeId)521 int run(seqan::Dna5String & resultSeq,
522 PositionMap & posMap,
523 MethylationLevels & resultLvls,
524 std::vector<SmallVarInfo> & smallVars,
525 std::vector<std::pair<int, int> > & breakpoints,
526 seqan::Dna5String const & refSeq,
527 MethylationLevels const & refLvls,
528 int haplotypeId)
529 {
530 return _runImpl(&resultSeq, &posMap, &resultLvls, smallVars, breakpoints, &refSeq, &refLvls, haplotypeId);
531 }
532
533 // Implementation of the materialization, uses pointers instead of references for deciding whether materializing
534 // levels or not.
535 int _runImpl(seqan::Dna5String * resultSeq,
536 PositionMap * posMap,
537 MethylationLevels * resultLvls,
538 std::vector<SmallVarInfo> & smallVars,
539 std::vector<std::pair<int, int> > & breakpoints,
540 seqan::Dna5String const * ref,
541 MethylationLevels const * refLvls,
542 int haplotypeId);
543
544 // Materialization of the small variants.
545 //
546 // Levels passed as NULL if not given.
547 int _materializeSmallVariants(seqan::Dna5String & seq,
548 TJournalEntries & journal,
549 MethylationLevels * levelsSmallVariants,
550 std::vector<SmallVarInfo> & smallVarInfos,
551 seqan::Dna5String const & contig,
552 Variants const & variants,
553 MethylationLevels const * levels,
554 int hId);
555
556 // Materialization of the large variants.
557 //
558 // Levels passed as NULL if not given.
559 int _materializeLargeVariants(seqan::Dna5String & seq,
560 MethylationLevels * levelsLargeVariants,
561 std::vector<SmallVarInfo> & varInfos,
562 std::vector<std::pair<int, int> > & breakpoints,
563 PositionMap & positionMap,
564 TJournalEntries const & journal,
565 seqan::Dna5String const & contig,
566 std::vector<SmallVarInfo> const & smallVarInfos,
567 Variants const & variants,
568 MethylationLevels const * levels,
569 int hId);
570 };
571
572 // ============================================================================
573 // Metafunctions
574 // ============================================================================
575
576 // ============================================================================
577 // Functions
578 // ============================================================================
579
580 #endif // #ifndef APPS_MASON2_GENOMIC_VARIANTS_H_
581