1 // ==========================================================================
2 // Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34
35 #include "genomic_variants.h"
36
operator <<(std::ostream & out,SnpRecord const & record)37 std::ostream & operator<<(std::ostream & out, SnpRecord const & record)
38 {
39 out << "SnpRecord(" << record.haplotype << ", " << record.rId << ", " << record.pos
40 << ", " << record.to << ")";
41 return out;
42 }
43
operator <<(std::ostream & out,SmallIndelRecord const & record)44 std::ostream & operator<<(std::ostream & out, SmallIndelRecord const & record)
45 {
46 out << "SnpRecord(" << record.haplotype << ", " << record.rId << ", " << record.pos
47 << ", " << record.size << ", " << record.seq << ")";
48 return out;
49 }
50
endPosition() const51 int StructuralVariantRecord::endPosition() const
52 {
53 if (pos == -1)
54 return std::numeric_limits<int>::max();
55
56 switch (kind)
57 {
58 case INDEL:
59 if (size > 0)
60 return pos;
61 else
62 return pos + size;
63 case INVERSION:
64 return pos + size;
65 case TRANSLOCATION:
66 return targetPos;
67 case DUPLICATION:
68 return targetPos;
69 default:
70 return -1;
71 }
72 }
73
nearBreakend(int query) const74 bool StructuralVariantRecord::nearBreakend(int query) const
75 {
76 if (pos == -1)
77 return false; // invalid/sentinel has no breakends
78
79 switch (kind)
80 {
81 case INDEL:
82 if (size > 0)
83 return (query == pos || query == pos + 1);
84 else
85 return (query == pos || query == pos + 1 ||
86 query == pos - size || query == pos - size + 1);
87 case INVERSION:
88 return (query == pos || query == pos + 1 ||
89 query == pos + size || query == pos + size + 1);
90 case TRANSLOCATION:
91 return (query == pos || query == pos + 1 ||
92 query == targetPos || query == targetPos + 1);
93 case DUPLICATION:
94 return (query == pos || query == pos + 1 ||
95 query == targetPos || query == targetPos + 1);
96 default:
97 return false;
98 }
99 }
100
operator <<(std::ostream & out,StructuralVariantRecord const & record)101 std::ostream & operator<<(std::ostream & out, StructuralVariantRecord const & record)
102 {
103 char const * kind;
104 switch (record.kind)
105 {
106 case StructuralVariantRecord::TRANSLOCATION:
107 kind = "TRANSLOCATION";
108 break;
109 case StructuralVariantRecord::INDEL:
110 kind = "INDEL";
111 break;
112 case StructuralVariantRecord::INVERSION:
113 kind = "INVERSION";
114 break;
115 case StructuralVariantRecord::DUPLICATION:
116 kind = "DUPLICATION";
117 break;
118 default:
119 kind = "INVALID";
120 break;
121 }
122 out << "StructuralVariantRecord(kind=" << kind << ", haplotype=" << record.haplotype
123 << ", rId=" << record.rId << ", pos=" << record.pos << ", size=" << record.size
124 << ", targetRId=" << record.targetRId << ", targetPos=" << record.targetPos
125 << ", seq=\"" << record.seq << "\")";
126 return out;
127 }
128
129 // ----------------------------------------------------------------------------
130 // Function VariantMaterializer::_runImpl()
131 // ----------------------------------------------------------------------------
132
_runImpl(seqan::Dna5String * resultSeq,PositionMap * posMap,MethylationLevels * resultLvls,std::vector<SmallVarInfo> & varInfos,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const * ref,MethylationLevels const * refLvls,int haplotypeId)133 int VariantMaterializer::_runImpl(
134 seqan::Dna5String * resultSeq,
135 PositionMap * posMap,
136 MethylationLevels * resultLvls,
137 std::vector<SmallVarInfo> & varInfos,
138 std::vector<std::pair<int, int> > & breakpoints,
139 seqan::Dna5String const * ref,
140 MethylationLevels const * refLvls,
141 int haplotypeId)
142 {
143 breakpoints.clear();
144 clear(*resultSeq);
145 if (resultLvls)
146 resultLvls->clear();
147
148 // Apply small variants. We get a sequence with the small variants and a journal of the difference to contig.
149 seqan::Dna5String seqSmallVariants;
150 TJournalEntries journal;
151 MethylationLevels levelsSmallVariants; // only used if revLevels != 0
152 MethylationLevels * smallLvlsPtr = refLvls ? &levelsSmallVariants : 0;
153 std::vector<SmallVarInfo> smallVarInfos;
154 if (_materializeSmallVariants(seqSmallVariants, journal, smallLvlsPtr, smallVarInfos, *ref, *variants,
155 refLvls, haplotypeId) != 0)
156 return 1;
157
158 // Build position map for large variant -> small variant and small variant <-> reference position mapping.
159 posMap->reinit(journal); // build mapping from small variant to reference positions
160
161 // Apply structural variants and build the interval tree of posMap
162 if (_materializeLargeVariants(*resultSeq, resultLvls, varInfos, breakpoints, *posMap, journal, seqSmallVariants,
163 smallVarInfos, *variants, smallLvlsPtr, haplotypeId) != 0)
164 return 1;
165
166 // Sort resulting variant infos.
167 std::sort(varInfos.begin(), varInfos.end());
168
169 // std::sort(breakpoints.begin(), breakpoints.end());
170 // std::cout << "SV Breakpoints\n";
171 // for (unsigned i = 0; i < breakpoints.size(); ++i)
172 // std::cout << " " << breakpoints[i] << "\n";
173
174 // Copy out SV breakpoints.
175 posMap->svBreakpoints.insert(breakpoints.begin(), breakpoints.end());
176
177 return 0;
178 }
179
180 // ----------------------------------------------------------------------------
181 // Function VariantMaterializer::_materializeSmallVariants()
182 // ----------------------------------------------------------------------------
183
_materializeSmallVariants(seqan::Dna5String & seq,TJournalEntries & journal,MethylationLevels * levelsSmallVariants,std::vector<SmallVarInfo> & smallVarInfos,seqan::Dna5String const & contig,Variants const & variants,MethylationLevels const * levels,int hId)184 int VariantMaterializer::_materializeSmallVariants(
185 seqan::Dna5String & seq,
186 TJournalEntries & journal,
187 MethylationLevels * levelsSmallVariants,
188 std::vector<SmallVarInfo> & smallVarInfos,
189 seqan::Dna5String const & contig,
190 Variants const & variants,
191 MethylationLevels const * levels,
192 int hId)
193 {
194 if (methSimOptions)
195 {
196 SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsSmallVariants != 0));
197 SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
198 }
199
200 // Clear journal and output methylation levels.
201 reinit(journal, length(contig));
202 if (levelsSmallVariants)
203 levelsSmallVariants->clear();
204 // Store variation points with a flag whether it is a SNP (true) or a breakpoint (false).
205 seqan::String<std::pair<int, bool> > varPoints;
206
207 // Fors this, we have to iterate in parallel over SNP and small indel records.
208 //
209 // Current index in snp/small indel array.
210 unsigned snpsIdx = 0;
211 unsigned smallIndelIdx = 0;
212 // Current SNP record, default to sentinel.
213 SnpRecord snpRecord;
214 snpRecord.rId = std::numeric_limits<int>::max();
215 if (snpsIdx < length(variants.snps))
216 snpRecord = variants.snps[snpsIdx++];
217 // Current small indel record, default to sentinel.
218 SmallIndelRecord smallIndelRecord;
219 smallIndelRecord.rId = std::numeric_limits<int>::max();
220 if (smallIndelIdx < length(variants.smallIndels))
221 smallIndelRecord = variants.smallIndels[smallIndelIdx++];
222 // Track last position from contig appended to seq so far.
223 int lastPos = 0;
224 if (verbosity >= 3)
225 std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
226
227 // TODO(holtgrew): Extract contig building into their own functions.
228 if (verbosity >= 2)
229 std::cerr << "building output\n";
230 while (snpRecord.rId != std::numeric_limits<int>::max() || smallIndelRecord.rId != std::numeric_limits<int>::max())
231 {
232 // TODO(holtgrew): Extract SNP and small indel handling into their own functions.
233 if (snpRecord.getPos() < smallIndelRecord.getPos()) // process SNP records
234 {
235 if (snpRecord.haplotype == hId) // Ignore all but the current contig.
236 {
237 if (verbosity >= 3)
238 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << snpRecord.pos << ") " << __LINE__ << "\n";
239 // Append interim sequence and methylation levels->
240 append(seq, infix(contig, lastPos, snpRecord.pos));
241 if (methSimOptions && methSimOptions->simulateMethylationLevels)
242 {
243 append(levelsSmallVariants->forward, infix(levels->forward, lastPos, snpRecord.pos + 1));
244 append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, snpRecord.pos + 1));
245 appendValue(varPoints, std::make_pair((int)length(seq), true)); // variation points before/after SNP
246 appendValue(varPoints, std::make_pair((int)length(seq) + 1, true));
247 }
248
249 SEQAN_ASSERT_GEQ(snpRecord.pos, lastPos);
250 if (verbosity >= 3)
251 std::cerr << "appendValue(seq, " << snpRecord.to << "')\n";
252 appendValue(seq, snpRecord.to);
253 lastPos = snpRecord.pos + 1;
254 if (verbosity >= 3)
255 std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
256
257 // Register SNP as small variant info.
258 smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::SNP, length(seq) - 1, 1));
259 }
260
261 if (snpsIdx >= length(variants.snps))
262 snpRecord.rId = std::numeric_limits<int>::max();
263 else
264 snpRecord = variants.snps[snpsIdx++];
265 }
266 else
267 {
268 if (smallIndelRecord.haplotype == hId) // Ignore all but the current contig.
269 {
270 if (smallIndelRecord.size > 0)
271 {
272 if (verbosity >= 3)
273 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") "
274 << __LINE__ << "\n";
275
276 // Simulate methylation levels for insertion.
277 MethylationLevels lvls;
278 if (methSimOptions && methSimOptions->simulateMethylationLevels)
279 {
280 MethylationLevelSimulator methSim(*rng, *methSimOptions);
281 methSim.run(lvls, smallIndelRecord.seq);
282 }
283
284 // Append interim sequence and methylation levels->
285 append(seq, infix(contig, lastPos, smallIndelRecord.pos));
286 if (methSimOptions && methSimOptions->simulateMethylationLevels)
287 {
288 append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
289 append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
290 appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point before insertion
291 }
292
293 SEQAN_ASSERT_GEQ(smallIndelRecord.pos, lastPos);
294 if (verbosity >= 3)
295 std::cerr << "append(seq, \"" << smallIndelRecord.seq << "\") " << __LINE__ << "\n";
296 // Register insertion as small variant info.
297 for (unsigned i = 0; i < length(smallIndelRecord.seq); ++i)
298 smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::INS, length(seq) + i, 1));
299 // Append novel sequence and methylation levels->
300 append(seq, smallIndelRecord.seq);
301 if (methSimOptions && methSimOptions->simulateMethylationLevels)
302 {
303 append(levelsSmallVariants->forward, lvls.forward);
304 append(levelsSmallVariants->reverse, lvls.reverse);
305 appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point after insertion
306 }
307 lastPos = smallIndelRecord.pos;
308 recordInsertion(journal, hostToVirtualPosition(journal, smallIndelRecord.pos),
309 0, smallIndelRecord.size);
310 if (verbosity >= 3)
311 std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
312 }
313 else // deletion
314 {
315 if (verbosity >= 3)
316 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") " << __LINE__ << "\n";
317 // Append interim sequence and methylation levels->
318 append(seq, infix(contig, lastPos, smallIndelRecord.pos)); // interim chars
319 if (methSimOptions && methSimOptions->simulateMethylationLevels)
320 {
321 appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point at deletion
322 append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
323 append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
324 }
325
326 lastPos = smallIndelRecord.pos - smallIndelRecord.size;
327 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
328 recordErase(journal,
329 hostToVirtualPosition(journal, smallIndelRecord.pos),
330 hostToVirtualPosition(journal, smallIndelRecord.pos - smallIndelRecord.size));
331 if (verbosity >= 3)
332 std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
333
334 // Register deletion as small variant info.
335 smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::DEL, length(seq), -smallIndelRecord.size));
336 }
337 }
338
339 if (smallIndelIdx >= length(variants.smallIndels))
340 smallIndelRecord.rId = std::numeric_limits<int>::max();
341 else
342 smallIndelRecord = variants.smallIndels[smallIndelIdx++];
343 }
344 }
345 // Insert remaining characters.
346 if (verbosity >= 3)
347 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ")\n";
348 append(seq, infix(contig, lastPos, length(contig)));
349
350 if (methSimOptions && methSimOptions->simulateMethylationLevels)
351 {
352 append(levelsSmallVariants->forward, infix(levels->forward, lastPos, length(contig)));
353 append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, length(contig)));
354
355 SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->forward));
356 SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->reverse));
357
358 fixVariationLevels(*levelsSmallVariants, *rng, seq, varPoints, *methSimOptions);
359 }
360
361 return 0;
362 }
363
364 // ----------------------------------------------------------------------------
365 // Function VariantMaterializer::_materializeLargeVariants()
366 // ----------------------------------------------------------------------------
367
_materializeLargeVariants(seqan::Dna5String & seq,MethylationLevels * levelsLargeVariants,std::vector<SmallVarInfo> & varInfos,std::vector<std::pair<int,int>> & breakpoints,PositionMap & positionMap,TJournalEntries const & journal,seqan::Dna5String const & contig,std::vector<SmallVarInfo> const & smallVarInfos,Variants const & variants,MethylationLevels const * levels,int hId)368 int VariantMaterializer::_materializeLargeVariants(
369 seqan::Dna5String & seq,
370 MethylationLevels * levelsLargeVariants,
371 std::vector<SmallVarInfo> & varInfos,
372 std::vector<std::pair<int, int> > & breakpoints,
373 PositionMap & positionMap,
374 TJournalEntries const & journal,
375 seqan::Dna5String const & contig,
376 std::vector<SmallVarInfo> const & smallVarInfos,
377 Variants const & variants,
378 MethylationLevels const * levels,
379 int hId)
380 {
381 if (methSimOptions)
382 {
383 SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
384 SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
385 }
386
387 // We will record all intervals for the positionMap.svIntervalTree in this String.
388 seqan::String<GenomicInterval> intervals;
389
390 // Clear output methylation levels->
391 if (levelsLargeVariants)
392 levelsLargeVariants->clear();
393 // Store variation points. We reuse the fixVariationLevels() function from small indel/snp simulation and thus
394 // have to store a bool that is always set to false.
395 seqan::String<std::pair<int, bool> > varPoints;
396
397 // Track last position from contig appended to seq so far.
398 int lastPos = 0;
399 if (verbosity >= 3)
400 std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
401
402 // Pointer to the current small variant to write out translated to varInfo.
403 std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();
404
405 // Number of bytes written out so far/current position in variant.
406 unsigned currentPos = 0;
407
408 for (unsigned i = 0; i < length(variants.svRecords); ++i)
409 {
410 if (variants.svRecords[i].haplotype != hId) // Ignore all but the current contig.
411 continue;
412 // We obtain a copy of the current SV record since we translate its positions below.
413 StructuralVariantRecord svRecord = variants.svRecords[i];
414
415 // Translate positions and lengths of SV record.
416 if (verbosity >= 2)
417 std::cerr << " Translating SvRecord\n " << svRecord << '\n';
418 svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
419 SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
420 // We do not need to adjust the sizes for insertions.
421 if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
422 svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
423 hostToVirtualPosition(journal, svRecord.pos);
424 if (svRecord.targetPos != -1)
425 svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
426 if (verbosity >= 2)
427 std::cerr << " => " << svRecord << '\n';
428
429 // Copy out small variant infos for interim chars.
430 for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
431 {
432 int offset = (int)currentPos - lastPos;
433 varInfos.push_back(*itSmallVar);
434 varInfos.back().pos += offset;
435 }
436
437 // Copy from contig to seq with SVs.
438 if (verbosity >= 3)
439 std::cerr << "lastPos == " << lastPos << "\n";
440 append(seq, infix(contig, lastPos, svRecord.pos)); // interim chars
441 if (methSimOptions && methSimOptions->simulateMethylationLevels)
442 {
443 append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
444 append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
445 appendValue(varPoints, std::make_pair((int)length(seq), false));
446 }
447 if (currentPos != length(seq))
448 appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
449 '+', GenomicInterval::NORMAL));
450 currentPos = length(seq);
451 if (verbosity >= 3)
452 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") " << __LINE__ << " (interim)\n";
453 switch (svRecord.kind)
454 {
455 case StructuralVariantRecord::INDEL:
456 {
457 if (svRecord.size > 0) // insertion
458 {
459 SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);
460
461 // Simulate methylation levels for insertion.
462 MethylationLevels lvls;
463 if (methSimOptions && methSimOptions->simulateMethylationLevels)
464 {
465 MethylationLevelSimulator methSim(*rng, *methSimOptions);
466 methSim.run(lvls, svRecord.seq);
467 }
468
469 // Append novel sequence and methylation levels.
470 append(seq, svRecord.seq);
471 if (methSimOptions && methSimOptions->simulateMethylationLevels)
472 {
473 append(levelsLargeVariants->forward, lvls.forward);
474 append(levelsLargeVariants->reverse, lvls.reverse);
475 appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point after insertion
476 }
477 if (currentPos != length(seq))
478 appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
479 '+', GenomicInterval::INSERTED));
480 if (verbosity >= 3)
481 std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") " << __LINE__ << " (insertion)\n";
482 lastPos = svRecord.pos;
483 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
484
485 // Copy out breakpoints.
486 breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
487 breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
488
489 currentPos = length(seq);
490 }
491 else // deletion
492 {
493 lastPos = svRecord.pos - svRecord.size;
494 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
495
496 // Copy out breakpoint.
497 breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
498 }
499 }
500 break;
501 case StructuralVariantRecord::INVERSION:
502 {
503 unsigned oldLen = length(seq);
504 append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
505 if (methSimOptions && methSimOptions->simulateMethylationLevels)
506 {
507 appendValue(varPoints, std::make_pair((int)length(seq), false)); // variation point at deletion
508 append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
509 reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
510 append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
511 reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
512 }
513 if (currentPos != length(seq))
514 appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
515 '-', GenomicInterval::INVERTED));
516
517 // Copy out small variant infos for inversion.
518 for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
519 {
520 varInfos.push_back(*itSmallVar);
521 varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
522 }
523
524 if (verbosity >= 3)
525 std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
526 reverseComplement(infix(seq, oldLen, length(seq)));
527 lastPos = svRecord.pos + svRecord.size;
528 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
529
530 // Copy out breakpoints.
531 breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
532 breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
533
534 currentPos = length(seq);
535 }
536 break;
537 case StructuralVariantRecord::TRANSLOCATION:
538 {
539 SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
540 append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
541 if (methSimOptions && methSimOptions->simulateMethylationLevels)
542 {
543 appendValue(varPoints, std::make_pair((int)length(seq), false));
544 append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
545 append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
546 }
547 if (currentPos != length(seq))
548 appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
549 '+', GenomicInterval::NORMAL));
550 unsigned tmpCurrentPos = length(seq);
551 append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
552 if (methSimOptions && methSimOptions->simulateMethylationLevels)
553 {
554 appendValue(varPoints, std::make_pair((int)length(seq), false));
555 append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
556 append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
557 }
558 if (tmpCurrentPos != length(seq))
559 appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
560 '+', GenomicInterval::NORMAL));
561 if (verbosity >= 3)
562 std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
563 << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
564 lastPos = svRecord.targetPos;
565 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
566
567 // Copy out small variant infos for translocation, shift left to right and righ to left but keep
568 // center intact.
569 for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
570 {
571 int offset = (int)currentPos - lastPos;
572 varInfos.push_back(*itSmallVar);
573 varInfos.back().pos += offset;
574
575 int bpLeft = svRecord.pos + svRecord.size;
576 int bpRight = svRecord.targetPos;
577 if (itSmallVar->pos < bpLeft)
578 varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
579 else if (itSmallVar->pos >= bpRight)
580 varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
581 }
582
583 // Copy out breakpoints.
584 breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
585 breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
586 breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
587
588 currentPos = length(seq);
589 }
590 break;
591 case StructuralVariantRecord::DUPLICATION:
592 {
593 append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
594 SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
595 if (methSimOptions && methSimOptions->simulateMethylationLevels) // first copy
596 {
597 appendValue(varPoints, std::make_pair((int)length(seq), false));
598 append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
599 append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
600 }
601 if (currentPos != length(seq))
602 appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
603 '+', GenomicInterval::DUPLICATED));
604 unsigned tmpCurrentPos = length(seq);
605 append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
606 if (methSimOptions && methSimOptions->simulateMethylationLevels)
607 {
608 appendValue(varPoints, std::make_pair((int)length(seq), false));
609 append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
610 append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
611 }
612 if (tmpCurrentPos != length(seq))
613 appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
614 '+', GenomicInterval::NORMAL));
615 tmpCurrentPos = length(seq);
616 append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
617 if (methSimOptions && methSimOptions->simulateMethylationLevels) // second copy
618 {
619 appendValue(varPoints, std::make_pair((int)length(seq), false));
620 append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
621 append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
622 }
623 if (tmpCurrentPos != length(seq))
624 appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
625 '+', GenomicInterval::NORMAL));
626 if (verbosity >= 3)
627 std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
628 << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << "\n"
629 << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
630 lastPos = svRecord.targetPos;
631 SEQAN_ASSERT_LT(lastPos, (int)length(contig));
632
633 // Write out small variant infos for duplication.
634 for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
635 {
636 int offset = (int)currentPos - lastPos;
637 varInfos.push_back(*itSmallVar);
638 varInfos.back().pos += offset;
639
640 if (itSmallVar->pos < svRecord.pos + svRecord.size)
641 {
642 varInfos.push_back(*itSmallVar);
643 varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
644 }
645 }
646
647 // Copy out breakpoints.
648 breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
649 breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
650 breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
651 breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
652
653 currentPos = length(seq);
654 }
655 break;
656 default:
657 return 1;
658 }
659 }
660 if (verbosity >= 3)
661 std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
662 << __LINE__ << " (last interim)\n";
663 append(seq, infix(contig, lastPos, length(contig)));
664 if (methSimOptions && methSimOptions->simulateMethylationLevels)
665 {
666 append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
667 append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));
668
669 SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
670 SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));
671
672 fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
673 }
674 if (currentPos != length(seq))
675 appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
676 '+', GenomicInterval::NORMAL));
677
678 // Copy out small variant infos for trailing characters.
679 for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
680 {
681 int offset = (int)currentPos - lastPos;
682 varInfos.push_back(*itSmallVar);
683 varInfos.back().pos += offset;
684 }
685
686 // Build the interval trees of the positionMap.
687 seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
688 for (unsigned i = 0; i < length(intervals); ++i)
689 appendValue(svIntervals, PositionMap::TInterval(
690 intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
691 for (unsigned i = 0; i < length(intervals); ++i)
692 if (intervals[i].smallVarBeginPos != -1) // ignore insertions
693 appendValue(svIntervalsSTL, PositionMap::TInterval(
694 intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
695 createIntervalTree(positionMap.svIntervalTree, svIntervals);
696 createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);
697
698 return 0;
699 }
700
701 // --------------------------------------------------------------------------
702 // Function PositionMap::overlapsWithBreakpoint()
703 // --------------------------------------------------------------------------
704
overlapsWithBreakpoint(int svBeginPos,int svEndPos) const705 bool PositionMap::overlapsWithBreakpoint(int svBeginPos, int svEndPos) const
706 {
707 std::set<std::pair<int, int> >::const_iterator it = svBreakpoints.upper_bound(std::make_pair(svBeginPos, std::numeric_limits<int>::max()));
708 return (it != svBreakpoints.end() && it->first < svEndPos);
709 }
710
711 // --------------------------------------------------------------------------
712 // Function PositionMap::getGenomicInterval()
713 // --------------------------------------------------------------------------
714
getGenomicInterval(int svPos) const715 GenomicInterval PositionMap::getGenomicInterval(int svPos) const
716 {
717 seqan::String<GenomicInterval> intervals;
718 findIntervals(intervals, svIntervalTree, svPos);
719 SEQAN_ASSERT_EQ(length(intervals), 1u);
720 return intervals[0];
721 }
722
723 // --------------------------------------------------------------------------
724 // PositionMap::getGenomicIntervalSmallVarPos()
725 // --------------------------------------------------------------------------
726
727 // Returns the GenomicInterval on the sequence using a position on the small var reference.
getGenomicIntervalSmallVarPos(int smallVarPos) const728 GenomicInterval PositionMap::getGenomicIntervalSmallVarPos(int smallVarPos) const
729 {
730 seqan::String<GenomicInterval> intervals;
731 findIntervals(intervals, svIntervalTreeSTL, smallVarPos);
732 return intervals[0];
733 }
734
735 // --------------------------------------------------------------------------
736 // Function PositionMap::toSmallVarInterval()
737 // --------------------------------------------------------------------------
738
toSmallVarInterval(int svBeginPos,int svEndPos) const739 std::pair<int, int> PositionMap::toSmallVarInterval(int svBeginPos, int svEndPos) const
740 {
741 SEQAN_ASSERT(!overlapsWithBreakpoint(svBeginPos, svEndPos));
742 GenomicInterval gi = getGenomicInterval(svBeginPos);
743 if (gi.kind == GenomicInterval::INSERTED)
744 {
745 // novel sequence, cannot be projected
746 return std::make_pair(-1, -1);
747 }
748 if (gi.kind != GenomicInterval::INVERTED)
749 {
750 // forward
751 return std::make_pair(gi.smallVarBeginPos + (svBeginPos - gi.svBeginPos),
752 gi.smallVarBeginPos + (svEndPos - gi.svBeginPos));
753 }
754 else
755 {
756 // reverse
757 return std::make_pair(gi.smallVarBeginPos + (gi.svEndPos - svBeginPos),
758 gi.smallVarBeginPos + (gi.svEndPos - svEndPos));
759 }
760
761 return std::make_pair(-1, -1); // cannot reach here
762 }
763
764 // --------------------------------------------------------------------------
765 // Function PositionMap::smallVarToLargeVarInterval()
766 // --------------------------------------------------------------------------
767
smallVarToLargeVarInterval(int beginPos,int endPos) const768 std::pair<int, int> PositionMap::smallVarToLargeVarInterval(int beginPos, int endPos) const
769 {
770 // SEQAN_ASSERT(!overlapsWithBreakpoint(svBeginPos, svEndPos));
771 GenomicInterval gi = getGenomicIntervalSmallVarPos(beginPos);
772 SEQAN_ASSERT_NEQ(gi.kind, GenomicInterval::INSERTED);
773
774 if (gi.kind != GenomicInterval::INVERTED)
775 {
776 // forward
777 return std::make_pair(gi.svBeginPos + (beginPos - gi.smallVarBeginPos),
778 gi.svBeginPos + (endPos - gi.smallVarBeginPos));
779 }
780 else
781 {
782 // reverse
783 return std::make_pair(gi.svBeginPos + (gi.svEndPos - beginPos),
784 gi.svBeginPos + (gi.svEndPos - endPos));
785 }
786
787 return std::make_pair(-1, -1); // cannot reach here
788 }
789
790 // --------------------------------------------------------------------------
791 // Function PositionMap::orignalToSmallVarInterval()
792 // --------------------------------------------------------------------------
793
originalToSmallVarInterval(int beginPos,int endPos) const794 std::pair<int, int> PositionMap::originalToSmallVarInterval(int beginPos, int endPos) const
795 {
796 // TODO(holtgrew): Project to the left at the end.
797
798 // Get anchor gaps objects from anchors.
799 TGaps refGaps(seqan::Nothing(), refGapAnchors);
800 TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
801
802 // Translate begin and end position.
803 int beginPos2 = toViewPosition(refGaps, beginPos);
804 int beginPos3 = toSourcePosition(smallVarGaps, beginPos2);
805 int endPos2 = toViewPosition(refGaps, endPos);
806 int endPos3 = toSourcePosition(smallVarGaps, endPos2);
807 return std::make_pair(beginPos3, endPos3);
808 }
809
810 // --------------------------------------------------------------------------
811 // Function PositionMap::toOriginalInterval()
812 // --------------------------------------------------------------------------
813
toOriginalInterval(int smallVarBeginPos,int smallVarEndPos) const814 std::pair<int, int> PositionMap::toOriginalInterval(int smallVarBeginPos, int smallVarEndPos) const
815 {
816 // TODO(holtgrew): Project to the left at the end.
817
818 // Get anchor gaps objects from anchors.
819 TGaps refGaps(seqan::Nothing(), refGapAnchors);
820 TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
821
822 // Translate begin and end position.
823 int smallVarBeginPos2 = toViewPosition(smallVarGaps, smallVarBeginPos);
824 int smallVarBeginPos3 = toSourcePosition(refGaps, smallVarBeginPos2);
825 int smallVarEndPos2 = toViewPosition(smallVarGaps, smallVarEndPos);
826 int smallVarEndPos3 = toSourcePosition(refGaps, smallVarEndPos2);
827 return std::make_pair(smallVarBeginPos3, smallVarEndPos3);
828 }
829
830 // --------------------------------------------------------------------------
831 // Function PositionMap::reinit()
832 // --------------------------------------------------------------------------
833
reinit(TJournalEntries const & journal)834 void PositionMap::reinit(TJournalEntries const & journal)
835 {
836 // Reset the interval tree and breakpoints.
837 // TODO(holtgrew): Better API support for IntervalTree?
838 svIntervalTree = TIntervalTree();
839 svIntervalTreeSTL = TIntervalTree();
840 svBreakpoints.clear();
841 clear(refGapAnchors);
842 clear(smallVarGapAnchors);
843
844 // Convert the journal to two gaps.
845 //
846 // Get anchor gaps objects from anchors.
847 typedef seqan::Iterator<TGaps, seqan::Standard>::Type TGapsIter;
848 TGaps refGaps(seqan::Nothing(), refGapAnchors);
849 TGapsIter itRef = begin(refGaps, seqan::Standard());
850 TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
851 TGapsIter itVar = begin(smallVarGaps, seqan::Standard());
852
853 // Iterate over the journal.
854 typedef seqan::Iterator<TJournalEntries const, seqan::Standard>::Type TJournalEntriesIt;
855 TJournalEntriesIt it = begin(journal, seqan::Standard());
856 SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);
857 SEQAN_ASSERT_EQ(it->virtualPosition, 0u);
858
859 unsigned lastRefPos = std::numeric_limits<unsigned>::max(); // Previous position from reference.
860 for (; it != end(journal, seqan::Standard()); ++it)
861 {
862 // std::cerr << *it << "\n";
863 SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);
864 if (it->segmentSource == seqan::SOURCE_ORIGINAL)
865 {
866 if (lastRefPos == std::numeric_limits<unsigned>::max())
867 {
868 if (it->physicalPosition != 0)
869 {
870 insertGaps(itRef, it->physicalPosition);
871 itRef += it->physicalPosition;
872 itVar += it->physicalPosition;
873 lastRefPos = it->physicalPosition + it->length;
874 // std::cerr << "INSERT REF GAPS\t" << it->physicalPosition << "\n";
875 }
876 itRef += it->length;
877 itVar += it->length;
878 // std::cerr << "FORWARD\t" << it->length << "\n";
879 }
880 else
881 {
882 if (it->physicalPosition != lastRefPos)
883 {
884 int len = it->physicalPosition - lastRefPos;
885 insertGaps(itVar, len);
886 // std::cerr << "INSERT VAR GAPS\t" << len << "\n";
887 itRef += len;
888 itVar += len;
889 // std::cerr << "FORWARD\t" << len << "\n";
890 }
891 itRef += it->length;
892 itVar += it->length;
893 // std::cerr << "2 FORWARD\t" << it->length << "\n";
894 }
895 lastRefPos = it->physicalPosition + it->length;
896 }
897 else
898 {
899 insertGaps(itRef, it->length);
900 // std::cerr << "INSERT REF GAPS\t" << it->length << "\n";
901 itRef += it->length;
902 itVar += it->length;
903 // std::cerr << "FORWARD\t" << it->length << "\n";
904 }
905 }
906
907 // std::cerr << "--> done\n";
908
909 // typedef seqan::Gaps<seqan::CharString, seqan::AnchorGaps<TGapAnchors> > TGaps2;
910 // seqan::CharString seqH;
911 // seqan::CharString seqV;
912 // for (unsigned i = 0; i < 1000; ++i)
913 // {
914 // appendValue(seqH, 'X');
915 // appendValue(seqV, 'X');
916 // }
917 // TGaps2 gapsH(seqH, refGapAnchors);
918 // TGaps2 gapsV(seqV, smallVarGapAnchors);
919
920 // std::cerr << "REF\t" << gapsH << "\n"
921 // << "VAR\t" << gapsV << "\n";
922 }
923