1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 
35 #include "genomic_variants.h"
36 
operator <<(std::ostream & out,SnpRecord const & record)37 std::ostream & operator<<(std::ostream & out, SnpRecord const & record)
38 {
39     out << "SnpRecord(" << record.haplotype << ", " << record.rId << ", " << record.pos
40         << ", " << record.to << ")";
41     return out;
42 }
43 
operator <<(std::ostream & out,SmallIndelRecord const & record)44 std::ostream & operator<<(std::ostream & out, SmallIndelRecord const & record)
45 {
46     out << "SnpRecord(" << record.haplotype << ", " << record.rId << ", " << record.pos
47         << ", " << record.size << ", " << record.seq << ")";
48     return out;
49 }
50 
endPosition() const51 int StructuralVariantRecord::endPosition() const
52 {
53     if (pos == -1)
54         return std::numeric_limits<int>::max();
55 
56     switch (kind)
57     {
58         case INDEL:
59             if (size > 0)
60                 return pos;
61             else
62                 return pos + size;
63         case INVERSION:
64             return pos + size;
65         case TRANSLOCATION:
66             return targetPos;
67         case DUPLICATION:
68             return targetPos;
69         default:
70             return -1;
71     }
72 }
73 
nearBreakend(int query) const74 bool StructuralVariantRecord::nearBreakend(int query) const
75 {
76     if (pos == -1)
77         return false;  // invalid/sentinel has no breakends
78 
79     switch (kind)
80     {
81         case INDEL:
82             if (size > 0)
83                 return (query == pos || query == pos + 1);
84             else
85                 return (query == pos || query == pos + 1 ||
86                         query == pos - size || query == pos - size + 1);
87         case INVERSION:
88             return (query == pos || query == pos + 1 ||
89                     query == pos + size || query == pos + size + 1);
90         case TRANSLOCATION:
91             return (query == pos || query == pos + 1 ||
92                     query == targetPos || query == targetPos + 1);
93         case DUPLICATION:
94             return (query == pos || query == pos + 1 ||
95                     query == targetPos || query == targetPos + 1);
96         default:
97             return false;
98     }
99 }
100 
operator <<(std::ostream & out,StructuralVariantRecord const & record)101 std::ostream & operator<<(std::ostream & out, StructuralVariantRecord const & record)
102 {
103     char const * kind;
104     switch (record.kind)
105     {
106         case StructuralVariantRecord::TRANSLOCATION:
107             kind = "TRANSLOCATION";
108             break;
109         case StructuralVariantRecord::INDEL:
110             kind = "INDEL";
111             break;
112         case StructuralVariantRecord::INVERSION:
113             kind = "INVERSION";
114             break;
115         case StructuralVariantRecord::DUPLICATION:
116             kind = "DUPLICATION";
117             break;
118         default:
119             kind = "INVALID";
120             break;
121     }
122     out << "StructuralVariantRecord(kind=" << kind << ", haplotype=" << record.haplotype
123         << ", rId=" << record.rId << ", pos=" << record.pos << ", size=" << record.size
124         << ", targetRId=" << record.targetRId << ", targetPos=" << record.targetPos
125         << ", seq=\"" << record.seq << "\")";
126     return out;
127 }
128 
129 // ----------------------------------------------------------------------------
130 // Function VariantMaterializer::_runImpl()
131 // ----------------------------------------------------------------------------
132 
_runImpl(seqan::Dna5String * resultSeq,PositionMap * posMap,MethylationLevels * resultLvls,std::vector<SmallVarInfo> & varInfos,std::vector<std::pair<int,int>> & breakpoints,seqan::Dna5String const * ref,MethylationLevels const * refLvls,int haplotypeId)133 int VariantMaterializer::_runImpl(
134         seqan::Dna5String * resultSeq,
135         PositionMap * posMap,
136         MethylationLevels * resultLvls,
137         std::vector<SmallVarInfo> & varInfos,
138         std::vector<std::pair<int, int> > & breakpoints,
139         seqan::Dna5String const * ref,
140         MethylationLevels const * refLvls,
141         int haplotypeId)
142 {
143     breakpoints.clear();
144     clear(*resultSeq);
145     if (resultLvls)
146         resultLvls->clear();
147 
148     // Apply small variants.  We get a sequence with the small variants and a journal of the difference to contig.
149     seqan::Dna5String seqSmallVariants;
150     TJournalEntries journal;
151     MethylationLevels levelsSmallVariants;  // only used if revLevels != 0
152     MethylationLevels * smallLvlsPtr = refLvls ? &levelsSmallVariants : 0;
153     std::vector<SmallVarInfo> smallVarInfos;
154     if (_materializeSmallVariants(seqSmallVariants, journal, smallLvlsPtr, smallVarInfos, *ref, *variants,
155                                   refLvls, haplotypeId) != 0)
156         return 1;
157 
158     // Build position map for large variant -> small variant and small variant <-> reference position mapping.
159     posMap->reinit(journal);  // build mapping from small variant to reference positions
160 
161     // Apply structural variants and build the interval tree of posMap
162     if (_materializeLargeVariants(*resultSeq, resultLvls, varInfos, breakpoints, *posMap, journal, seqSmallVariants,
163                                   smallVarInfos, *variants, smallLvlsPtr, haplotypeId) != 0)
164         return 1;
165 
166     // Sort resulting variant infos.
167     std::sort(varInfos.begin(), varInfos.end());
168 
169     // std::sort(breakpoints.begin(), breakpoints.end());
170     // std::cout << "SV Breakpoints\n";
171     // for (unsigned i = 0; i < breakpoints.size(); ++i)
172     //     std::cout << "  " << breakpoints[i] << "\n";
173 
174     // Copy out SV breakpoints.
175     posMap->svBreakpoints.insert(breakpoints.begin(), breakpoints.end());
176 
177     return 0;
178 }
179 
180 // ----------------------------------------------------------------------------
181 // Function VariantMaterializer::_materializeSmallVariants()
182 // ----------------------------------------------------------------------------
183 
_materializeSmallVariants(seqan::Dna5String & seq,TJournalEntries & journal,MethylationLevels * levelsSmallVariants,std::vector<SmallVarInfo> & smallVarInfos,seqan::Dna5String const & contig,Variants const & variants,MethylationLevels const * levels,int hId)184 int VariantMaterializer::_materializeSmallVariants(
185         seqan::Dna5String & seq,
186         TJournalEntries & journal,
187         MethylationLevels * levelsSmallVariants,
188         std::vector<SmallVarInfo> & smallVarInfos,
189         seqan::Dna5String const & contig,
190         Variants const & variants,
191         MethylationLevels const * levels,
192         int hId)
193 {
194     if (methSimOptions)
195     {
196         SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsSmallVariants != 0));
197         SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
198     }
199 
200     // Clear journal and output methylation levels.
201     reinit(journal, length(contig));
202     if (levelsSmallVariants)
203         levelsSmallVariants->clear();
204     // Store variation points with a flag whether it is a SNP (true) or a breakpoint (false).
205     seqan::String<std::pair<int, bool> > varPoints;
206 
207     // Fors this, we have to iterate in parallel over SNP and small indel records.
208     //
209     // Current index in snp/small indel array.
210     unsigned snpsIdx = 0;
211     unsigned smallIndelIdx = 0;
212     // Current SNP record, default to sentinel.
213     SnpRecord snpRecord;
214     snpRecord.rId = std::numeric_limits<int>::max();
215     if (snpsIdx < length(variants.snps))
216         snpRecord = variants.snps[snpsIdx++];
217     // Current small indel record, default to sentinel.
218     SmallIndelRecord smallIndelRecord;
219     smallIndelRecord.rId = std::numeric_limits<int>::max();
220     if (smallIndelIdx < length(variants.smallIndels))
221         smallIndelRecord = variants.smallIndels[smallIndelIdx++];
222     // Track last position from contig appended to seq so far.
223     int lastPos = 0;
224     if (verbosity >= 3)
225         std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
226 
227     // TODO(holtgrew): Extract contig building into their own functions.
228     if (verbosity >= 2)
229         std::cerr << "building output\n";
230     while (snpRecord.rId != std::numeric_limits<int>::max() || smallIndelRecord.rId != std::numeric_limits<int>::max())
231     {
232         // TODO(holtgrew): Extract SNP and small indel handling into their own functions.
233         if (snpRecord.getPos() < smallIndelRecord.getPos())  // process SNP records
234         {
235             if (snpRecord.haplotype == hId)  // Ignore all but the current contig.
236             {
237                 if (verbosity >= 3)
238                     std::cerr << "append(seq, infix(contig, " << lastPos << ", " << snpRecord.pos << ") " << __LINE__ << "\n";
239                 // Append interim sequence and methylation levels->
240                 append(seq, infix(contig, lastPos, snpRecord.pos));
241                 if (methSimOptions && methSimOptions->simulateMethylationLevels)
242                 {
243                     append(levelsSmallVariants->forward, infix(levels->forward, lastPos, snpRecord.pos + 1));
244                     append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, snpRecord.pos + 1));
245                     appendValue(varPoints, std::make_pair((int)length(seq), true));      // variation points before/after SNP
246                     appendValue(varPoints, std::make_pair((int)length(seq) + 1, true));
247                 }
248 
249                 SEQAN_ASSERT_GEQ(snpRecord.pos, lastPos);
250                 if (verbosity >= 3)
251                     std::cerr << "appendValue(seq, " << snpRecord.to << "')\n";
252                 appendValue(seq, snpRecord.to);
253                 lastPos = snpRecord.pos + 1;
254                 if (verbosity >= 3)
255                     std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
256 
257                 // Register SNP as small variant info.
258                 smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::SNP, length(seq) - 1, 1));
259             }
260 
261             if (snpsIdx >= length(variants.snps))
262                 snpRecord.rId = std::numeric_limits<int>::max();
263             else
264                 snpRecord = variants.snps[snpsIdx++];
265         }
266         else
267         {
268             if (smallIndelRecord.haplotype == hId)  // Ignore all but the current contig.
269             {
270                 if (smallIndelRecord.size > 0)
271                 {
272                     if (verbosity >= 3)
273                         std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") "
274                                   << __LINE__ << "\n";
275 
276                     // Simulate methylation levels for insertion.
277                     MethylationLevels lvls;
278                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
279                     {
280                         MethylationLevelSimulator methSim(*rng, *methSimOptions);
281                         methSim.run(lvls, smallIndelRecord.seq);
282                     }
283 
284                     // Append interim sequence and methylation levels->
285                     append(seq, infix(contig, lastPos, smallIndelRecord.pos));
286                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
287                     {
288                         append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
289                         append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
290                         appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point before insertion
291                     }
292 
293                     SEQAN_ASSERT_GEQ(smallIndelRecord.pos, lastPos);
294                     if (verbosity >= 3)
295                         std::cerr << "append(seq, \"" << smallIndelRecord.seq << "\") " << __LINE__ << "\n";
296                     // Register insertion as small variant info.
297                     for (unsigned i = 0; i < length(smallIndelRecord.seq); ++i)
298                         smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::INS, length(seq) + i, 1));
299                     // Append novel sequence and methylation levels->
300                     append(seq, smallIndelRecord.seq);
301                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
302                     {
303                         append(levelsSmallVariants->forward, lvls.forward);
304                         append(levelsSmallVariants->reverse, lvls.reverse);
305                         appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
306                     }
307                     lastPos = smallIndelRecord.pos;
308                     recordInsertion(journal, hostToVirtualPosition(journal, smallIndelRecord.pos),
309                                     0, smallIndelRecord.size);
310                     if (verbosity >= 3)
311                         std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
312                 }
313                 else  // deletion
314                 {
315                     if (verbosity >= 3)
316                         std::cerr << "append(seq, infix(contig, " << lastPos << ", " << smallIndelRecord.pos << ") " << __LINE__ << "\n";
317                     // Append interim sequence and methylation levels->
318                     append(seq, infix(contig, lastPos, smallIndelRecord.pos));  // interim chars
319                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
320                     {
321                         appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point at deletion
322                         append(levelsSmallVariants->forward, infix(levels->forward, lastPos, smallIndelRecord.pos));
323                         append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, smallIndelRecord.pos));
324                     }
325 
326                     lastPos = smallIndelRecord.pos - smallIndelRecord.size;
327                     SEQAN_ASSERT_LT(lastPos, (int)length(contig));
328                     recordErase(journal,
329                                 hostToVirtualPosition(journal, smallIndelRecord.pos),
330                                 hostToVirtualPosition(journal, smallIndelRecord.pos - smallIndelRecord.size));
331                     if (verbosity >= 3)
332                         std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
333 
334                     // Register deletion as small variant info.
335                     smallVarInfos.push_back(SmallVarInfo(SmallVarInfo::DEL, length(seq), -smallIndelRecord.size));
336                 }
337             }
338 
339             if (smallIndelIdx >= length(variants.smallIndels))
340                 smallIndelRecord.rId = std::numeric_limits<int>::max();
341             else
342                 smallIndelRecord = variants.smallIndels[smallIndelIdx++];
343         }
344     }
345     // Insert remaining characters.
346     if (verbosity >= 3)
347         std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ")\n";
348     append(seq, infix(contig, lastPos, length(contig)));
349 
350     if (methSimOptions && methSimOptions->simulateMethylationLevels)
351     {
352         append(levelsSmallVariants->forward, infix(levels->forward, lastPos, length(contig)));
353         append(levelsSmallVariants->reverse, infix(levels->reverse, lastPos, length(contig)));
354 
355         SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->forward));
356         SEQAN_ASSERT_EQ(length(seq), length(levelsSmallVariants->reverse));
357 
358         fixVariationLevels(*levelsSmallVariants, *rng, seq, varPoints, *methSimOptions);
359     }
360 
361     return 0;
362 }
363 
364 // ----------------------------------------------------------------------------
365 // Function VariantMaterializer::_materializeLargeVariants()
366 // ----------------------------------------------------------------------------
367 
_materializeLargeVariants(seqan::Dna5String & seq,MethylationLevels * levelsLargeVariants,std::vector<SmallVarInfo> & varInfos,std::vector<std::pair<int,int>> & breakpoints,PositionMap & positionMap,TJournalEntries const & journal,seqan::Dna5String const & contig,std::vector<SmallVarInfo> const & smallVarInfos,Variants const & variants,MethylationLevels const * levels,int hId)368 int VariantMaterializer::_materializeLargeVariants(
369         seqan::Dna5String & seq,
370         MethylationLevels * levelsLargeVariants,
371         std::vector<SmallVarInfo> & varInfos,
372         std::vector<std::pair<int, int> > & breakpoints,
373         PositionMap & positionMap,
374         TJournalEntries const & journal,
375         seqan::Dna5String const & contig,
376         std::vector<SmallVarInfo> const & smallVarInfos,
377         Variants const & variants,
378         MethylationLevels const * levels,
379         int hId)
380 {
381     if (methSimOptions)
382     {
383         SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levelsLargeVariants != 0));
384         SEQAN_ASSERT_EQ(methSimOptions->simulateMethylationLevels, (levels != 0));
385     }
386 
387     // We will record all intervals for the positionMap.svIntervalTree in this String.
388     seqan::String<GenomicInterval> intervals;
389 
390     // Clear output methylation levels->
391     if (levelsLargeVariants)
392         levelsLargeVariants->clear();
393     // Store variation points.  We reuse the fixVariationLevels() function from small indel/snp simulation and thus
394     // have to store a bool that is always set to false.
395     seqan::String<std::pair<int, bool> > varPoints;
396 
397     // Track last position from contig appended to seq so far.
398     int lastPos = 0;
399     if (verbosity >= 3)
400         std::cerr << __LINE__ << "\tlastPos == " << lastPos << "\n";
401 
402     // Pointer to the current small variant to write out translated to varInfo.
403     std::vector<SmallVarInfo>::const_iterator itSmallVar = smallVarInfos.begin();
404 
405     // Number of bytes written out so far/current position in variant.
406     unsigned currentPos = 0;
407 
408     for (unsigned i = 0; i < length(variants.svRecords); ++i)
409     {
410         if (variants.svRecords[i].haplotype != hId)  // Ignore all but the current contig.
411             continue;
412         // We obtain a copy of the current SV record since we translate its positions below.
413         StructuralVariantRecord svRecord = variants.svRecords[i];
414 
415         // Translate positions and lengths of SV record.
416         if (verbosity >= 2)
417             std::cerr << "  Translating SvRecord\n  " << svRecord << '\n';
418         svRecord.pos = hostToVirtualPosition(journal, svRecord.pos);
419         SEQAN_ASSERT_LT(svRecord.pos, (int)length(contig));
420         // We do not need to adjust the sizes for insertions.
421         if (svRecord.kind != StructuralVariantRecord::INDEL || svRecord.size < 0)
422             svRecord.size = hostToVirtualPosition(journal, svRecord.pos + svRecord.size) -
423                     hostToVirtualPosition(journal, svRecord.pos);
424         if (svRecord.targetPos != -1)
425             svRecord.targetPos = hostToVirtualPosition(journal, svRecord.targetPos);
426         if (verbosity >= 2)
427             std::cerr << "  => " << svRecord << '\n';
428 
429         // Copy out small variant infos for interim chars.
430         for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
431         {
432             int offset = (int)currentPos - lastPos;
433             varInfos.push_back(*itSmallVar);
434             varInfos.back().pos += offset;
435         }
436 
437         // Copy from contig to seq with SVs.
438         if (verbosity >= 3)
439             std::cerr << "lastPos == " << lastPos << "\n";
440         append(seq, infix(contig, lastPos, svRecord.pos));  // interim chars
441         if (methSimOptions && methSimOptions->simulateMethylationLevels)
442         {
443             append(levelsLargeVariants->forward, infix(levels->forward, lastPos, svRecord.pos));
444             append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, svRecord.pos));
445             appendValue(varPoints, std::make_pair((int)length(seq), false));
446         }
447         if (currentPos != length(seq))
448             appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, svRecord.pos,
449                                                    '+', GenomicInterval::NORMAL));
450         currentPos = length(seq);
451         if (verbosity >= 3)
452             std::cerr << "append(seq, infix(contig, " << lastPos << ", " << svRecord.pos << ") " << __LINE__ << " (interim)\n";
453         switch (svRecord.kind)
454         {
455             case StructuralVariantRecord::INDEL:
456                 {
457                     if (svRecord.size > 0)  // insertion
458                     {
459                         SEQAN_ASSERT_EQ((int)length(svRecord.seq), svRecord.size);
460 
461                         // Simulate methylation levels for insertion.
462                         MethylationLevels lvls;
463                         if (methSimOptions && methSimOptions->simulateMethylationLevels)
464                         {
465                             MethylationLevelSimulator methSim(*rng, *methSimOptions);
466                             methSim.run(lvls, svRecord.seq);
467                         }
468 
469                         // Append novel sequence and methylation levels.
470                         append(seq, svRecord.seq);
471                         if (methSimOptions && methSimOptions->simulateMethylationLevels)
472                         {
473                             append(levelsLargeVariants->forward, lvls.forward);
474                             append(levelsLargeVariants->reverse, lvls.reverse);
475                             appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point after insertion
476                         }
477                         if (currentPos != length(seq))
478                             appendValue(intervals, GenomicInterval(currentPos, length(seq), -1, -1,
479                                                                    '+', GenomicInterval::INSERTED));
480                         if (verbosity >= 3)
481                             std::cerr << "append(seq, svRecord.seq (length == " << length(svRecord.seq) << ") " << __LINE__ << " (insertion)\n";
482                         lastPos = svRecord.pos;
483                         SEQAN_ASSERT_LT(lastPos, (int)length(contig));
484 
485                         // Copy out breakpoints.
486                         breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
487                         breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
488 
489                         currentPos = length(seq);
490                     }
491                     else  // deletion
492                     {
493                         lastPos = svRecord.pos - svRecord.size;
494                         SEQAN_ASSERT_LT(lastPos, (int)length(contig));
495 
496                         // Copy out breakpoint.
497                         breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
498                     }
499                 }
500                 break;
501             case StructuralVariantRecord::INVERSION:
502                 {
503                     unsigned oldLen = length(seq);
504                     append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
505                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
506                     {
507                         appendValue(varPoints, std::make_pair((int)length(seq), false));  // variation point at deletion
508                         append(levelsLargeVariants->forward, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
509                         reverse(infix(levelsLargeVariants->forward, oldLen, length(levelsLargeVariants->forward)));
510                         append(levelsLargeVariants->reverse, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
511                         reverse(infix(levelsLargeVariants->reverse, oldLen, length(levelsLargeVariants->reverse)));
512                     }
513                     if (currentPos != length(seq))
514                         appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
515                                                                '-', GenomicInterval::INVERTED));
516 
517                     // Copy out small variant infos for inversion.
518                     for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
519                     {
520                         varInfos.push_back(*itSmallVar);
521                         varInfos.back().pos = currentPos + svRecord.size - (varInfos.back().pos - lastPos);
522                     }
523 
524                     if (verbosity >= 3)
525                         std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (inversion)\n";
526                     reverseComplement(infix(seq, oldLen, length(seq)));
527                     lastPos = svRecord.pos + svRecord.size;
528                     SEQAN_ASSERT_LT(lastPos, (int)length(contig));
529 
530                     // Copy out breakpoints.
531                     breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
532                     breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
533 
534                     currentPos = length(seq);
535                 }
536                 break;
537             case StructuralVariantRecord::TRANSLOCATION:
538                 {
539                     SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
540                     append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
541                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
542                     {
543                         appendValue(varPoints, std::make_pair((int)length(seq), false));
544                         append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
545                         append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
546                     }
547                     if (currentPos != length(seq))
548                         appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
549                                                                '+', GenomicInterval::NORMAL));
550                     unsigned tmpCurrentPos = length(seq);
551                     append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
552                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
553                     {
554                         appendValue(varPoints, std::make_pair((int)length(seq), false));
555                         append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
556                         append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
557                     }
558                     if (tmpCurrentPos != length(seq))
559                         appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
560                                                                '+', GenomicInterval::NORMAL));
561                     if (verbosity >= 3)
562                         std::cerr << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << " (translocation)\n"
563                                   << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
564                     lastPos = svRecord.targetPos;
565                     SEQAN_ASSERT_LT(lastPos, (int)length(contig));
566 
567                     // Copy out small variant infos for translocation, shift left to right and righ to left but keep
568                     // center intact.
569                     for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos; ++itSmallVar)
570                     {
571                         int offset = (int)currentPos - lastPos;
572                         varInfos.push_back(*itSmallVar);
573                         varInfos.back().pos += offset;
574 
575                         int bpLeft = svRecord.pos + svRecord.size;
576                         int bpRight = svRecord.targetPos;
577                         if (itSmallVar->pos < bpLeft)
578                             varInfos.back().pos -= (svRecord.targetPos - svRecord.pos);
579                         else if (itSmallVar->pos >= bpRight)
580                             varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
581                     }
582 
583                     // Copy out breakpoints.
584                     breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
585                     breakpoints.push_back(std::make_pair(currentPos + svRecord.targetPos - svRecord.pos - svRecord.size, variants.posToIdx(Variants::SV, i)));
586                     breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
587 
588                     currentPos = length(seq);
589                 }
590                 break;
591             case StructuralVariantRecord::DUPLICATION:
592                 {
593                     append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
594                     SEQAN_ASSERT_GEQ(svRecord.targetPos, svRecord.pos + svRecord.size);
595                     if (methSimOptions && methSimOptions->simulateMethylationLevels)  // first copy
596                     {
597                         appendValue(varPoints, std::make_pair((int)length(seq), false));
598                         append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
599                         append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
600                     }
601                     if (currentPos != length(seq))
602                         appendValue(intervals, GenomicInterval(currentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
603                                                                '+', GenomicInterval::DUPLICATED));
604                     unsigned tmpCurrentPos = length(seq);
605                     append(seq, infix(contig, svRecord.pos + svRecord.size, svRecord.targetPos));
606                     if (methSimOptions && methSimOptions->simulateMethylationLevels)
607                     {
608                         appendValue(varPoints, std::make_pair((int)length(seq), false));
609                         append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos + svRecord.size, svRecord.targetPos));
610                         append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos + svRecord.size, svRecord.targetPos));
611                     }
612                     if (tmpCurrentPos != length(seq))
613                         appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos + svRecord.size, svRecord.targetPos,
614                                                                '+', GenomicInterval::NORMAL));
615                     tmpCurrentPos = length(seq);
616                     append(seq, infix(contig, svRecord.pos, svRecord.pos + svRecord.size));
617                     if (methSimOptions && methSimOptions->simulateMethylationLevels)  // second copy
618                     {
619                         appendValue(varPoints, std::make_pair((int)length(seq), false));
620                         append(levelsLargeVariants->forward, infix(levels->forward, svRecord.pos, svRecord.pos + svRecord.size));
621                         append(levelsLargeVariants->reverse, infix(levels->reverse, svRecord.pos, svRecord.pos + svRecord.size));
622                     }
623                     if (tmpCurrentPos != length(seq))
624                         appendValue(intervals, GenomicInterval(tmpCurrentPos, length(seq), svRecord.pos, svRecord.pos + svRecord.size,
625                                                                '+', GenomicInterval::NORMAL));
626                     if (verbosity >= 3)
627                         std::cerr << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << " (duplication)\n"
628                                   << "append(seq, infix(contig, " << svRecord.pos + svRecord.size << ", " << svRecord.targetPos << ") " << __LINE__ << "\n"
629                                   << "append(seq, infix(contig, " << svRecord.pos << ", " << svRecord.pos + svRecord.size << ") " << __LINE__ << "\n";
630                     lastPos = svRecord.targetPos;
631                     SEQAN_ASSERT_LT(lastPos, (int)length(contig));
632 
633                     // Write out small variant infos for duplication.
634                     for (; itSmallVar != smallVarInfos.end() && itSmallVar->pos < svRecord.pos + svRecord.size; ++itSmallVar)
635                     {
636                         int offset = (int)currentPos - lastPos;
637                         varInfos.push_back(*itSmallVar);
638                         varInfos.back().pos += offset;
639 
640                         if (itSmallVar->pos < svRecord.pos + svRecord.size)
641                         {
642                             varInfos.push_back(*itSmallVar);
643                             varInfos.back().pos += (svRecord.targetPos - svRecord.pos);
644                         }
645                     }
646 
647                     // Copy out breakpoints.
648                     breakpoints.push_back(std::make_pair(currentPos, variants.posToIdx(Variants::SV, i)));
649                     breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos, variants.posToIdx(Variants::SV, i)));
650                     breakpoints.push_back(std::make_pair(currentPos + svRecord.pos + svRecord.size - svRecord.pos + svRecord.targetPos - (svRecord.pos + svRecord.size), variants.posToIdx(Variants::SV, i)));
651                     breakpoints.push_back(std::make_pair((int)length(seq), variants.posToIdx(Variants::SV, i)));
652 
653                     currentPos = length(seq);
654                 }
655                 break;
656             default:
657                 return 1;
658         }
659     }
660     if (verbosity >= 3)
661         std::cerr << "append(seq, infix(contig, " << lastPos << ", " << length(contig) << ") "
662                   << __LINE__ << " (last interim)\n";
663     append(seq, infix(contig, lastPos, length(contig)));
664     if (methSimOptions && methSimOptions->simulateMethylationLevels)
665     {
666         append(levelsLargeVariants->forward, infix(levels->forward, lastPos, length(contig)));
667         append(levelsLargeVariants->reverse, infix(levels->reverse, lastPos, length(contig)));
668 
669         SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->forward));
670         SEQAN_ASSERT_EQ(length(seq), length(levelsLargeVariants->reverse));
671 
672         fixVariationLevels(*levelsLargeVariants, *rng, seq, varPoints, *methSimOptions);
673     }
674     if (currentPos != length(seq))
675         appendValue(intervals, GenomicInterval(currentPos, length(seq), lastPos, length(contig),
676                                                '+', GenomicInterval::NORMAL));
677 
678     // Copy out small variant infos for trailing characters.
679     for (; itSmallVar != smallVarInfos.end(); ++itSmallVar)
680     {
681         int offset = (int)currentPos - lastPos;
682         varInfos.push_back(*itSmallVar);
683         varInfos.back().pos += offset;
684     }
685 
686     // Build the interval trees of the positionMap.
687     seqan::String<PositionMap::TInterval> svIntervals, svIntervalsSTL;
688     for (unsigned i = 0; i < length(intervals); ++i)
689         appendValue(svIntervals, PositionMap::TInterval(
690                 intervals[i].svBeginPos, intervals[i].svEndPos, intervals[i]));
691     for (unsigned i = 0; i < length(intervals); ++i)
692         if (intervals[i].smallVarBeginPos != -1)  // ignore insertions
693             appendValue(svIntervalsSTL, PositionMap::TInterval(
694                     intervals[i].smallVarBeginPos, intervals[i].smallVarEndPos, intervals[i]));
695     createIntervalTree(positionMap.svIntervalTree, svIntervals);
696     createIntervalTree(positionMap.svIntervalTreeSTL, svIntervalsSTL);
697 
698     return 0;
699 }
700 
701 // --------------------------------------------------------------------------
702 // Function PositionMap::overlapsWithBreakpoint()
703 // --------------------------------------------------------------------------
704 
overlapsWithBreakpoint(int svBeginPos,int svEndPos) const705 bool PositionMap::overlapsWithBreakpoint(int svBeginPos, int svEndPos) const
706 {
707     std::set<std::pair<int, int> >::const_iterator it = svBreakpoints.upper_bound(std::make_pair(svBeginPos, std::numeric_limits<int>::max()));
708     return (it != svBreakpoints.end() && it->first < svEndPos);
709 }
710 
711 // --------------------------------------------------------------------------
712 // Function PositionMap::getGenomicInterval()
713 // --------------------------------------------------------------------------
714 
getGenomicInterval(int svPos) const715 GenomicInterval PositionMap::getGenomicInterval(int svPos) const
716 {
717     seqan::String<GenomicInterval> intervals;
718     findIntervals(intervals, svIntervalTree, svPos);
719     SEQAN_ASSERT_EQ(length(intervals), 1u);
720     return intervals[0];
721 }
722 
723 // --------------------------------------------------------------------------
724 // PositionMap::getGenomicIntervalSmallVarPos()
725 // --------------------------------------------------------------------------
726 
727 // Returns the GenomicInterval on the sequence using a position on the small var reference.
getGenomicIntervalSmallVarPos(int smallVarPos) const728 GenomicInterval PositionMap::getGenomicIntervalSmallVarPos(int smallVarPos) const
729 {
730     seqan::String<GenomicInterval> intervals;
731     findIntervals(intervals, svIntervalTreeSTL, smallVarPos);
732     return intervals[0];
733 }
734 
735 // --------------------------------------------------------------------------
736 // Function PositionMap::toSmallVarInterval()
737 // --------------------------------------------------------------------------
738 
toSmallVarInterval(int svBeginPos,int svEndPos) const739 std::pair<int, int> PositionMap::toSmallVarInterval(int svBeginPos, int svEndPos) const
740 {
741     SEQAN_ASSERT(!overlapsWithBreakpoint(svBeginPos, svEndPos));
742     GenomicInterval gi = getGenomicInterval(svBeginPos);
743     if (gi.kind == GenomicInterval::INSERTED)
744     {
745         // novel sequence, cannot be projected
746         return std::make_pair(-1, -1);
747     }
748     if (gi.kind != GenomicInterval::INVERTED)
749     {
750         // forward
751         return std::make_pair(gi.smallVarBeginPos + (svBeginPos - gi.svBeginPos),
752                               gi.smallVarBeginPos + (svEndPos - gi.svBeginPos));
753     }
754     else
755     {
756         // reverse
757         return std::make_pair(gi.smallVarBeginPos + (gi.svEndPos - svBeginPos),
758                               gi.smallVarBeginPos + (gi.svEndPos - svEndPos));
759     }
760 
761     return std::make_pair(-1, -1);  // cannot reach here
762 }
763 
764 // --------------------------------------------------------------------------
765 // Function PositionMap::smallVarToLargeVarInterval()
766 // --------------------------------------------------------------------------
767 
smallVarToLargeVarInterval(int beginPos,int endPos) const768 std::pair<int, int> PositionMap::smallVarToLargeVarInterval(int beginPos, int endPos) const
769 {
770     // SEQAN_ASSERT(!overlapsWithBreakpoint(svBeginPos, svEndPos));
771     GenomicInterval gi = getGenomicIntervalSmallVarPos(beginPos);
772     SEQAN_ASSERT_NEQ(gi.kind, GenomicInterval::INSERTED);
773 
774     if (gi.kind != GenomicInterval::INVERTED)
775     {
776         // forward
777         return std::make_pair(gi.svBeginPos + (beginPos - gi.smallVarBeginPos),
778                               gi.svBeginPos + (endPos - gi.smallVarBeginPos));
779     }
780     else
781     {
782         // reverse
783         return std::make_pair(gi.svBeginPos + (gi.svEndPos - beginPos),
784                               gi.svBeginPos + (gi.svEndPos - endPos));
785     }
786 
787     return std::make_pair(-1, -1);  // cannot reach here
788 }
789 
790 // --------------------------------------------------------------------------
791 // Function PositionMap::orignalToSmallVarInterval()
792 // --------------------------------------------------------------------------
793 
originalToSmallVarInterval(int beginPos,int endPos) const794 std::pair<int, int> PositionMap::originalToSmallVarInterval(int beginPos, int endPos) const
795 {
796     // TODO(holtgrew): Project to the left at the end.
797 
798     // Get anchor gaps objects from anchors.
799     TGaps refGaps(seqan::Nothing(), refGapAnchors);
800     TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
801 
802     // Translate begin and end position.
803     int beginPos2 = toViewPosition(refGaps, beginPos);
804     int beginPos3 = toSourcePosition(smallVarGaps, beginPos2);
805     int endPos2 = toViewPosition(refGaps, endPos);
806     int endPos3 = toSourcePosition(smallVarGaps, endPos2);
807     return std::make_pair(beginPos3, endPos3);
808 }
809 
810 // --------------------------------------------------------------------------
811 // Function PositionMap::toOriginalInterval()
812 // --------------------------------------------------------------------------
813 
toOriginalInterval(int smallVarBeginPos,int smallVarEndPos) const814 std::pair<int, int> PositionMap::toOriginalInterval(int smallVarBeginPos, int smallVarEndPos) const
815 {
816     // TODO(holtgrew): Project to the left at the end.
817 
818     // Get anchor gaps objects from anchors.
819     TGaps refGaps(seqan::Nothing(), refGapAnchors);
820     TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
821 
822     // Translate begin and end position.
823     int smallVarBeginPos2 = toViewPosition(smallVarGaps, smallVarBeginPos);
824     int smallVarBeginPos3 = toSourcePosition(refGaps, smallVarBeginPos2);
825     int smallVarEndPos2 = toViewPosition(smallVarGaps, smallVarEndPos);
826     int smallVarEndPos3 = toSourcePosition(refGaps, smallVarEndPos2);
827     return std::make_pair(smallVarBeginPos3, smallVarEndPos3);
828 }
829 
830 // --------------------------------------------------------------------------
831 // Function PositionMap::reinit()
832 // --------------------------------------------------------------------------
833 
reinit(TJournalEntries const & journal)834 void PositionMap::reinit(TJournalEntries const & journal)
835 {
836     // Reset the interval tree and breakpoints.
837     // TODO(holtgrew): Better API support for IntervalTree?
838     svIntervalTree = TIntervalTree();
839     svIntervalTreeSTL = TIntervalTree();
840     svBreakpoints.clear();
841     clear(refGapAnchors);
842     clear(smallVarGapAnchors);
843 
844     // Convert the journal to two gaps.
845     //
846     // Get anchor gaps objects from anchors.
847     typedef seqan::Iterator<TGaps, seqan::Standard>::Type TGapsIter;
848     TGaps refGaps(seqan::Nothing(), refGapAnchors);
849     TGapsIter itRef = begin(refGaps, seqan::Standard());
850     TGaps smallVarGaps(seqan::Nothing(), smallVarGapAnchors);
851     TGapsIter itVar = begin(smallVarGaps, seqan::Standard());
852 
853     // Iterate over the journal.
854     typedef seqan::Iterator<TJournalEntries const, seqan::Standard>::Type TJournalEntriesIt;
855     TJournalEntriesIt it = begin(journal, seqan::Standard());
856     SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);
857     SEQAN_ASSERT_EQ(it->virtualPosition, 0u);
858 
859     unsigned lastRefPos = std::numeric_limits<unsigned>::max();  // Previous position from reference.
860     for (; it != end(journal, seqan::Standard()); ++it)
861     {
862         // std::cerr << *it << "\n";
863         SEQAN_ASSERT_NEQ(it->segmentSource, seqan::SOURCE_NULL);
864         if (it->segmentSource == seqan::SOURCE_ORIGINAL)
865         {
866             if (lastRefPos == std::numeric_limits<unsigned>::max())
867             {
868                 if (it->physicalPosition != 0)
869                 {
870                     insertGaps(itRef, it->physicalPosition);
871                     itRef += it->physicalPosition;
872                     itVar += it->physicalPosition;
873                     lastRefPos = it->physicalPosition + it->length;
874                     // std::cerr << "INSERT REF GAPS\t" << it->physicalPosition << "\n";
875                 }
876                 itRef += it->length;
877                 itVar += it->length;
878                 // std::cerr << "FORWARD\t" << it->length << "\n";
879             }
880             else
881             {
882                 if (it->physicalPosition != lastRefPos)
883                 {
884                     int len = it->physicalPosition - lastRefPos;
885                     insertGaps(itVar, len);
886                     // std::cerr << "INSERT VAR GAPS\t" << len << "\n";
887                     itRef += len;
888                     itVar += len;
889                     // std::cerr << "FORWARD\t" << len << "\n";
890                 }
891                 itRef += it->length;
892                 itVar += it->length;
893                 // std::cerr << "2 FORWARD\t" << it->length << "\n";
894             }
895             lastRefPos = it->physicalPosition + it->length;
896         }
897         else
898         {
899             insertGaps(itRef, it->length);
900             // std::cerr << "INSERT REF GAPS\t" << it->length << "\n";
901             itRef += it->length;
902             itVar += it->length;
903             // std::cerr << "FORWARD\t" << it->length << "\n";
904         }
905     }
906 
907     // std::cerr << "--> done\n";
908 
909     // typedef seqan::Gaps<seqan::CharString, seqan::AnchorGaps<TGapAnchors> > TGaps2;
910     // seqan::CharString seqH;
911     // seqan::CharString seqV;
912     // for (unsigned i = 0; i < 1000; ++i)
913     // {
914     //     appendValue(seqH, 'X');
915     //     appendValue(seqV, 'X');
916     // }
917     // TGaps2 gapsH(seqH, refGapAnchors);
918     // TGaps2 gapsV(seqV, smallVarGapAnchors);
919 
920     // std::cerr << "REF\t" << gapsH << "\n"
921     //           << "VAR\t" << gapsV << "\n";
922 }
923