1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Compute transcripts from a genome and a GFF/GTF file.  Optionally, you
35 // can apply a VCF file to the genome before splicing.
36 //
37 // Transcripts must not span structural variants.
38 // ==========================================================================
39 
40 // Note: We treat all given variants as phased.
41 
42 #include <seqan/arg_parse.h>
43 #include <seqan/basic.h>
44 #include <seqan/seq_io.h>
45 #include <seqan/sequence.h>
46 #include <seqan/vcf_io.h>
47 #include <seqan/gff_io.h>
48 
49 #include "vcf_materialization.h"
50 #include "mason_options.h"
51 #include "mason_types.h"
52 
53 // ==========================================================================
54 // Classes
55 // ==========================================================================
56 
57 // --------------------------------------------------------------------------
58 // Class MyGffRecord
59 // --------------------------------------------------------------------------
60 
61 // Subclass of GffRecord that has an rID member.
62 
63 class MyGffRecord : public seqan::GffRecord
64 {
65 public:
66     int rID;
67 
68     static const int INVALID_IDX;
69 
MyGffRecord()70     MyGffRecord() : seqan::GffRecord(), rID(std::numeric_limits<int>::max())
71     {}
72 };
73 
74 const int MyGffRecord::INVALID_IDX = std::numeric_limits<int>::max();
75 
76 // --------------------------------------------------------------------------
77 // Class SplicingInstruction
78 // --------------------------------------------------------------------------
79 
80 // Represents one exon.
81 
82 struct SplicingInstruction
83 {
84     // ID of the transcript that this instruction belongs to.
85     int transcriptID;
86     // Begin and end position of the exon.
87     int beginPos, endPos;
88     // The strand of the instruction '-' or '+'.
89     char strand;
90 
SplicingInstructionSplicingInstruction91     SplicingInstruction() : transcriptID(-1), beginPos(-1), endPos(-1), strand('.')
92     {}
93 
SplicingInstructionSplicingInstruction94     SplicingInstruction(int transcriptID, int beginPos, int endPos, char strand) :
95             transcriptID(transcriptID), beginPos(beginPos), endPos(endPos), strand(strand)
96     {}
97 
operator <SplicingInstruction98     bool operator<(SplicingInstruction const & other) const
99     {
100         return std::make_pair(transcriptID, std::make_pair(beginPos, std::make_pair(endPos, strand))) <
101                 std::make_pair(other.transcriptID,
102                                std::make_pair(other.beginPos, std::make_pair(other.endPos, other.strand)));
103     }
104 };
105 
differentTranscript(SplicingInstruction const & lhs,SplicingInstruction const & rhs)106 bool differentTranscript(SplicingInstruction const & lhs, SplicingInstruction const & rhs)
107 {
108     return lhs.transcriptID != rhs.transcriptID;
109 }
110 
111 // --------------------------------------------------------------------------
112 // Class MasonSplicingApp
113 // --------------------------------------------------------------------------
114 
115 class MasonSplicingApp
116 {
117 public:
118     // The configuration to use.
119     MasonSplicingOptions const & options;
120 
121     // The random number generation.
122     TRng rng;
123 
124     // Materialization of VCF.
125     VcfMaterializer vcfMat;
126 
127     // Input GFF/GTF stream.
128     seqan::GffFileIn gffFileIn;
129 
130     // Output sequence stream.
131     seqan::SeqFileOut seqFileOut;
132 
MasonSplicingApp(MasonSplicingOptions const & _options)133     MasonSplicingApp(MasonSplicingOptions const & _options) :
134             options(_options), rng(options.seed),
135             vcfMat(rng, toCString(options.matOptions.fastaFileName), toCString(options.matOptions.vcfFileName))
136     {}
137 
run()138     int run()
139     {
140         // Intialization
141         std::cerr << "__INITIALIZATION_____________________________________________________________\n"
142                   << "\n";
143 
144         std::cerr << "Opening files...";
145         try
146         {
147             vcfMat.init();
148 
149             if (!open(seqFileOut, toCString(options.outputFileName)))
150                 throw MasonIOException("Could not open output file.");
151 
152             if (!open(gffFileIn, toCString(options.inputGffFile)))
153                 throw MasonIOException("Could not open GFF/GTF file.");
154         }
155         catch (MasonIOException & e)
156         {
157             std::cerr << "\nERROR: " << e.what() << "\n";
158             return 1;
159         }
160         std::cerr << " OK\n";
161 
162         // Perform genome simulation.
163         std::cerr << "\n__COMPUTING TRANSCRIPTS______________________________________________________\n"
164                   << "\n";
165 
166         // Read first GFF record.
167         MyGffRecord record;
168         _readFirstRecord(record);
169         if (record.rID == std::numeric_limits<int>::max())
170             return 0;  // at end, could not read any, done
171 
172         // Transcript names.
173         typedef seqan::StringSet<seqan::CharString> TNameStore;
174         typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
175         TNameStore transcriptNames;
176         TNameStoreCache transcriptNamesCache(transcriptNames);
177 
178         // The splicing instructions for the current contig.
179         std::vector<SplicingInstruction> splicingInstructions;
180 
181         // Materialized sequence.
182         seqan::Dna5String seq;
183         // Tanscript ids, used as a buffer below.
184         seqan::String<unsigned> transcriptIDs;
185 
186         // Read GFF/GTF file contig by contig (must be sorted by reference name).  For each contig, we all recors,
187         // create simulation instructions and then build the transcripts for each haplotype.
188         while (record.rID != std::numeric_limits<int>::max())  // sentinel, at end
189         {
190             seqan::CharString refName = record.ref;
191             std::cerr << "Splicing for " << refName << " ...";
192 
193             // Read GFF records for this contig.
194             MyGffRecord firstGffRecord = record;
195             while (record.rID == firstGffRecord.rID)
196             {
197                 if (empty(options.gffType) || (record.type == options.gffType))
198                 {
199                     // Make transcript names known to the record.
200                     _appendTranscriptNames(transcriptIDs, transcriptNames, transcriptNamesCache, record);
201                     // Add the splicing instructions for this record to the list for this contig.
202                     for (unsigned i = 0; i < length(transcriptIDs); ++i)
203                         splicingInstructions.push_back(SplicingInstruction(transcriptIDs[i], record.beginPos,
204                                                                            record.endPos, record.strand));
205                 }
206 
207                 if (atEnd(gffFileIn))
208                 {
209                     record.rID = std::numeric_limits<int>::max();
210                     break;
211                 }
212 
213                 readRecord(record, gffFileIn);
214                 // Translate ref to idx from VCF.
215                 unsigned idx = 0;
216                 if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
217                     throw MasonIOException("Reference name from GFF/GTF not in VCF!");
218                 record.rID = idx;
219             }
220 
221             // ---------------------------------------------------------------
222             // Process the splicing instructions.
223             // ---------------------------------------------------------------
224 
225             // First, sort them.
226             std::sort(splicingInstructions.begin(), splicingInstructions.end());
227 
228             // Materialize all haplotypes of this contig
229             int rID = 0, hID = 0;  // reference and haplotype id
230             // Get index of the gff record's reference in the VCF file.
231             unsigned idx = 0;
232             if (!getIdByName(idx, vcfMat.faiIndex, refName))
233             {
234                 std::stringstream ss;
235                 ss << "Reference from GFF file " << refName << " unknown in FASTA/FAI file.";
236                 throw MasonIOException(ss.str());
237             }
238             rID = idx;
239 
240             vcfMat.currRID = rID - 1;
241             std::vector<SmallVarInfo> varInfos;  // small variants for counting in read alignments
242             std::vector<std::pair<int, int> > breakpoints;  // unused/ignored
243             while (vcfMat.materializeNext(seq, varInfos, breakpoints, rID, hID))
244             {
245                 std::cerr << " (allele " << (hID + 1) << ")";
246                 if (rID != (int)idx)
247                     break;  // no more haplotypes for this reference
248                 _performSplicing(splicingInstructions, seq, transcriptNames, hID, vcfMat);
249             }
250 
251             std::cerr << " DONE.\n";
252 
253             // ---------------------------------------------------------------
254             // Handle contig switching.
255             // ---------------------------------------------------------------
256 
257             // Check that the input GFF file is clustered (weaker than sorted) by reference name.
258             if (record.rID < firstGffRecord.rID)
259                 throw MasonIOException("GFF file not sorted or clustered by reference.");
260             // Reset transcript names and cache.
261             clear(transcriptNames);
262             refresh(transcriptNamesCache);
263             // Flush splicing instructions.
264             splicingInstructions.clear();
265         }
266 
267         std::cerr << "\nDone splicing FASTA.\n";
268 
269         return 0;
270     }
271 
272     // Perform splicing of transcripts.
_performSplicing(std::vector<SplicingInstruction> const & instructions,seqan::Dna5String const & seq,seqan::StringSet<seqan::CharString> const & tNames,int hID,VcfMaterializer const & vcfMat)273     void _performSplicing(std::vector<SplicingInstruction> const & instructions,
274                           seqan::Dna5String const & seq,
275                           seqan::StringSet<seqan::CharString> const & tNames,
276                           int hID,  // -1 in case of no variants
277                           VcfMaterializer const & vcfMat)
278     {
279         typedef std::vector<SplicingInstruction>::const_iterator TIter;
280         TIter it = instructions.begin();
281         TIter itEnd = std::adjacent_find(it, instructions.end(), differentTranscript);
282         if (itEnd != instructions.end())
283             ++itEnd;
284 
285         seqan::Dna5String transcript, buffer;
286 
287         do
288         {
289             clear(transcript);
290 
291             bool onBreakpoint = false;
292             int tID = it->transcriptID;
293             for (; it != itEnd; ++it)
294             {
295                 // Convert from original coordinate system to coordinate system with SVs.
296                 std::pair<int, int> smallVarInt = vcfMat.posMap.originalToSmallVarInterval(
297                         it->beginPos, it->endPos);
298                 GenomicInterval gi = vcfMat.posMap.getGenomicIntervalSmallVarPos(smallVarInt.first);
299                 SEQAN_ASSERT_GT(smallVarInt.second, 0);
300                 GenomicInterval giR = vcfMat.posMap.getGenomicIntervalSmallVarPos(smallVarInt.second - 1);
301                 bool overlapsWithBreakpoint = (gi != giR);
302                 std::pair<int, int> largeVarInt = vcfMat.posMap.smallVarToLargeVarInterval(
303                         smallVarInt.first, smallVarInt.second);
304 
305                 // Transcripts with exons overlapping breakpoints are not written out.
306                 if (overlapsWithBreakpoint)
307                 {
308                     onBreakpoint = true;
309                     break;
310                 }
311 
312                 // Append buffer to transcript in original state or reverse-complemented.
313                 buffer = infix(seq, largeVarInt.first, largeVarInt.second);
314                 if (it->strand != gi.strand)
315                     reverseComplement(buffer);
316                 append(transcript, buffer);
317             }
318 
319             if (onBreakpoint)
320             {
321                 std::cerr << "\nWARNING: Exon lies on breakpoint!\n";
322                 while (it != instructions.end() && it->transcriptID == tID)
323                     ++it;
324             }
325             else
326             {
327                 std::stringstream ss;
328                 ss << tNames[tID];
329                 if (!empty(options.matOptions.vcfFileName))
330                     ss << options.haplotypeNameSep << (hID + 1);
331                 writeRecord(seqFileOut, ss.str(), transcript);
332             }
333 
334             // Search next range.
335             itEnd = std::adjacent_find(it, instructions.end(), differentTranscript);
336             if (itEnd != instructions.end())
337                 ++itEnd;
338         }
339         while (it != instructions.end());
340     }
341 
342     // Append the transcript names for the given record.
_appendTranscriptNames(seqan::String<unsigned> & tIDs,seqan::StringSet<seqan::CharString> & contigNames,seqan::NameStoreCache<seqan::StringSet<seqan::CharString>> & cache,MyGffRecord const & record)343     void _appendTranscriptNames(seqan::String<unsigned> & tIDs,  // transcript ids to write out
344                                 seqan::StringSet<seqan::CharString> & contigNames,
345                                 seqan::NameStoreCache<seqan::StringSet<seqan::CharString> > & cache,
346                                 MyGffRecord const & record)
347     {
348         clear(tIDs);
349 
350         seqan::CharString groupNames;
351         for (unsigned i = 0; i < length(record.tagNames); ++i)
352             if (record.tagNames[i] == options.gffGroupBy)
353                 groupNames = record.tagValues[i];
354         if (empty(groupNames))
355             return;  // Record has no group names.
356 
357         // Write out the ids of the transcripts that the record belongs to as indices in contigNames.
358         unsigned idx = 0;
359         seqan::StringSet<seqan::CharString> ss;
360         strSplit(ss, groupNames, seqan::EqualsChar<','>());
361         for (unsigned i = 0; i < length(ss); ++i)
362         {
363             if (empty(ss[i]))
364                 continue;
365             if (!getIdByName(idx, cache, ss[i]))
366             {
367                 appendValue(tIDs, length(contigNames));
368                 appendName(cache, ss[i]);
369             }
370             else
371             {
372                 appendValue(tIDs, idx);
373             }
374         }
375     }
376 
_readFirstRecord(MyGffRecord & record)377     void _readFirstRecord(MyGffRecord & record)
378     {
379         record.rID = record.INVALID_IDX;  // uninitialized
380 
381         bool found = false;
382         while (!found && !atEnd(gffFileIn))
383         {
384             readRecord(record, gffFileIn);
385 
386             // Translate ref to idx from VCF.
387             unsigned idx = 0;
388             if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
389                 throw MasonIOException("Reference name from GFF/GTF not in VCF!");
390             record.rID = idx;
391 
392             if (empty(options.gffType) || (options.gffType == record.type))
393             {
394                 found = true;
395                 break;
396             }
397         }
398         if (!found)
399             record.rID = std::numeric_limits<int>::max();
400     }
401 };
402 
403 // ==========================================================================
404 // Functions
405 // ==========================================================================
406 
407 // --------------------------------------------------------------------------
408 // Function parseCommandLine()
409 // --------------------------------------------------------------------------
410 
411 seqan::ArgumentParser::ParseResult
parseCommandLine(MasonSplicingOptions & options,int argc,char const ** argv)412 parseCommandLine(MasonSplicingOptions & options, int argc, char const ** argv)
413 {
414     // Setup ArgumentParser.
415     seqan::ArgumentParser parser("mason_splicing");
416     // Set short description, version, and date.
417     setShortDescription(parser, "Generating Transcripts");
418     setDateAndVersion(parser);
419     setCategory(parser, "Simulators");
420 
421     // Define usage line and long description.
422     addUsageLine(parser,
423                  "[OPTIONS] \\fB-ir\\fP \\fIIN.fa\\fP \\fB-ig\\fP \\fIIN.gff\\fP [\\fB-iv\\fP \\fIIN.vcf\\fP] \\fB-o\\fP \\fIOUT.fa\\fP");
424     addDescription(parser,
425                    "Create transcripts from \\fIIN.fa\\fP using the annotations from \\fIIN.gff\\fP.  The resulting "
426                    "transcripts are written to \\fIOUT.fa\\fP.");
427     addDescription(parser,
428                    "You can pass an optional VCF file \\fIIN.vcf\\fP and the transcripts will be created from the "
429                    "haplotypes stored in the VCF file.");
430 
431     // Add option and text sections.
432     options.addOptions(parser);
433     options.addTextSections(parser);
434 
435     // Parse command line.
436     seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
437 
438     // Only extract  options if the program will continue after parseCommandLine()
439     if (res != seqan::ArgumentParser::PARSE_OK)
440         return res;
441 
442     options.getOptionValues(parser);
443 
444     return seqan::ArgumentParser::PARSE_OK;
445 }
446 
447 // --------------------------------------------------------------------------
448 // Function main()
449 // --------------------------------------------------------------------------
450 
451 // Program entry point.
452 
main(int argc,char const ** argv)453 int main(int argc, char const ** argv)
454 {
455     // Parse the command line.
456     MasonSplicingOptions options;
457     seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
458 
459     // If there was an error parsing or built-in argument parser functionality
460     // was triggered then we exit the program.  The return code is 1 if there
461     // were errors and 0 if there were none.
462     if (res != seqan::ArgumentParser::PARSE_OK)
463         return res == seqan::ArgumentParser::PARSE_ERROR;
464 
465     std::cerr << "MASON SPLICING\n"
466               << "==============\n\n";
467 
468     // Print the command line arguments back to the user.
469     if (options.verbosity > 0)
470     {
471         std::cerr << "__OPTIONS____________________________________________________________________\n"
472                   << "\n";
473         options.print(std::cerr);
474     }
475 
476     MasonSplicingApp app(options);
477     return app.run();
478 }
479