1 // ==========================================================================
2 // Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Compute transcripts from a genome and a GFF/GTF file. Optionally, you
35 // can apply a VCF file to the genome before splicing.
36 //
37 // Transcripts must not span structural variants.
38 // ==========================================================================
39
40 // Note: We treat all given variants as phased.
41
42 #include <seqan/arg_parse.h>
43 #include <seqan/basic.h>
44 #include <seqan/seq_io.h>
45 #include <seqan/sequence.h>
46 #include <seqan/vcf_io.h>
47 #include <seqan/gff_io.h>
48
49 #include "vcf_materialization.h"
50 #include "mason_options.h"
51 #include "mason_types.h"
52
53 // ==========================================================================
54 // Classes
55 // ==========================================================================
56
57 // --------------------------------------------------------------------------
58 // Class MyGffRecord
59 // --------------------------------------------------------------------------
60
61 // Subclass of GffRecord that has an rID member.
62
63 class MyGffRecord : public seqan::GffRecord
64 {
65 public:
66 int rID;
67
68 static const int INVALID_IDX;
69
MyGffRecord()70 MyGffRecord() : seqan::GffRecord(), rID(std::numeric_limits<int>::max())
71 {}
72 };
73
74 const int MyGffRecord::INVALID_IDX = std::numeric_limits<int>::max();
75
76 // --------------------------------------------------------------------------
77 // Class SplicingInstruction
78 // --------------------------------------------------------------------------
79
80 // Represents one exon.
81
82 struct SplicingInstruction
83 {
84 // ID of the transcript that this instruction belongs to.
85 int transcriptID;
86 // Begin and end position of the exon.
87 int beginPos, endPos;
88 // The strand of the instruction '-' or '+'.
89 char strand;
90
SplicingInstructionSplicingInstruction91 SplicingInstruction() : transcriptID(-1), beginPos(-1), endPos(-1), strand('.')
92 {}
93
SplicingInstructionSplicingInstruction94 SplicingInstruction(int transcriptID, int beginPos, int endPos, char strand) :
95 transcriptID(transcriptID), beginPos(beginPos), endPos(endPos), strand(strand)
96 {}
97
operator <SplicingInstruction98 bool operator<(SplicingInstruction const & other) const
99 {
100 return std::make_pair(transcriptID, std::make_pair(beginPos, std::make_pair(endPos, strand))) <
101 std::make_pair(other.transcriptID,
102 std::make_pair(other.beginPos, std::make_pair(other.endPos, other.strand)));
103 }
104 };
105
differentTranscript(SplicingInstruction const & lhs,SplicingInstruction const & rhs)106 bool differentTranscript(SplicingInstruction const & lhs, SplicingInstruction const & rhs)
107 {
108 return lhs.transcriptID != rhs.transcriptID;
109 }
110
111 // --------------------------------------------------------------------------
112 // Class MasonSplicingApp
113 // --------------------------------------------------------------------------
114
115 class MasonSplicingApp
116 {
117 public:
118 // The configuration to use.
119 MasonSplicingOptions const & options;
120
121 // The random number generation.
122 TRng rng;
123
124 // Materialization of VCF.
125 VcfMaterializer vcfMat;
126
127 // Input GFF/GTF stream.
128 seqan::GffFileIn gffFileIn;
129
130 // Output sequence stream.
131 seqan::SeqFileOut seqFileOut;
132
MasonSplicingApp(MasonSplicingOptions const & _options)133 MasonSplicingApp(MasonSplicingOptions const & _options) :
134 options(_options), rng(options.seed),
135 vcfMat(rng, toCString(options.matOptions.fastaFileName), toCString(options.matOptions.vcfFileName))
136 {}
137
run()138 int run()
139 {
140 // Intialization
141 std::cerr << "__INITIALIZATION_____________________________________________________________\n"
142 << "\n";
143
144 std::cerr << "Opening files...";
145 try
146 {
147 vcfMat.init();
148
149 if (!open(seqFileOut, toCString(options.outputFileName)))
150 throw MasonIOException("Could not open output file.");
151
152 if (!open(gffFileIn, toCString(options.inputGffFile)))
153 throw MasonIOException("Could not open GFF/GTF file.");
154 }
155 catch (MasonIOException & e)
156 {
157 std::cerr << "\nERROR: " << e.what() << "\n";
158 return 1;
159 }
160 std::cerr << " OK\n";
161
162 // Perform genome simulation.
163 std::cerr << "\n__COMPUTING TRANSCRIPTS______________________________________________________\n"
164 << "\n";
165
166 // Read first GFF record.
167 MyGffRecord record;
168 _readFirstRecord(record);
169 if (record.rID == std::numeric_limits<int>::max())
170 return 0; // at end, could not read any, done
171
172 // Transcript names.
173 typedef seqan::StringSet<seqan::CharString> TNameStore;
174 typedef seqan::NameStoreCache<TNameStore> TNameStoreCache;
175 TNameStore transcriptNames;
176 TNameStoreCache transcriptNamesCache(transcriptNames);
177
178 // The splicing instructions for the current contig.
179 std::vector<SplicingInstruction> splicingInstructions;
180
181 // Materialized sequence.
182 seqan::Dna5String seq;
183 // Tanscript ids, used as a buffer below.
184 seqan::String<unsigned> transcriptIDs;
185
186 // Read GFF/GTF file contig by contig (must be sorted by reference name). For each contig, we all recors,
187 // create simulation instructions and then build the transcripts for each haplotype.
188 while (record.rID != std::numeric_limits<int>::max()) // sentinel, at end
189 {
190 seqan::CharString refName = record.ref;
191 std::cerr << "Splicing for " << refName << " ...";
192
193 // Read GFF records for this contig.
194 MyGffRecord firstGffRecord = record;
195 while (record.rID == firstGffRecord.rID)
196 {
197 if (empty(options.gffType) || (record.type == options.gffType))
198 {
199 // Make transcript names known to the record.
200 _appendTranscriptNames(transcriptIDs, transcriptNames, transcriptNamesCache, record);
201 // Add the splicing instructions for this record to the list for this contig.
202 for (unsigned i = 0; i < length(transcriptIDs); ++i)
203 splicingInstructions.push_back(SplicingInstruction(transcriptIDs[i], record.beginPos,
204 record.endPos, record.strand));
205 }
206
207 if (atEnd(gffFileIn))
208 {
209 record.rID = std::numeric_limits<int>::max();
210 break;
211 }
212
213 readRecord(record, gffFileIn);
214 // Translate ref to idx from VCF.
215 unsigned idx = 0;
216 if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
217 throw MasonIOException("Reference name from GFF/GTF not in VCF!");
218 record.rID = idx;
219 }
220
221 // ---------------------------------------------------------------
222 // Process the splicing instructions.
223 // ---------------------------------------------------------------
224
225 // First, sort them.
226 std::sort(splicingInstructions.begin(), splicingInstructions.end());
227
228 // Materialize all haplotypes of this contig
229 int rID = 0, hID = 0; // reference and haplotype id
230 // Get index of the gff record's reference in the VCF file.
231 unsigned idx = 0;
232 if (!getIdByName(idx, vcfMat.faiIndex, refName))
233 {
234 std::stringstream ss;
235 ss << "Reference from GFF file " << refName << " unknown in FASTA/FAI file.";
236 throw MasonIOException(ss.str());
237 }
238 rID = idx;
239
240 vcfMat.currRID = rID - 1;
241 std::vector<SmallVarInfo> varInfos; // small variants for counting in read alignments
242 std::vector<std::pair<int, int> > breakpoints; // unused/ignored
243 while (vcfMat.materializeNext(seq, varInfos, breakpoints, rID, hID))
244 {
245 std::cerr << " (allele " << (hID + 1) << ")";
246 if (rID != (int)idx)
247 break; // no more haplotypes for this reference
248 _performSplicing(splicingInstructions, seq, transcriptNames, hID, vcfMat);
249 }
250
251 std::cerr << " DONE.\n";
252
253 // ---------------------------------------------------------------
254 // Handle contig switching.
255 // ---------------------------------------------------------------
256
257 // Check that the input GFF file is clustered (weaker than sorted) by reference name.
258 if (record.rID < firstGffRecord.rID)
259 throw MasonIOException("GFF file not sorted or clustered by reference.");
260 // Reset transcript names and cache.
261 clear(transcriptNames);
262 refresh(transcriptNamesCache);
263 // Flush splicing instructions.
264 splicingInstructions.clear();
265 }
266
267 std::cerr << "\nDone splicing FASTA.\n";
268
269 return 0;
270 }
271
272 // Perform splicing of transcripts.
_performSplicing(std::vector<SplicingInstruction> const & instructions,seqan::Dna5String const & seq,seqan::StringSet<seqan::CharString> const & tNames,int hID,VcfMaterializer const & vcfMat)273 void _performSplicing(std::vector<SplicingInstruction> const & instructions,
274 seqan::Dna5String const & seq,
275 seqan::StringSet<seqan::CharString> const & tNames,
276 int hID, // -1 in case of no variants
277 VcfMaterializer const & vcfMat)
278 {
279 typedef std::vector<SplicingInstruction>::const_iterator TIter;
280 TIter it = instructions.begin();
281 TIter itEnd = std::adjacent_find(it, instructions.end(), differentTranscript);
282 if (itEnd != instructions.end())
283 ++itEnd;
284
285 seqan::Dna5String transcript, buffer;
286
287 do
288 {
289 clear(transcript);
290
291 bool onBreakpoint = false;
292 int tID = it->transcriptID;
293 for (; it != itEnd; ++it)
294 {
295 // Convert from original coordinate system to coordinate system with SVs.
296 std::pair<int, int> smallVarInt = vcfMat.posMap.originalToSmallVarInterval(
297 it->beginPos, it->endPos);
298 GenomicInterval gi = vcfMat.posMap.getGenomicIntervalSmallVarPos(smallVarInt.first);
299 SEQAN_ASSERT_GT(smallVarInt.second, 0);
300 GenomicInterval giR = vcfMat.posMap.getGenomicIntervalSmallVarPos(smallVarInt.second - 1);
301 bool overlapsWithBreakpoint = (gi != giR);
302 std::pair<int, int> largeVarInt = vcfMat.posMap.smallVarToLargeVarInterval(
303 smallVarInt.first, smallVarInt.second);
304
305 // Transcripts with exons overlapping breakpoints are not written out.
306 if (overlapsWithBreakpoint)
307 {
308 onBreakpoint = true;
309 break;
310 }
311
312 // Append buffer to transcript in original state or reverse-complemented.
313 buffer = infix(seq, largeVarInt.first, largeVarInt.second);
314 if (it->strand != gi.strand)
315 reverseComplement(buffer);
316 append(transcript, buffer);
317 }
318
319 if (onBreakpoint)
320 {
321 std::cerr << "\nWARNING: Exon lies on breakpoint!\n";
322 while (it != instructions.end() && it->transcriptID == tID)
323 ++it;
324 }
325 else
326 {
327 std::stringstream ss;
328 ss << tNames[tID];
329 if (!empty(options.matOptions.vcfFileName))
330 ss << options.haplotypeNameSep << (hID + 1);
331 writeRecord(seqFileOut, ss.str(), transcript);
332 }
333
334 // Search next range.
335 itEnd = std::adjacent_find(it, instructions.end(), differentTranscript);
336 if (itEnd != instructions.end())
337 ++itEnd;
338 }
339 while (it != instructions.end());
340 }
341
342 // Append the transcript names for the given record.
_appendTranscriptNames(seqan::String<unsigned> & tIDs,seqan::StringSet<seqan::CharString> & contigNames,seqan::NameStoreCache<seqan::StringSet<seqan::CharString>> & cache,MyGffRecord const & record)343 void _appendTranscriptNames(seqan::String<unsigned> & tIDs, // transcript ids to write out
344 seqan::StringSet<seqan::CharString> & contigNames,
345 seqan::NameStoreCache<seqan::StringSet<seqan::CharString> > & cache,
346 MyGffRecord const & record)
347 {
348 clear(tIDs);
349
350 seqan::CharString groupNames;
351 for (unsigned i = 0; i < length(record.tagNames); ++i)
352 if (record.tagNames[i] == options.gffGroupBy)
353 groupNames = record.tagValues[i];
354 if (empty(groupNames))
355 return; // Record has no group names.
356
357 // Write out the ids of the transcripts that the record belongs to as indices in contigNames.
358 unsigned idx = 0;
359 seqan::StringSet<seqan::CharString> ss;
360 strSplit(ss, groupNames, seqan::EqualsChar<','>());
361 for (unsigned i = 0; i < length(ss); ++i)
362 {
363 if (empty(ss[i]))
364 continue;
365 if (!getIdByName(idx, cache, ss[i]))
366 {
367 appendValue(tIDs, length(contigNames));
368 appendName(cache, ss[i]);
369 }
370 else
371 {
372 appendValue(tIDs, idx);
373 }
374 }
375 }
376
_readFirstRecord(MyGffRecord & record)377 void _readFirstRecord(MyGffRecord & record)
378 {
379 record.rID = record.INVALID_IDX; // uninitialized
380
381 bool found = false;
382 while (!found && !atEnd(gffFileIn))
383 {
384 readRecord(record, gffFileIn);
385
386 // Translate ref to idx from VCF.
387 unsigned idx = 0;
388 if (!getIdByName(idx, vcfMat.faiIndex, record.ref))
389 throw MasonIOException("Reference name from GFF/GTF not in VCF!");
390 record.rID = idx;
391
392 if (empty(options.gffType) || (options.gffType == record.type))
393 {
394 found = true;
395 break;
396 }
397 }
398 if (!found)
399 record.rID = std::numeric_limits<int>::max();
400 }
401 };
402
403 // ==========================================================================
404 // Functions
405 // ==========================================================================
406
407 // --------------------------------------------------------------------------
408 // Function parseCommandLine()
409 // --------------------------------------------------------------------------
410
411 seqan::ArgumentParser::ParseResult
parseCommandLine(MasonSplicingOptions & options,int argc,char const ** argv)412 parseCommandLine(MasonSplicingOptions & options, int argc, char const ** argv)
413 {
414 // Setup ArgumentParser.
415 seqan::ArgumentParser parser("mason_splicing");
416 // Set short description, version, and date.
417 setShortDescription(parser, "Generating Transcripts");
418 setDateAndVersion(parser);
419 setCategory(parser, "Simulators");
420
421 // Define usage line and long description.
422 addUsageLine(parser,
423 "[OPTIONS] \\fB-ir\\fP \\fIIN.fa\\fP \\fB-ig\\fP \\fIIN.gff\\fP [\\fB-iv\\fP \\fIIN.vcf\\fP] \\fB-o\\fP \\fIOUT.fa\\fP");
424 addDescription(parser,
425 "Create transcripts from \\fIIN.fa\\fP using the annotations from \\fIIN.gff\\fP. The resulting "
426 "transcripts are written to \\fIOUT.fa\\fP.");
427 addDescription(parser,
428 "You can pass an optional VCF file \\fIIN.vcf\\fP and the transcripts will be created from the "
429 "haplotypes stored in the VCF file.");
430
431 // Add option and text sections.
432 options.addOptions(parser);
433 options.addTextSections(parser);
434
435 // Parse command line.
436 seqan::ArgumentParser::ParseResult res = seqan::parse(parser, argc, argv);
437
438 // Only extract options if the program will continue after parseCommandLine()
439 if (res != seqan::ArgumentParser::PARSE_OK)
440 return res;
441
442 options.getOptionValues(parser);
443
444 return seqan::ArgumentParser::PARSE_OK;
445 }
446
447 // --------------------------------------------------------------------------
448 // Function main()
449 // --------------------------------------------------------------------------
450
451 // Program entry point.
452
main(int argc,char const ** argv)453 int main(int argc, char const ** argv)
454 {
455 // Parse the command line.
456 MasonSplicingOptions options;
457 seqan::ArgumentParser::ParseResult res = parseCommandLine(options, argc, argv);
458
459 // If there was an error parsing or built-in argument parser functionality
460 // was triggered then we exit the program. The return code is 1 if there
461 // were errors and 0 if there were none.
462 if (res != seqan::ArgumentParser::PARSE_OK)
463 return res == seqan::ArgumentParser::PARSE_ERROR;
464
465 std::cerr << "MASON SPLICING\n"
466 << "==============\n\n";
467
468 // Print the command line arguments back to the user.
469 if (options.verbosity > 0)
470 {
471 std::cerr << "__OPTIONS____________________________________________________________________\n"
472 << "\n";
473 options.print(std::cerr);
474 }
475
476 MasonSplicingApp app(options);
477 return app.run();
478 }
479