1 // ==========================================================================
2 // Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Management of fragment/read-to-content distribution.
35 //
36 // Simulating contig-wise saves memory to a comfortable amount and is easy.
37 // We first simulate for the fragments 0..(n-1) from which contig they come
38 // from write their ids to one file per contig. In a second step, we read
39 // contig-wise through the id files and simulate the fragments/reads. In a
40 // final step, we merge the fragments/reads by their id and write them to
41 // the final file.
42 //
43 // This header provides the data structures and routines to manage this.
44 // ==========================================================================
45
46 #ifndef APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
47 #define APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
48
49 #include <vector>
50 #include <iostream>
51
52 #include <seqan/bam_io.h>
53 #include <seqan/seq_io.h>
54
55 #include "mason_types.h"
56
57 // ============================================================================
58 // Forwards
59 // ============================================================================
60
61 inline bool ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,
62 seqan::BamAlignmentRecord const & rhs);
63 inline int strnum_cmp(const char *a, const char *b);
64
65 // ============================================================================
66 // Tags, Classes, Enums
67 // ============================================================================
68
69 // --------------------------------------------------------------------------
70 // Class ContigPicker
71 // --------------------------------------------------------------------------
72
73 // Distribute to contig and haplotypes.
74 //
75 // Contigs are picked with a probability proportional to their length and haplotypes are picked uniformly at random.
76
77 class ContigPicker
78 {
79 public:
80 // The random number generator to use.
81 TRng & rng;
82
83 // The length of the contigs.
84 std::vector<int64_t> lengthSums;
85 // The number of haplotypes.
86 int numHaplotypes;
87
ContigPicker(TRng & rng)88 ContigPicker(TRng & rng) : rng(rng)
89 {}
90
91 // Return a position (contig, haplotype) to distribute the read to.
92 std::pair<int, int> pick();
93
94 // Convert a position (contig, haplotype) to an integer.
toId(std::pair<int,int> pos)95 int toId(std::pair<int, int> pos) const
96 {
97 return pos.first * numHaplotypes + pos.second;
98 }
99 };
100
101 // ----------------------------------------------------------------------------
102 // Class IdSplitter
103 // ----------------------------------------------------------------------------
104
105 // Allows distributing ids from/to files.
106 //
107 // General protocol:
108 //
109 // * construct
110 // * open()
111 // * write ids, splitting
112 // * reset()
113 // * read ids, contig-wise
114 // * close()
115
116 // TODO(holtgrew): Name bogus, FileBundle would be better.
117
118 class IdSplitter
119 {
120 public:
121 // The number of contigs to split to.
122 unsigned numContigs;
123
124 // The file pointers for each contig.
125 std::vector<std::fstream *> files;
126 // The names of the temporary files (required on Windows).
127 std::vector<std::string> fileNames;
128
IdSplitter()129 IdSplitter() : numContigs(0)
130 {}
131
IdSplitter(unsigned numContigs)132 IdSplitter(unsigned numContigs) : numContigs(numContigs)
133 {}
134
~IdSplitter()135 ~IdSplitter()
136 {
137 close();
138 }
139
140 // Open files in the splitter.
141 void open();
142
143 // Reset all files in the splitter, ready for reading.
144 void reset();
145
146 // Close splitter.
147 void close();
148 };
149
150 // ----------------------------------------------------------------------------
151 // Class FastxJoiner
152 // ----------------------------------------------------------------------------
153
154 // Allows joining by id name from FASTA data stored in a IdSplitter.
155 //
156 // Construct with IdSplitter after reset() call.
157
158 // TODO(holtgrew): Could use a heap/tournament tree.
159
160 template <typename TTag>
161 class FastxJoiner
162 {
163 public:
164 // The type of the input iterator to use.
165 typedef typename seqan::DirectionIterator<std::fstream, seqan::Input>::Type TInputIterator;
166
167 // The IdSplitter to use.
168 IdSplitter * splitter;
169 // Number of active files.
170 unsigned numActive;
171 // Buffer for id and sequence for each input file.
172 seqan::StringSet<seqan::CharString> ids, seqs, quals;
173 // Maps files for activeness.
174 std::vector<bool> active;
175 // Input iterators, one for each input file.
176 std::vector<TInputIterator> inputIterators;
177
FastxJoiner()178 FastxJoiner() : splitter(), numActive(0)
179 {}
180
FastxJoiner(IdSplitter & splitter)181 FastxJoiner(IdSplitter & splitter) : splitter(&splitter), numActive(0)
182 {
183 _init();
184 }
185
186 void _init();
187
188 template <typename TSeq>
189 bool _loadNext(TSeq & id, TSeq & seq, TSeq & qual, unsigned idx);
190
atEnd()191 bool atEnd() const
192 {
193 return (numActive == 0);
194 }
195
196 int get(seqan::CharString & id, seqan::CharString & seq, seqan::CharString & qual);
197 };
198
199 // ----------------------------------------------------------------------------
200 // Class SamJoiner
201 // ----------------------------------------------------------------------------
202
203 // Allows joining by id name from FASTA data stored in a IdSplitter.
204 //
205 // Construct with IdSplitter after reset() call.
206
207 // TODO(holtgrew): Could use a heap/tournament tree.
208
209 // Compare two BAM alignment records by query name, tie is broken by first/last flag, first < last.
210
211 class SamJoiner
212 {
213 public:
214 // The IdSplitter to use.
215 IdSplitter * splitter;
216 // Number of active files.
217 unsigned numActive;
218 // Buffer for id and sequence for each input file.
219 seqan::String<seqan::BamAlignmentRecord> records;
220 // Maps files for activeness.
221 std::vector<bool> active;
222 // Input BAM files, one for each input file.
223 std::vector<seqan::BamFileIn *> bamFileIns;
224
225 // One of the identical BAM headers.
226 seqan::BamHeader header;
227
SamJoiner()228 SamJoiner() : splitter(), numActive(0)
229 {}
230
SamJoiner(IdSplitter & splitter,seqan::BamFileOut * outPtr)231 SamJoiner(IdSplitter & splitter, seqan::BamFileOut * outPtr) :
232 splitter(&splitter), numActive(0)
233 {
234 init(outPtr);
235 }
236
~SamJoiner()237 ~SamJoiner()
238 {
239 for (unsigned i = 0; i < bamFileIns.size(); ++i)
240 delete bamFileIns[i];
241 }
242
243 void init(seqan::BamFileOut * outPtr);
244
245 bool _loadNext(seqan::BamAlignmentRecord & record, unsigned idx);
246
atEnd()247 bool atEnd() const
248 {
249 return (numActive == 0);
250 }
251
252 // Get next BAM alignment record to lhs. If it is paired-end, load the second mate as well.
253 int get(seqan::BamAlignmentRecord & record);
254 };
255
256 // ============================================================================
257 // Metafunctions
258 // ============================================================================
259
260 // ============================================================================
261 // Functions
262 // ============================================================================
263
264 // ----------------------------------------------------------------------------
265 // Function FastxJoiner::_init()
266 // ----------------------------------------------------------------------------
267
268 template <typename TTag>
_init()269 void FastxJoiner<TTag>::_init()
270 {
271 resize(ids, splitter->files.size());
272 resize(seqs, splitter->files.size());
273 resize(quals, splitter->files.size());
274 active.resize(splitter->files.size());
275
276 for (unsigned i = 0; i < splitter->files.size(); ++i)
277 {
278 inputIterators.push_back(directionIterator(*splitter->files[i], seqan::Input()));
279 active[i] = _loadNext(ids[i], seqs[i], quals[i], i);
280 numActive += (active[i] != false);
281 }
282 }
283
284 // ----------------------------------------------------------------------------
285 // Function FastxJoiner::_loadNext()
286 // ----------------------------------------------------------------------------
287
288 template <typename TTag>
289 template <typename TSeq>
_loadNext(TSeq & id,TSeq & seq,TSeq & qual,unsigned idx)290 bool FastxJoiner<TTag>::_loadNext(TSeq & id, TSeq & seq, TSeq & qual, unsigned idx)
291 {
292 if (seqan::atEnd(inputIterators[idx]))
293 return false;
294 readRecord(id, seq, qual, inputIterators[idx], TTag());
295 return true;
296 }
297
298 // ----------------------------------------------------------------------------
299 // Function FastxJoiner::get()
300 // ----------------------------------------------------------------------------
301
302 template <typename TTag>
get(seqan::CharString & id,seqan::CharString & seq,seqan::CharString & qual)303 int FastxJoiner<TTag>::get(seqan::CharString & id, seqan::CharString & seq, seqan::CharString & qual)
304 {
305 unsigned idx = std::numeric_limits<unsigned>::max();
306 for (unsigned i = 0; i < length(ids); ++i)
307 {
308 if (!active[i])
309 continue;
310 if (idx == std::numeric_limits<unsigned>::max() || strnum_cmp(toCString(ids[i]), toCString(ids[idx])) < 0)
311 idx = i;
312 }
313 if (idx == std::numeric_limits<unsigned>::max())
314 return 1;
315
316 // We use double-buffering and the input parameters as buffers.
317 active[idx] = _loadNext(id, seq, qual, idx);
318 swap(id, ids[idx]);
319 swap(seq, seqs[idx]);
320 swap(qual, quals[idx]);
321 numActive -= !active[idx];
322
323 return 0;
324 }
325
326 // ----------------------------------------------------------------------------
327 // Function ltBamAlignmentRecord()
328 // ----------------------------------------------------------------------------
329
330 // Original comparison function for strings by Heng Li from samtools.
331
332 /* The MIT License
333
334 Copyright (c) 2008-2018 Genome Research Ltd (GRL).
335
336 Permission is hereby granted, free of charge, to any person obtaining
337 a copy of this software and associated documentation files (the
338 "Software"), to deal in the Software without restriction, including
339 without limitation the rights to use, copy, modify, merge, publish,
340 distribute, sublicense, and/or sell copies of the Software, and to
341 permit persons to whom the Software is furnished to do so, subject to
342 the following conditions:
343
344 The above copyright notice and this permission notice shall be
345 included in all copies or substantial portions of the Software.
346
347 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
348 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
349 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
350 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
351 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
352 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
353 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
354 SOFTWARE.
355 */
356
strnum_cmp(const char * a,const char * b)357 inline int strnum_cmp(const char *a, const char *b)
358 {
359 char *pa, *pb;
360 pa = (char*)a; pb = (char*)b;
361 while (*pa && *pb) {
362 if (isdigit(*pa) && isdigit(*pb)) {
363 long ai, bi;
364 ai = strtol(pa, &pa, 10);
365 bi = strtol(pb, &pb, 10);
366 if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;
367 } else {
368 if (*pa != *pb) break;
369 ++pa; ++pb;
370 }
371 }
372 if (*pa == *pb)
373 return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
374 return *pa<*pb? -1 : *pa>*pb? 1 : 0;
375 }
376
ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,seqan::BamAlignmentRecord const & rhs)377 inline bool ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,
378 seqan::BamAlignmentRecord const & rhs)
379 {
380 int res = strnum_cmp(toCString(lhs.qName), toCString(rhs.qName));
381 return (res < 0) || (res == 0 && hasFlagFirst(lhs));
382 }
383
384 #endif // #ifndef APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
385