1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 // Management of fragment/read-to-content distribution.
35 //
36 // Simulating contig-wise saves memory to a comfortable amount and is easy.
37 // We first simulate for the fragments 0..(n-1) from which contig they come
38 // from write their ids to one file per contig.  In a second step, we read
39 // contig-wise through the id files and simulate the fragments/reads.  In a
40 // final step, we merge the fragments/reads by their id and write them to
41 // the final file.
42 //
43 // This header provides the data structures and routines to manage this.
44 // ==========================================================================
45 
46 #ifndef APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
47 #define APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
48 
49 #include <vector>
50 #include <iostream>
51 
52 #include <seqan/bam_io.h>
53 #include <seqan/seq_io.h>
54 
55 #include "mason_types.h"
56 
57 // ============================================================================
58 // Forwards
59 // ============================================================================
60 
61 inline bool ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,
62                                  seqan::BamAlignmentRecord const & rhs);
63 inline int strnum_cmp(const char *a, const char *b);
64 
65 // ============================================================================
66 // Tags, Classes, Enums
67 // ============================================================================
68 
69 // --------------------------------------------------------------------------
70 // Class ContigPicker
71 // --------------------------------------------------------------------------
72 
73 // Distribute to contig and haplotypes.
74 //
75 // Contigs are picked with a probability proportional to their length and haplotypes are picked uniformly at random.
76 
77 class ContigPicker
78 {
79 public:
80     // The random number generator to use.
81     TRng & rng;
82 
83     // The length of the contigs.
84     std::vector<int64_t> lengthSums;
85     // The number of haplotypes.
86     int numHaplotypes;
87 
ContigPicker(TRng & rng)88     ContigPicker(TRng & rng) : rng(rng)
89     {}
90 
91     // Return a position (contig, haplotype) to distribute the read to.
92     std::pair<int, int> pick();
93 
94     // Convert a position (contig, haplotype) to an integer.
toId(std::pair<int,int> pos)95     int toId(std::pair<int, int> pos) const
96     {
97         return pos.first * numHaplotypes + pos.second;
98     }
99 };
100 
101 // ----------------------------------------------------------------------------
102 // Class IdSplitter
103 // ----------------------------------------------------------------------------
104 
105 // Allows distributing ids from/to files.
106 //
107 // General protocol:
108 //
109 // * construct
110 // * open()
111 // * write ids, splitting
112 // * reset()
113 // * read ids, contig-wise
114 // * close()
115 
116 // TODO(holtgrew): Name bogus, FileBundle would be better.
117 
118 class IdSplitter
119 {
120 public:
121     // The number of contigs to split to.
122     unsigned numContigs;
123 
124     // The file pointers for each contig.
125     std::vector<std::fstream *> files;
126     // The names of the temporary files (required on Windows).
127     std::vector<std::string> fileNames;
128 
IdSplitter()129     IdSplitter() : numContigs(0)
130     {}
131 
IdSplitter(unsigned numContigs)132     IdSplitter(unsigned numContigs) : numContigs(numContigs)
133     {}
134 
~IdSplitter()135     ~IdSplitter()
136     {
137         close();
138     }
139 
140     // Open files in the splitter.
141     void open();
142 
143     // Reset all files in the splitter, ready for reading.
144     void reset();
145 
146     // Close splitter.
147     void close();
148 };
149 
150 // ----------------------------------------------------------------------------
151 // Class FastxJoiner
152 // ----------------------------------------------------------------------------
153 
154 // Allows joining by id name from FASTA data stored in a IdSplitter.
155 //
156 // Construct with IdSplitter after reset() call.
157 
158 // TODO(holtgrew): Could use a heap/tournament tree.
159 
160 template <typename TTag>
161 class FastxJoiner
162 {
163 public:
164     // The type of the input iterator to use.
165     typedef typename seqan::DirectionIterator<std::fstream, seqan::Input>::Type TInputIterator;
166 
167     // The IdSplitter to use.
168     IdSplitter * splitter;
169     // Number of active files.
170     unsigned numActive;
171     // Buffer for id and sequence for each input file.
172     seqan::StringSet<seqan::CharString> ids, seqs, quals;
173     // Maps files for activeness.
174     std::vector<bool> active;
175     // Input iterators, one for each input file.
176     std::vector<TInputIterator> inputIterators;
177 
FastxJoiner()178     FastxJoiner() : splitter(), numActive(0)
179     {}
180 
FastxJoiner(IdSplitter & splitter)181     FastxJoiner(IdSplitter & splitter) : splitter(&splitter), numActive(0)
182     {
183         _init();
184     }
185 
186     void _init();
187 
188     template <typename TSeq>
189     bool _loadNext(TSeq & id, TSeq & seq, TSeq & qual, unsigned idx);
190 
atEnd()191     bool atEnd() const
192     {
193         return (numActive == 0);
194     }
195 
196     int get(seqan::CharString & id, seqan::CharString & seq, seqan::CharString & qual);
197 };
198 
199 // ----------------------------------------------------------------------------
200 // Class SamJoiner
201 // ----------------------------------------------------------------------------
202 
203 // Allows joining by id name from FASTA data stored in a IdSplitter.
204 //
205 // Construct with IdSplitter after reset() call.
206 
207 // TODO(holtgrew): Could use a heap/tournament tree.
208 
209 // Compare two BAM alignment records by query name, tie is broken by first/last flag, first < last.
210 
211 class SamJoiner
212 {
213 public:
214     // The IdSplitter to use.
215     IdSplitter * splitter;
216     // Number of active files.
217     unsigned numActive;
218     // Buffer for id and sequence for each input file.
219     seqan::String<seqan::BamAlignmentRecord> records;
220     // Maps files for activeness.
221     std::vector<bool> active;
222     // Input BAM files, one for each input file.
223     std::vector<seqan::BamFileIn *> bamFileIns;
224 
225     // One of the identical BAM headers.
226     seqan::BamHeader header;
227 
SamJoiner()228     SamJoiner() : splitter(), numActive(0)
229     {}
230 
SamJoiner(IdSplitter & splitter,seqan::BamFileOut * outPtr)231     SamJoiner(IdSplitter & splitter, seqan::BamFileOut * outPtr) :
232             splitter(&splitter), numActive(0)
233     {
234         init(outPtr);
235     }
236 
~SamJoiner()237     ~SamJoiner()
238     {
239         for (unsigned i = 0; i < bamFileIns.size(); ++i)
240             delete bamFileIns[i];
241     }
242 
243     void init(seqan::BamFileOut * outPtr);
244 
245     bool _loadNext(seqan::BamAlignmentRecord & record, unsigned idx);
246 
atEnd()247     bool atEnd() const
248     {
249         return (numActive == 0);
250     }
251 
252     // Get next BAM alignment record to lhs.  If it is paired-end, load the second mate as well.
253     int get(seqan::BamAlignmentRecord & record);
254 };
255 
256 // ============================================================================
257 // Metafunctions
258 // ============================================================================
259 
260 // ============================================================================
261 // Functions
262 // ============================================================================
263 
264 // ----------------------------------------------------------------------------
265 // Function FastxJoiner::_init()
266 // ----------------------------------------------------------------------------
267 
268 template <typename TTag>
_init()269 void FastxJoiner<TTag>::_init()
270 {
271     resize(ids, splitter->files.size());
272     resize(seqs, splitter->files.size());
273     resize(quals, splitter->files.size());
274     active.resize(splitter->files.size());
275 
276     for (unsigned i = 0; i < splitter->files.size(); ++i)
277     {
278         inputIterators.push_back(directionIterator(*splitter->files[i], seqan::Input()));
279         active[i] = _loadNext(ids[i], seqs[i], quals[i], i);
280         numActive += (active[i] != false);
281     }
282 }
283 
284 // ----------------------------------------------------------------------------
285 // Function FastxJoiner::_loadNext()
286 // ----------------------------------------------------------------------------
287 
288 template <typename TTag>
289 template <typename TSeq>
_loadNext(TSeq & id,TSeq & seq,TSeq & qual,unsigned idx)290 bool FastxJoiner<TTag>::_loadNext(TSeq & id, TSeq & seq, TSeq & qual, unsigned idx)
291 {
292     if (seqan::atEnd(inputIterators[idx]))
293         return false;
294     readRecord(id, seq, qual, inputIterators[idx], TTag());
295     return true;
296 }
297 
298 // ----------------------------------------------------------------------------
299 // Function FastxJoiner::get()
300 // ----------------------------------------------------------------------------
301 
302 template <typename TTag>
get(seqan::CharString & id,seqan::CharString & seq,seqan::CharString & qual)303 int FastxJoiner<TTag>::get(seqan::CharString & id, seqan::CharString & seq, seqan::CharString & qual)
304 {
305     unsigned idx = std::numeric_limits<unsigned>::max();
306     for (unsigned i = 0; i < length(ids); ++i)
307     {
308         if (!active[i])
309             continue;
310         if (idx == std::numeric_limits<unsigned>::max() || strnum_cmp(toCString(ids[i]), toCString(ids[idx])) < 0)
311             idx = i;
312     }
313     if (idx == std::numeric_limits<unsigned>::max())
314         return 1;
315 
316     // We use double-buffering and the input parameters as buffers.
317     active[idx] = _loadNext(id, seq, qual, idx);
318     swap(id, ids[idx]);
319     swap(seq, seqs[idx]);
320     swap(qual, quals[idx]);
321     numActive -= !active[idx];
322 
323     return 0;
324 }
325 
326 // ----------------------------------------------------------------------------
327 // Function ltBamAlignmentRecord()
328 // ----------------------------------------------------------------------------
329 
330 // Original comparison function for strings by Heng Li from samtools.
331 
332 /* The MIT License
333 
334    Copyright (c) 2008-2018 Genome Research Ltd (GRL).
335 
336    Permission is hereby granted, free of charge, to any person obtaining
337    a copy of this software and associated documentation files (the
338    "Software"), to deal in the Software without restriction, including
339    without limitation the rights to use, copy, modify, merge, publish,
340    distribute, sublicense, and/or sell copies of the Software, and to
341    permit persons to whom the Software is furnished to do so, subject to
342    the following conditions:
343 
344    The above copyright notice and this permission notice shall be
345    included in all copies or substantial portions of the Software.
346 
347    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
348    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
349    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
350    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
351    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
352    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
353    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
354    SOFTWARE.
355 */
356 
strnum_cmp(const char * a,const char * b)357 inline int strnum_cmp(const char *a, const char *b)
358 {
359     char *pa, *pb;
360     pa = (char*)a; pb = (char*)b;
361     while (*pa && *pb) {
362         if (isdigit(*pa) && isdigit(*pb)) {
363             long ai, bi;
364             ai = strtol(pa, &pa, 10);
365             bi = strtol(pb, &pb, 10);
366             if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0;
367         } else {
368             if (*pa != *pb) break;
369             ++pa; ++pb;
370         }
371     }
372     if (*pa == *pb)
373         return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0;
374     return *pa<*pb? -1 : *pa>*pb? 1 : 0;
375 }
376 
ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,seqan::BamAlignmentRecord const & rhs)377 inline bool ltBamAlignmentRecord(seqan::BamAlignmentRecord const & lhs,
378                                  seqan::BamAlignmentRecord const & rhs)
379 {
380     int res = strnum_cmp(toCString(lhs.qName), toCString(rhs.qName));
381     return (res < 0) || (res == 0 && hasFlagFirst(lhs));
382 }
383 
384 #endif  // #ifndef APPS_MASON2_EXTERNAL_SPLIT_MERGE_H_
385