1 // ==========================================================================
2 //                         Mason - A Read Simulator
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // ==========================================================================
34 
35 #include "external_split_merge.h"
36 
37 // ---------------------------------------------------------------------------
38 // Function IdSplitter::open()
39 // ---------------------------------------------------------------------------
40 
open()41 void IdSplitter::open()
42 {
43     close();
44     for (unsigned i = 0; i < numContigs; ++i)
45     {
46 #if defined(STDLIB_VS)
47         char fileNameBuffer[1000];
48         char filePathBuffer[1000];
49         //  Gets the temp path env string (no guarantee it's a valid path).
50         DWORD dwRetVal = 0;
51         dwRetVal = GetTempPath(1000,            // length of the buffer
52                                filePathBuffer); // buffer for path
53         if (dwRetVal > 1000 || (dwRetVal == 0))
54         {
55             std::cerr << "GetTempPath failed" << std::endl;
56             exit(1);
57         }
58 
59         UINT uRetVal   = 0;
60         uRetVal = GetTempFileName(filePathBuffer,   // directory for tmp files
61                                   TEXT("MASON_"),   // temp file name prefix
62                                   0,                // create unique name
63                                   fileNameBuffer);  // buffer for name
64 
65         if (uRetVal == 0)
66         {
67             std::cerr << "GetTempFileName failed" << std::endl;
68             exit(1);
69         }
70 
71         // The file exists now and is already closed.
72         DeleteFile(fileNameBuffer);
73         files.push_back(new std::fstream(&fileNameBuffer[0], std::ios::binary | std::ios::in | std::ios::out));
74         // TODO: Deleting the file here will not actually remove it on windows.
75         fileNames.push_back(fileNameBuffer);
76 #else  // POSIX (Linux/Mac)
77         // Create temporary file using POSIX API, open with <cstdio>, delete, close POSIX.
78         char const * tmpdir = 0;
79         if ((tmpdir = getenv("TMPDIR")) == NULL)
80             tmpdir = "/tmp";
81         std::string pathTpl = tmpdir;
82         pathTpl += "/MASON_XXXXXX";
83 
84         mode_t cur_umask = umask(S_IRWXO | S_IRWXG);  // to silence Coverity warning
85         int fd = mkstemp(&pathTpl[0]);
86         files.push_back(new std::fstream(pathTpl.c_str(), std::ios::binary | std::ios::in | std::ios::out));
87         umask(cur_umask);
88         remove(pathTpl.c_str());
89         ::close(fd);
90 #endif
91 
92         if (!files.back())
93         {
94             std::cerr << "ERROR: Could not open temporary file!\n";
95             exit(1);
96         }
97     }
98 }
99 
100 // ---------------------------------------------------------------------------
101 // Function IdSplitter::reset()
102 // ---------------------------------------------------------------------------
103 
reset()104 void IdSplitter::reset()
105 {
106     for (unsigned i = 0; i < files.size(); ++i)
107         if (files[i] != 0)
108         {
109             SEQAN_ASSERT(files[i]->good());
110             files[i]->flush();
111             files[i]->seekg(0);
112             SEQAN_ASSERT(files[i]->good());
113         }
114 }
115 
116 // ---------------------------------------------------------------------------
117 // Function IdSplitter::close()
118 // ---------------------------------------------------------------------------
119 
close()120 void IdSplitter::close()
121 {
122     for (unsigned i = 0; i < files.size(); ++i)
123         if (files[i])
124         {
125             delete files[i];
126 #ifdef STDLIB_VS
127             DeleteFile(fileNames[i].c_str());
128 #endif  // #ifdef STDLIB_VS
129             files[i] = 0;
130         }
131     files.clear();
132     fileNames.clear();
133 }
134 
135 // ---------------------------------------------------------------------------
136 // Function SamJoiner::init()
137 // ---------------------------------------------------------------------------
138 
init(seqan::BamFileOut * outPtr)139 void SamJoiner::init(seqan::BamFileOut * outPtr)
140 {
141     resize(records, splitter->files.size());
142     active.resize(splitter->files.size());
143 
144     for (unsigned i = 0; i < splitter->files.size(); ++i)
145     {
146         if (i == 0u && outPtr)
147             bamFileIns.push_back(new seqan::BamFileIn(*outPtr, *splitter->files[i]));
148         else
149             bamFileIns.push_back(new seqan::BamFileIn(*splitter->files[i]));
150 
151         // We use a separate header structure and name stores and caches.  Since the headers of all files are equal, we
152         // will write out the first one only.
153         seqan::BamHeader tmpHeader;
154         readHeader(tmpHeader, *bamFileIns[i]);
155         if (i == 0u)
156             header = tmpHeader;
157 
158         active[i] = _loadNext(records[i], i);
159         numActive += (active[i] != false);
160     }
161 }
162 
163 // ---------------------------------------------------------------------------
164 // Function SamJoiner::_loadNext()
165 // ---------------------------------------------------------------------------
166 
_loadNext(seqan::BamAlignmentRecord & record,unsigned idx)167 bool SamJoiner::_loadNext(seqan::BamAlignmentRecord & record, unsigned idx)
168 {
169     if (seqan::atEnd(*bamFileIns[idx]))
170         return false;
171     readRecord(record, *bamFileIns[idx]);
172     return true;
173 }
174 
175 // ---------------------------------------------------------------------------
176 // Function SamJoiner::get()
177 // ---------------------------------------------------------------------------
178 
get(seqan::BamAlignmentRecord & record)179 int SamJoiner::get(seqan::BamAlignmentRecord & record)
180 {
181     unsigned idx = std::numeric_limits<unsigned>::max();
182     for (unsigned i = 0; i < length(records); ++i)
183     {
184         if (!active[i])
185             continue;
186         if (idx == std::numeric_limits<unsigned>::max() || ltBamAlignmentRecord(records[i], records[idx]))
187             idx = i;
188     }
189     if (idx == std::numeric_limits<unsigned>::max())
190         return 1;
191 
192     // We use double-buffering and the input parameters as buffers.
193     using std::swap;
194     active[idx] = _loadNext(record, idx);
195     swap(record, records[idx]);
196     numActive -= !active[idx];
197 
198     return 0;
199 }
200 
201 // ---------------------------------------------------------------------------
202 // Function ContigPicker::pick()
203 // ---------------------------------------------------------------------------
204 
pick()205 std::pair<int, int> ContigPicker::pick()
206 {
207     // Pick reference id.
208     int rID = 0;
209     if (lengthSums.size() > 1u)
210     {
211         std::uniform_int_distribution<int64_t> dist(0, lengthSums.back() - 1);
212         int64_t x = dist(rng);
213         for (unsigned i = 0; i < lengthSums.size(); ++i)
214         {
215             if (x >= lengthSums[i])
216                 rID = i + 1;
217             if (x < lengthSums[i])
218                 break;
219         }
220     }
221 
222     // Pick haplotype id.
223     std::uniform_int_distribution<int> dist2(0, numHaplotypes - 1);
224     int hID = dist2(rng);
225 
226     return std::make_pair(rID, hID);
227 }
228