1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // Author: David Weese <david.weese@fu-berlin.de>
34 // ==========================================================================
35 // A simple converter between SAM and BAM. BAM files can be dumped region-
36 // wise, using a .bai index.
37 // ==========================================================================
38
39 #include <iostream>
40 #include <fstream>
41
42 #include <seqan/basic.h>
43 #include <seqan/sequence.h>
44 #include <seqan/file.h> // For printing SeqAn Strings.
45 #include <seqan/arg_parse.h>
46 #include <seqan/seq_io.h>
47 #include <seqan/bam_io.h>
48 #include <seqan/stream.h>
49
50 using namespace seqan;
51
52 struct Options
53 {
54 bool showHelp;
55 bool showVersion;
56 CharString inFile;
57 CharString baiFile;
58 CharString outFile;
59 unsigned verbosity;
60 bool bamFormat;
61 String<GenomicRegion> regions; // <(refName, beginPos, endPos)>
62
OptionsOptions63 Options()
64 {
65 showHelp = false;
66 showVersion = false;
67 verbosity = 1;
68 bamFormat = false;
69 }
70
71 };
72
73 void
setupArgumentParser(ArgumentParser & parser,Options const & options)74 setupArgumentParser(ArgumentParser & parser, Options const & options)
75 {
76 setVersion(parser, "1.0");
77
78 addDescription(parser, "***********");
79 addDescription(parser, "* BAMUTIL *");
80 addDescription(parser, "***********");
81 addDescription(parser, "");
82 addDescription(parser, "SeqAn SAM/BAM Utility.");
83 addDescription(parser, "");
84 addDescription(parser, "Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>");
85
86 addUsageLine(parser, "bamutil [OPTIONS] <IN >OUT");
87
88 addSection(parser, "General Options");
89 addOption(parser, ArgParseOption("v", "verbose", "Enable verbose mode."));
90 addOption(parser, ArgParseOption("vv", "very-verbose", "Enable very verbose mode."));
91
92 addSection(parser, "Input Specification");
93 addOption(parser, ArgParseOption("i", "input-file", "Path to input, '-' for stdin.", ArgParseOption::STRING));
94 addOption(parser, ArgParseOption("bi", "bai-index-file", "Path to BAI index, default: input + '.bai'.", ArgParseOption::STRING));
95
96 addSection(parser, "Range Specification");
97 addOption(parser, ArgParseOption("r", "region", "Regions to dump (in which aligments start). REF:FROM-TO, e.g. IV:1,000-2,000 will alignments with ref IV in range [1000,2000) (zero based).", ArgParseOption::STRING));
98
99 addSection(parser, "Output Specification");
100 addOption(parser, ArgParseOption("o", "output-file", "Path to output, '-' for stdout.", ArgParseOption::STRING));
101 addOption(parser, ArgParseOption("b", "output-bam", "Output file is BAM."));
102 }
103
104 // Parse the command line and check for any syntatical errors.
105
106 ArgumentParser::ParseResult
parseArguments(Options & options,ArgumentParser & parser,int argc,char const ** argv)107 parseArguments(Options & options,
108 ArgumentParser & parser,
109 int argc,
110 char const ** argv)
111 {
112 ArgumentParser::ParseResult res = parse(parser, argc, argv);
113
114 if (res != ArgumentParser::PARSE_OK)
115 return res;
116
117 if (isSet(parser, "verbose"))
118 options.verbosity = 2;
119 if (isSet(parser, "very-verbose"))
120 options.verbosity = 3;
121
122 getOptionValue(options.inFile, parser, "input-file");
123 getOptionValue(options.baiFile, parser, "bai-index-file");
124 if (empty(options.baiFile) && !empty(options.inFile))
125 {
126 options.baiFile = options.inFile;
127 append(options.baiFile, ".bai");
128 }
129
130 getOptionValue(options.outFile, parser, "output-file");
131 options.bamFormat = isSet(parser, "output-bam");
132 if (isSet(parser, "region"))
133 {
134 String<CharString> regions = getOptionValues(parser, "region");
135 GenomicRegion region;
136 for (unsigned i = 0; i < length(regions); ++i)
137 {
138 try
139 {
140 parse(region, regions[i]);
141 region.beginPos++;
142 appendValue(options.regions, region);
143 if (options.verbosity >= 2)
144 std::cerr << "[VERBOSE] Region " << region.seqName << ":"
145 << region.beginPos << "-" << region.endPos << std::endl;
146 }
147 catch (ParseError & e)
148 {
149 std::cerr << "[WARNING] could not parse region \"" << regions[i] << "\". IGNORING." << std::endl;
150 }
151 }
152 }
153
154 return ArgumentParser::PARSE_OK;
155 }
156
157 template <typename TOptions>
_dumpRegion(BamFileIn & in,BamFileOut & out,TOptions const & options)158 int _dumpRegion(BamFileIn & in, BamFileOut & out, TOptions const & options)
159 {
160 // TOOD(holtgrew): The index is loaded for each region. This should probably not be the case!
161
162 // Dump each region after loading the index.
163 BamIndex<Bai> bamIndex;
164 if (!open(bamIndex, toCString(options.baiFile)))
165 {
166 std::cerr << "[ERROR] Could not open index file " << options.baiFile << ", required when specifying regions." << std::endl;
167 return 1;
168 }
169
170 for (unsigned i = 0; i < length(options.regions); ++i)
171 {
172 // Jump near range.
173 CharString refName = options.regions[i].seqName;
174 unsigned refId = 0;
175 if (!getIdByName(refId, contigNamesCache(context(in)), refName))
176 {
177 std::cerr << "[ERROR] Unknown reference " << refName << std::endl;
178 return 1;
179 }
180 bool hasAlignments = false;
181 int beginPos = options.regions[i].beginPos;
182 int endPos = options.regions[i].endPos;
183 if (!jumpToRegion(in, hasAlignments, refId, beginPos, endPos, bamIndex))
184 {
185 std::cerr << "[ERROR] Could not jump to " << refName << ":" << beginPos << "-" << endPos << std::endl;
186 return 1;
187 }
188 if (!hasAlignments)
189 continue;
190 // Dump range.
191 BamAlignmentRecord record;
192 while (!atEnd(in))
193 {
194 readRecord(record, in);
195 if (record.beginPos < options.regions[i].beginPos)
196 continue; // Skip, before region.
197 if (record.beginPos >= options.regions[i].endPos)
198 return 0;
199
200 writeRecord(out, record);
201 }
202 }
203 return 0;
204 }
205
206 template <typename TOptions>
performConversion(BamFileIn & in,BamFileOut & out,TOptions const & options)207 int performConversion(BamFileIn & in, BamFileOut & out, TOptions const & options)
208 {
209 BamHeader header;
210 readHeader(header, in);
211 writeHeader(out, header);
212
213 if (length(options.regions) == 0u)
214 {
215 // Simply dump whole file.
216 BamAlignmentRecord record;
217 while (!atEnd(in))
218 {
219 readRecord(record, in);
220 writeRecord(out, record);
221 }
222 return 0;
223 }
224 else
225 {
226 return _dumpRegion(in, out, options);
227 }
228 }
229
230 // The main function.
231 //
232 // Don't be intimidated by its longness, most of the code is for branching the different options to types.
233
main(int argc,char const * argv[])234 int main(int argc, char const * argv[])
235 {
236 // -----------------------------------------------------------------------
237 // Handle Command Line
238 // -----------------------------------------------------------------------
239
240 // Setup command line parser.
241 ArgumentParser parser;
242 Options options;
243 setupArgumentParser(parser, options);
244
245 // Then, parse the command line and handle the cases where help display
246 // is requested or erroneous parameters were given.
247 ArgumentParser::ParseResult res = parseArguments(options, parser, argc, argv);
248
249 if (res != ArgumentParser::PARSE_OK)
250 return res == ArgumentParser::PARSE_ERROR;
251
252 // -----------------------------------------------------------------------
253 // Open Input / Output Files.
254 // -----------------------------------------------------------------------
255
256 BamFileIn reader;
257 BamFileOut writer(reader);
258 bool success = true;
259
260 if (!empty(options.inFile))
261 success = open(reader, toCString(options.inFile));
262 else
263 success = open(reader, std::cin);
264
265 if (!success)
266 {
267 std::cerr << "Couldn't open " << options.inFile << " for reading." << std::endl;
268 return 1;
269 }
270
271 if (!empty(options.outFile))
272 success = open(writer, toCString(options.outFile));
273 else
274 {
275 // write to stdout
276 #if SEQAN_HAS_ZLIB
277 if (options.bamFormat)
278 success = open(writer, std::cout, Bam());
279 else
280 #endif
281 success = open(writer, std::cout, Sam());
282 }
283
284 if (!success)
285 {
286 std::cerr << "Couldn't open " << options.outFile << " for writing." << std::endl;
287 return 1;
288 }
289
290 // (weese:) We go through no pain here anymore.
291
292 if (performConversion(reader, writer, options) != 0)
293 {
294 std::cerr << "Error during conversion!" << std::endl;
295 return 1;
296 }
297
298 return 0;
299 }
300