1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>
33 // Author: David Weese <david.weese@fu-berlin.de>
34 // ==========================================================================
35 // A simple converter between SAM and BAM.  BAM files can be dumped region-
36 // wise, using a .bai index.
37 // ==========================================================================
38 
39 #include <iostream>
40 #include <fstream>
41 
42 #include <seqan/basic.h>
43 #include <seqan/sequence.h>
44 #include <seqan/file.h>      // For printing SeqAn Strings.
45 #include <seqan/arg_parse.h>
46 #include <seqan/seq_io.h>
47 #include <seqan/bam_io.h>
48 #include <seqan/stream.h>
49 
50 using namespace seqan;
51 
52 struct Options
53 {
54     bool showHelp;
55     bool showVersion;
56     CharString inFile;
57     CharString baiFile;
58     CharString outFile;
59     unsigned verbosity;
60     bool bamFormat;
61     String<GenomicRegion> regions;  // <(refName, beginPos, endPos)>
62 
OptionsOptions63     Options()
64     {
65         showHelp = false;
66         showVersion = false;
67         verbosity = 1;
68         bamFormat = false;
69     }
70 
71 };
72 
73 void
setupArgumentParser(ArgumentParser & parser,Options const & options)74 setupArgumentParser(ArgumentParser & parser, Options const & options)
75 {
76     setVersion(parser, "1.0");
77 
78     addDescription(parser, "***********");
79     addDescription(parser, "* BAMUTIL *");
80     addDescription(parser, "***********");
81     addDescription(parser, "");
82     addDescription(parser, "SeqAn SAM/BAM Utility.");
83     addDescription(parser, "");
84     addDescription(parser, "Author: Manuel Holtgrewe <manuel.holtgrewe@fu-berlin.de>");
85 
86     addUsageLine(parser, "bamutil [OPTIONS] <IN >OUT");
87 
88     addSection(parser, "General Options");
89     addOption(parser, ArgParseOption("v", "verbose", "Enable verbose mode."));
90     addOption(parser, ArgParseOption("vv", "very-verbose", "Enable very verbose mode."));
91 
92     addSection(parser, "Input Specification");
93     addOption(parser, ArgParseOption("i", "input-file", "Path to input, '-' for stdin.", ArgParseOption::STRING));
94     addOption(parser, ArgParseOption("bi", "bai-index-file", "Path to BAI index, default: input + '.bai'.", ArgParseOption::STRING));
95 
96     addSection(parser, "Range Specification");
97     addOption(parser, ArgParseOption("r", "region", "Regions to dump (in which aligments start). REF:FROM-TO, e.g. IV:1,000-2,000 will alignments with ref IV in range [1000,2000) (zero based).", ArgParseOption::STRING));
98 
99     addSection(parser, "Output Specification");
100     addOption(parser, ArgParseOption("o", "output-file", "Path to output, '-' for stdout.", ArgParseOption::STRING));
101     addOption(parser, ArgParseOption("b", "output-bam", "Output file is BAM."));
102 }
103 
104 // Parse the command line and check for any syntatical errors.
105 
106 ArgumentParser::ParseResult
parseArguments(Options & options,ArgumentParser & parser,int argc,char const ** argv)107 parseArguments(Options & options,
108                ArgumentParser & parser,
109                int argc,
110                char const ** argv)
111 {
112     ArgumentParser::ParseResult res = parse(parser, argc, argv);
113 
114     if (res != ArgumentParser::PARSE_OK)
115         return res;
116 
117     if (isSet(parser, "verbose"))
118         options.verbosity = 2;
119     if (isSet(parser, "very-verbose"))
120         options.verbosity = 3;
121 
122     getOptionValue(options.inFile, parser, "input-file");
123     getOptionValue(options.baiFile, parser, "bai-index-file");
124     if (empty(options.baiFile) && !empty(options.inFile))
125     {
126         options.baiFile = options.inFile;
127         append(options.baiFile, ".bai");
128     }
129 
130     getOptionValue(options.outFile, parser, "output-file");
131     options.bamFormat = isSet(parser, "output-bam");
132     if (isSet(parser, "region"))
133     {
134         String<CharString> regions = getOptionValues(parser, "region");
135         GenomicRegion region;
136         for (unsigned i = 0; i < length(regions); ++i)
137         {
138             try
139             {
140                 parse(region, regions[i]);
141                 region.beginPos++;
142                 appendValue(options.regions, region);
143                 if (options.verbosity >= 2)
144                     std::cerr << "[VERBOSE] Region " << region.seqName << ":"
145                               << region.beginPos << "-" << region.endPos << std::endl;
146             }
147             catch (ParseError & e)
148             {
149                 std::cerr << "[WARNING] could not parse region \"" << regions[i] << "\". IGNORING." << std::endl;
150             }
151         }
152     }
153 
154     return ArgumentParser::PARSE_OK;
155 }
156 
157 template <typename TOptions>
_dumpRegion(BamFileIn & in,BamFileOut & out,TOptions const & options)158 int _dumpRegion(BamFileIn & in, BamFileOut & out, TOptions const & options)
159 {
160     // TOOD(holtgrew): The index is loaded for each region.  This should probably not be the case!
161 
162     // Dump each region after loading the index.
163     BamIndex<Bai> bamIndex;
164     if (!open(bamIndex, toCString(options.baiFile)))
165     {
166         std::cerr << "[ERROR] Could not open index file " << options.baiFile << ", required when specifying regions." << std::endl;
167         return 1;
168     }
169 
170     for (unsigned i = 0; i < length(options.regions); ++i)
171     {
172         // Jump near range.
173         CharString refName = options.regions[i].seqName;
174         unsigned refId = 0;
175         if (!getIdByName(refId, contigNamesCache(context(in)), refName))
176         {
177             std::cerr << "[ERROR] Unknown reference " << refName << std::endl;
178             return 1;
179         }
180         bool hasAlignments = false;
181         int beginPos = options.regions[i].beginPos;
182         int endPos = options.regions[i].endPos;
183         if (!jumpToRegion(in, hasAlignments, refId, beginPos, endPos, bamIndex))
184         {
185             std::cerr << "[ERROR] Could not jump to " << refName << ":" << beginPos << "-" << endPos << std::endl;
186             return 1;
187         }
188         if (!hasAlignments)
189             continue;
190         // Dump range.
191         BamAlignmentRecord record;
192         while (!atEnd(in))
193         {
194             readRecord(record, in);
195             if (record.beginPos < options.regions[i].beginPos)
196                 continue;  // Skip, before region.
197             if (record.beginPos >= options.regions[i].endPos)
198                 return 0;
199 
200             writeRecord(out, record);
201         }
202     }
203     return 0;
204 }
205 
206 template <typename TOptions>
performConversion(BamFileIn & in,BamFileOut & out,TOptions const & options)207 int performConversion(BamFileIn & in, BamFileOut & out, TOptions const & options)
208 {
209     BamHeader header;
210     readHeader(header, in);
211     writeHeader(out, header);
212 
213     if (length(options.regions) == 0u)
214     {
215         // Simply dump whole file.
216         BamAlignmentRecord record;
217         while (!atEnd(in))
218         {
219             readRecord(record, in);
220             writeRecord(out, record);
221         }
222         return 0;
223     }
224     else
225     {
226         return _dumpRegion(in, out, options);
227     }
228 }
229 
230 // The main function.
231 //
232 // Don't be intimidated by its longness, most of the code is for branching the different options to types.
233 
main(int argc,char const * argv[])234 int main(int argc, char const * argv[])
235 {
236     // -----------------------------------------------------------------------
237     // Handle Command Line
238     // -----------------------------------------------------------------------
239 
240     // Setup command line parser.
241     ArgumentParser parser;
242     Options options;
243     setupArgumentParser(parser, options);
244 
245     // Then, parse the command line and handle the cases where help display
246     // is requested or erroneous parameters were given.
247     ArgumentParser::ParseResult res = parseArguments(options, parser, argc, argv);
248 
249     if (res != ArgumentParser::PARSE_OK)
250         return res == ArgumentParser::PARSE_ERROR;
251 
252     // -----------------------------------------------------------------------
253     // Open Input / Output Files.
254     // -----------------------------------------------------------------------
255 
256     BamFileIn reader;
257     BamFileOut writer(reader);
258     bool success = true;
259 
260     if (!empty(options.inFile))
261         success = open(reader, toCString(options.inFile));
262     else
263         success = open(reader, std::cin);
264 
265     if (!success)
266     {
267         std::cerr << "Couldn't open " << options.inFile << " for reading." << std::endl;
268         return 1;
269     }
270 
271     if (!empty(options.outFile))
272         success = open(writer, toCString(options.outFile));
273     else
274     {
275         // write to stdout
276 #if SEQAN_HAS_ZLIB
277         if (options.bamFormat)
278             success = open(writer, std::cout, Bam());
279         else
280 #endif
281         success = open(writer, std::cout, Sam());
282     }
283 
284     if (!success)
285     {
286         std::cerr << "Couldn't open " << options.outFile << " for writing." << std::endl;
287         return 1;
288     }
289 
290     // (weese:) We go through no pain here anymore.
291 
292     if (performConversion(reader, writer, options) != 0)
293     {
294         std::cerr << "Error during conversion!" << std::endl;
295         return 1;
296     }
297 
298     return 0;
299 }
300