1 /*
2  *  Copyright (C) 2010  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 // This file contains the processing for the executable option "convert"
20 // which reads an SAM/BAM file and writes a SAM/BAM file (it can convert
21 // between SAM and BAM formats).
22 
23 #include "Convert.h"
24 #include "SamFile.h"
25 #include "Parameters.h"
26 #include "BgzfFileType.h"
27 #include "SamValidation.h"
28 
printConvertDescription(std::ostream & os)29 void Convert::printConvertDescription(std::ostream& os)
30 {
31     os << " convert - Convert SAM/BAM to SAM/BAM" << std::endl;
32 }
33 
34 
printDescription(std::ostream & os)35 void Convert::printDescription(std::ostream& os)
36 {
37     printConvertDescription(os);
38 }
39 
40 
printUsage(std::ostream & os)41 void Convert::printUsage(std::ostream& os)
42 {
43     BamExecutable::printUsage(os);
44     os << "\t./bam convert --in <inputFile> --out <outputFile.sam/bam/ubam (ubam is uncompressed bam)> [--refFile <reference filename>] [--useBases|--useEquals|--useOrigSeq] [--lshift] [--noeof] [--params]" << std::endl;
45     os << "\tRequired Parameters:" << std::endl;
46     os << "\t\t--in         : the SAM/BAM file to be read" << std::endl;
47     os << "\t\t--out        : the SAM/BAM file to be written" << std::endl;
48     os << "\tOptional Parameters:" << std::endl;
49     os << "\t\t--refFile    : reference file name" << std::endl;
50     os << "\t\t--lshift     : left shift indels when writing records\n";
51     os << "\t\t--noeof      : do not expect an EOF block on a bam file" << std::endl;
52     os << "\t\t--params     : print the parameter settings" << std::endl;
53     os << "\t\t--recover    : attempt error recovery while reading a bam file" << std::endl;
54     os << "\tOptional Sequence Parameters (only specify one):" << std::endl;
55     os << "\t\t--useOrigSeq : Leave the sequence as is (default & used if reference is not specified)" << std::endl;
56     os << "\t\t--useBases   : Convert any '=' in the sequence to the appropriate base using the reference (requires --refFile)" << std::endl;
57     os << "\t\t--useEquals  : Convert any bases that match the reference to '=' (requires --refFile)" << std::endl;
58 }
59 
60 #define SIGNATURE_LENGTH (sizeof(bamRecordStruct))
checkSignature(void * data)61 static bool checkSignature(void *data)
62 {
63     bamRecordStruct *record = (bamRecordStruct *) data;
64 
65     if(record->myBlockSize > 60000) return false;
66 
67     if(record->myReferenceID < -1 || record->myReferenceID > 1000) return false;
68 
69     if(record->myCigarLength > 100) return false;
70 
71     if(record->myReadLength > 1000) return false;
72 
73     if(record->myMateReferenceID < -1 || record->myMateReferenceID > 1000) return false;
74 
75     if(record->myMatePosition < -1) return false;
76 
77     if(record->myInsertSize > 100000) return false;
78 
79     return true;
80 }
81 
82 
83 
execute(int argc,char ** argv)84 int Convert::execute(int argc, char **argv)
85 {
86     // Extract command line arguments.
87     String inFile = "";
88     String outFile = "";
89     String refFile = "";
90     bool lshift = false;
91     bool noeof = false;
92     bool params = false;
93 
94     bool useBases = false;
95     bool useEquals = false;
96     bool useOrigSeq = false;
97 
98     bool recover = false;
99 
100     ParameterList inputParameters;
101     BEGIN_LONG_PARAMETERS(longParameterList)
102         LONG_STRINGPARAMETER("in", &inFile)
103         LONG_STRINGPARAMETER("out", &outFile)
104         LONG_STRINGPARAMETER("refFile", &refFile)
105         LONG_PARAMETER("lshift", &lshift)
106         LONG_PARAMETER("noeof", &noeof)
107         LONG_PARAMETER("recover", &recover)
108         LONG_PARAMETER("params", &params)
109         LONG_PARAMETER_GROUP("SequenceConversion")
110             EXCLUSIVE_PARAMETER("useBases", &useBases)
111             EXCLUSIVE_PARAMETER("useEquals", &useEquals)
112             EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq)
113         LONG_PHONEHOME(VERSION)
114         END_LONG_PARAMETERS();
115 
116     inputParameters.Add(new LongParameters ("Input Parameters",
117                                             longParameterList));
118 
119     // parameters start at index 2 rather than 1.
120     inputParameters.Read(argc, argv, 2);
121 
122     // If no eof block is required for a bgzf file, set the bgzf file type to
123     // not look for it.
124     if(noeof)
125     {
126         // Set that the eof block is not required.
127         BgzfFileType::setRequireEofBlock(false);
128     }
129 
130     // Check to see if the in file was specified, if not, report an error.
131     if(inFile == "")
132     {
133         printUsage(std::cerr);
134         inputParameters.Status();
135         // In file was not specified but it is mandatory.
136         std::cerr << "--in is a mandatory argument, "
137                   << "but was not specified" << std::endl;
138         return(-1);
139     }
140 
141     if(outFile == "")
142     {
143         printUsage(std::cerr);
144         inputParameters.Status();
145         // In file was not specified but it is mandatory.
146         std::cerr << "--out is a mandatory argument, "
147                   << "but was not specified" << std::endl;
148         return(-1);
149     }
150 
151     // Check to see if the ref file was specified.
152     // Open the reference.
153     GenomeSequence* refPtr = NULL;
154     if(refFile != "")
155     {
156         refPtr = new GenomeSequence(refFile);
157     }
158 
159     SamRecord::SequenceTranslation translation;
160     if((useBases) && (refPtr != NULL))
161     {
162         translation = SamRecord::BASES;
163     }
164     else if((useEquals) && (refPtr != NULL))
165     {
166         translation = SamRecord::EQUAL;
167     }
168     else
169     {
170         useOrigSeq = true;
171         translation = SamRecord::NONE;
172     }
173 
174     if(params)
175     {
176         inputParameters.Status();
177     }
178 
179     // Open the input file for reading.
180     SamFile samIn;
181     if(recover) samIn.setAttemptRecovery(true);
182     samIn.OpenForRead(inFile);
183 
184     // Open the output file for writing.
185     SamFile samOut;
186     samOut.OpenForWrite(outFile);
187     samOut.SetWriteSequenceTranslation(translation);
188     samOut.SetReference(refPtr);
189 
190     // Read the sam header.
191     SamFileHeader samHeader;
192     samIn.ReadHeader(samHeader);
193 
194     // Write the sam header.
195     samOut.WriteHeader(samHeader);
196 
197     SamRecord samRecord;
198 
199     // Set returnStatus to success.  It will be changed
200     // to the failure reason if any of the writes fail.
201     SamStatus::Status returnStatus = SamStatus::SUCCESS;
202 
203     while(1) {
204         try {
205             // Keep reading records until ReadRecord returns false.
206             while(samIn.ReadRecord(samHeader, samRecord))
207             {
208                 // left shift if necessary.
209                 if(lshift)
210                 {
211                     samRecord.shiftIndelsLeft();
212                 }
213 
214                 // Successfully read a record from the file, so write it.
215                 if(!samOut.WriteRecord(samHeader, samRecord))
216                 {
217                     // Failed to write a record.
218                     fprintf(stderr, "%s\n", samOut.GetStatusMessage());
219                     returnStatus = samOut.GetStatus();
220                 }
221             }
222             break;
223         } catch (std::runtime_error e) {
224             std::cerr << "Caught runtime error: " << e.what() << "\n";
225             if(!recover) {
226                 std::cerr << "Corrupted BAM file detected - consider using --recover option.\n";
227                 break;
228             }
229             std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n";
230             // XXX need to resync SamFile stream here
231             bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH);
232             if(rc) {
233                 std::cerr << "Successful resync - some data lost.\n";
234                 continue;    // succeeded
235             }
236             std::cerr << "Failed to re-sync on data stream.\n";
237             break;              // failed to resync
238         }
239     }
240 
241     std::cerr << std::endl << "Number of records read = " <<
242         samIn.GetCurrentRecordCount() << std::endl;
243     std::cerr << "Number of records written = " <<
244         samOut.GetCurrentRecordCount() << std::endl;
245 
246     if(refPtr != NULL)
247     {
248         delete(refPtr);
249     }
250 
251     // Since the reads were successful, return the status based
252     // on the status of the writes.  If any failed, return
253     // their failure status.
254     return(returnStatus);
255 }
256 
257 
258