1 /*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 //////////////////////////////////////////////////////////////////////////
19 // This file contains the processing for the executable option "convert"
20 // which reads an SAM/BAM file and writes a SAM/BAM file (it can convert
21 // between SAM and BAM formats).
22
23 #include "Convert.h"
24 #include "SamFile.h"
25 #include "Parameters.h"
26 #include "BgzfFileType.h"
27 #include "SamValidation.h"
28
printConvertDescription(std::ostream & os)29 void Convert::printConvertDescription(std::ostream& os)
30 {
31 os << " convert - Convert SAM/BAM to SAM/BAM" << std::endl;
32 }
33
34
printDescription(std::ostream & os)35 void Convert::printDescription(std::ostream& os)
36 {
37 printConvertDescription(os);
38 }
39
40
printUsage(std::ostream & os)41 void Convert::printUsage(std::ostream& os)
42 {
43 BamExecutable::printUsage(os);
44 os << "\t./bam convert --in <inputFile> --out <outputFile.sam/bam/ubam (ubam is uncompressed bam)> [--refFile <reference filename>] [--useBases|--useEquals|--useOrigSeq] [--lshift] [--noeof] [--params]" << std::endl;
45 os << "\tRequired Parameters:" << std::endl;
46 os << "\t\t--in : the SAM/BAM file to be read" << std::endl;
47 os << "\t\t--out : the SAM/BAM file to be written" << std::endl;
48 os << "\tOptional Parameters:" << std::endl;
49 os << "\t\t--refFile : reference file name" << std::endl;
50 os << "\t\t--lshift : left shift indels when writing records\n";
51 os << "\t\t--noeof : do not expect an EOF block on a bam file" << std::endl;
52 os << "\t\t--params : print the parameter settings" << std::endl;
53 os << "\t\t--recover : attempt error recovery while reading a bam file" << std::endl;
54 os << "\tOptional Sequence Parameters (only specify one):" << std::endl;
55 os << "\t\t--useOrigSeq : Leave the sequence as is (default & used if reference is not specified)" << std::endl;
56 os << "\t\t--useBases : Convert any '=' in the sequence to the appropriate base using the reference (requires --refFile)" << std::endl;
57 os << "\t\t--useEquals : Convert any bases that match the reference to '=' (requires --refFile)" << std::endl;
58 }
59
60 #define SIGNATURE_LENGTH (sizeof(bamRecordStruct))
checkSignature(void * data)61 static bool checkSignature(void *data)
62 {
63 bamRecordStruct *record = (bamRecordStruct *) data;
64
65 if(record->myBlockSize > 60000) return false;
66
67 if(record->myReferenceID < -1 || record->myReferenceID > 1000) return false;
68
69 if(record->myCigarLength > 100) return false;
70
71 if(record->myReadLength > 1000) return false;
72
73 if(record->myMateReferenceID < -1 || record->myMateReferenceID > 1000) return false;
74
75 if(record->myMatePosition < -1) return false;
76
77 if(record->myInsertSize > 100000) return false;
78
79 return true;
80 }
81
82
83
execute(int argc,char ** argv)84 int Convert::execute(int argc, char **argv)
85 {
86 // Extract command line arguments.
87 String inFile = "";
88 String outFile = "";
89 String refFile = "";
90 bool lshift = false;
91 bool noeof = false;
92 bool params = false;
93
94 bool useBases = false;
95 bool useEquals = false;
96 bool useOrigSeq = false;
97
98 bool recover = false;
99
100 ParameterList inputParameters;
101 BEGIN_LONG_PARAMETERS(longParameterList)
102 LONG_STRINGPARAMETER("in", &inFile)
103 LONG_STRINGPARAMETER("out", &outFile)
104 LONG_STRINGPARAMETER("refFile", &refFile)
105 LONG_PARAMETER("lshift", &lshift)
106 LONG_PARAMETER("noeof", &noeof)
107 LONG_PARAMETER("recover", &recover)
108 LONG_PARAMETER("params", ¶ms)
109 LONG_PARAMETER_GROUP("SequenceConversion")
110 EXCLUSIVE_PARAMETER("useBases", &useBases)
111 EXCLUSIVE_PARAMETER("useEquals", &useEquals)
112 EXCLUSIVE_PARAMETER("useOrigSeq", &useOrigSeq)
113 LONG_PHONEHOME(VERSION)
114 END_LONG_PARAMETERS();
115
116 inputParameters.Add(new LongParameters ("Input Parameters",
117 longParameterList));
118
119 // parameters start at index 2 rather than 1.
120 inputParameters.Read(argc, argv, 2);
121
122 // If no eof block is required for a bgzf file, set the bgzf file type to
123 // not look for it.
124 if(noeof)
125 {
126 // Set that the eof block is not required.
127 BgzfFileType::setRequireEofBlock(false);
128 }
129
130 // Check to see if the in file was specified, if not, report an error.
131 if(inFile == "")
132 {
133 printUsage(std::cerr);
134 inputParameters.Status();
135 // In file was not specified but it is mandatory.
136 std::cerr << "--in is a mandatory argument, "
137 << "but was not specified" << std::endl;
138 return(-1);
139 }
140
141 if(outFile == "")
142 {
143 printUsage(std::cerr);
144 inputParameters.Status();
145 // In file was not specified but it is mandatory.
146 std::cerr << "--out is a mandatory argument, "
147 << "but was not specified" << std::endl;
148 return(-1);
149 }
150
151 // Check to see if the ref file was specified.
152 // Open the reference.
153 GenomeSequence* refPtr = NULL;
154 if(refFile != "")
155 {
156 refPtr = new GenomeSequence(refFile);
157 }
158
159 SamRecord::SequenceTranslation translation;
160 if((useBases) && (refPtr != NULL))
161 {
162 translation = SamRecord::BASES;
163 }
164 else if((useEquals) && (refPtr != NULL))
165 {
166 translation = SamRecord::EQUAL;
167 }
168 else
169 {
170 useOrigSeq = true;
171 translation = SamRecord::NONE;
172 }
173
174 if(params)
175 {
176 inputParameters.Status();
177 }
178
179 // Open the input file for reading.
180 SamFile samIn;
181 if(recover) samIn.setAttemptRecovery(true);
182 samIn.OpenForRead(inFile);
183
184 // Open the output file for writing.
185 SamFile samOut;
186 samOut.OpenForWrite(outFile);
187 samOut.SetWriteSequenceTranslation(translation);
188 samOut.SetReference(refPtr);
189
190 // Read the sam header.
191 SamFileHeader samHeader;
192 samIn.ReadHeader(samHeader);
193
194 // Write the sam header.
195 samOut.WriteHeader(samHeader);
196
197 SamRecord samRecord;
198
199 // Set returnStatus to success. It will be changed
200 // to the failure reason if any of the writes fail.
201 SamStatus::Status returnStatus = SamStatus::SUCCESS;
202
203 while(1) {
204 try {
205 // Keep reading records until ReadRecord returns false.
206 while(samIn.ReadRecord(samHeader, samRecord))
207 {
208 // left shift if necessary.
209 if(lshift)
210 {
211 samRecord.shiftIndelsLeft();
212 }
213
214 // Successfully read a record from the file, so write it.
215 if(!samOut.WriteRecord(samHeader, samRecord))
216 {
217 // Failed to write a record.
218 fprintf(stderr, "%s\n", samOut.GetStatusMessage());
219 returnStatus = samOut.GetStatus();
220 }
221 }
222 break;
223 } catch (std::runtime_error e) {
224 std::cerr << "Caught runtime error: " << e.what() << "\n";
225 if(!recover) {
226 std::cerr << "Corrupted BAM file detected - consider using --recover option.\n";
227 break;
228 }
229 std::cerr << "Attempting to resync at next good BGZF block and BAM record.\n";
230 // XXX need to resync SamFile stream here
231 bool rc = samIn.attemptRecoverySync(checkSignature, SIGNATURE_LENGTH);
232 if(rc) {
233 std::cerr << "Successful resync - some data lost.\n";
234 continue; // succeeded
235 }
236 std::cerr << "Failed to re-sync on data stream.\n";
237 break; // failed to resync
238 }
239 }
240
241 std::cerr << std::endl << "Number of records read = " <<
242 samIn.GetCurrentRecordCount() << std::endl;
243 std::cerr << "Number of records written = " <<
244 samOut.GetCurrentRecordCount() << std::endl;
245
246 if(refPtr != NULL)
247 {
248 delete(refPtr);
249 }
250
251 // Since the reads were successful, return the status based
252 // on the status of the writes. If any failed, return
253 // their failure status.
254 return(returnStatus);
255 }
256
257
258