1 /*
2 * sam2pairwise.cpp
3 *
4 * Created on: Aug 8, 2014
5 * Author: lafavemc
6 */
7
8 #include <iostream>
9 #include <sstream>
10 #include <string>
11 #include <vector>
12
13 using std::istream;
14 using std::stringstream;
15 using std::string;
16 using std::getline;
17 using std::cin;
18 using std::cout;
19 using std::cerr;
20 using std::clog;
21 using std::endl;
22 using std::vector;
23
24 #include "isolate_md.hh"
25 #include "shift_cigar.hh"
26 #include "shift_md.hh"
27 #include "translate_cigar.hh"
28 #include "translate_md.hh"
29
30
31
main(int argc,char ** argv)32 int main(int argc, char** argv)
33 {
34
35 if(argc == 2 && string(argv[1]) == "--version")
36 {
37 clog << "1.0.0" << endl;
38 return 0;
39 }
40
41 string input_string;
42 int pastheader = 0;
43
44 while ( getline( cin, input_string ) )
45 {
46
47 // Verify that you aren't in the header
48 if( !pastheader )
49 {
50 if(input_string.substr(0,1) == "@")
51 {
52 continue;
53 }
54 else
55 {
56 pastheader = 1;
57 }
58 }
59
60 // Split the line into a vector
61
62 string buf; // a buffer
63 stringstream ss(input_string);
64
65 vector<string> line;
66
67 while (ss >> buf)
68 {
69 line.push_back(buf);
70 }
71
72
73
74 // Isolate MD tag (quit if no MD tag)
75 // Already know it can't be the first 11 columns, due to SAM spec.
76
77 string md = isolate_md(line);
78
79
80
81 // Initialize variables for the three substring positions, the content of the
82 // CIGAR and MD, and the sequences for the new read, the reference, and the matches
83
84 string read = line[9];
85
86 int subpos = 0;
87 int cigarpos = 0;
88 int mdpos = 0;
89
90 stringstream cigarstream(line[5]);
91 int cigar_number;
92 char cigar_letter;
93
94 stringstream mdstream(md);
95 int md_number;
96 char md_letter;
97
98 // Flags to tell if an integer or character should be next in the MD tag
99 int mdintnext_flag = 1;
100
101 string modseq;
102 string matches;
103 string reference;
104
105
106
107 // While the substring counter is less than the length of the read
108 while ( subpos < line[9].size() )
109 {
110
111 // If cigar position is 0, shift the first element(s) of the CIGAR vector
112 // to get the distance and the letter
113
114 if( cigarpos == 0 )
115 {
116 start:
117
118 try {
119 shift_cigar( cigarstream, cigar_number, cigar_letter );
120 } catch (int e) {
121 cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
122 << line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
123 << line[6] << "\t" << line[7] << "\t" << line[8] << endl
124 << "shift_cigar failed. Exiting." << endl;
125 return 1;
126 }
127
128 if (cigar_letter == 'H')
129 {
130
131 if (mdstream.rdbuf()->in_avail() != 0)
132 {
133 // If you find hard clipping at the beginning of the read,
134 // just skip it. Back up and take the next element.
135 goto start;
136 }
137 else
138 {
139 //If there's nothing more in the stream, it's hard clipping
140 // at the end, and you can just finish up.
141 break;
142 }
143
144 }
145
146 }
147
148
149
150 // If there's still something in the mdstream, mdposition is 0, CIGAR
151 // is not S (meaning you're not in the softclip region), and CIGAR is
152 //not I (meaning you're not in an insertion), shift the elements of
153 // the MD until it's something other than 0.
154
155 if ( (mdstream.rdbuf()->in_avail() != 0) && mdpos == 0 && cigar_letter != 'S' && cigar_letter != 'I' )
156 {
157
158 try{
159
160 shift_md( mdstream, mdintnext_flag, md_number, md_letter);
161
162 } catch (char e){
163
164 cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
165 << line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
166 << line[6] << "\t" << line[7] << "\t" << line[8] << endl
167 << "shift_md failed; didn't grab character. Exiting." << endl;
168 return 1;
169
170 }
171
172 }
173
174
175
176 // Initialize flags for insertions, mismatches, and softclips
177
178 int insert_flag = 0;
179 int nonmatch_flag = 0;
180 int n_flag = 0;
181 int pad_flag = 0;
182
183 // If the cigar position is less than the current cigar distance number:
184
185 if( cigarpos < cigar_number )
186 {
187
188 try{
189
190 translate_cigar( modseq, read, cigar_letter, subpos, cigarpos, nonmatch_flag, insert_flag, n_flag, pad_flag );
191
192 } catch (int e){
193
194 clog << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
195 << line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
196 << line[6] << "\t" << line[7] << "\t" << line[8] << endl
197 << "Unsupported CIGAR character encountered" << endl
198 << endl << endl;
199
200 break;
201
202 }
203
204 }
205
206
207
208 // Now deal with the reference.
209
210 if( !md.empty() )
211 {
212
213 try {
214
215 translate_md( mdstream, read, reference, subpos, mdpos, md_number, md_letter, mdintnext_flag, insert_flag, nonmatch_flag, n_flag, pad_flag );
216
217 } catch (int e){
218
219 cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
220 << line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
221 << line[6] << "\t" << line[7] << "\t" << line[8] << endl
222 << "Character not detected after ^ in MD tag. Exiting" << endl;
223 return 1;
224
225 }
226
227 }
228 else
229 {
230
231 if ( insert_flag )
232 {
233 reference += '-';
234 }
235 else if ( pad_flag )
236 {
237 reference += '*';
238 }
239 else if ( n_flag )
240 {
241 reference += 'N';
242 }
243 else if ( cigar_letter != 'D' )
244 {
245 reference += read.substr(subpos, 1);
246 }
247 else
248 {
249 // If there's no MD tag but the CIGAR has a D, there's no way
250 // to print this correctly.
251
252 clog << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
253 << line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
254 << line[6] << "\t" << line[7] << "\t" << line[8] << endl
255 << "D encountered in CIGAR without MD tag" << endl
256 << endl << endl;
257
258 break;
259
260 }
261
262 }
263
264
265 // If the mismatch flag is set, that line gets a space. If not, a |.
266
267 if ( nonmatch_flag )
268 {
269 matches += ' ';
270 }
271 else
272 {
273 matches += '|';
274 }
275
276
277
278 // Check if either the cigarpos or mdpos need to be reset to 0.
279 // If cigar position equals the cigar number, reset to 0.
280
281 if ( cigarpos == cigar_number )
282 {
283 cigarpos = 0;
284 }
285
286
287
288 // Check the mdpos:
289
290 if ( cigar_letter != 'S' && !(mdintnext_flag) && mdpos == md_number )
291 {
292
293 // If the CIGAR isn't S (and therefore it's reasonable to assume
294 // that you're not going to examine md_number before it's actually
295 // been defined by the contents of the MD tag), you're currently
296 // looking at an integer, and the mdpos position is equal to that
297 // integer, reset mdpos to 0.
298
299 mdpos = 0;
300
301 }
302 else if ( mdintnext_flag )
303 {
304 // If you're currently looking at a character, and for some reason
305 // mdpos isn't already 0, reset it to 0 now.
306
307 mdpos = 0;
308
309 }
310
311
312
313 // If the cigar letter was anything except a D, increment the "subpos"
314 // counter that tells you where you are in the read.
315
316 if ( cigar_letter != 'D' && cigar_letter != 'P' && cigar_letter != 'N' )
317 {
318 ++subpos;
319 }
320
321 }
322
323
324
325 // Print the alignment
326
327 if (subpos == line[9].size())
328 {
329 cout << line[0] << "\t" << line[1] << "\t"<< line[2] << "\t"<< line[3]
330 << "\t"<< line[4] << "\t"<< line[5] << "\t"<< line[6] << "\t"
331 << line[7] << "\t"<< line[8] << endl << modseq << endl << matches
332 << endl << reference << endl;
333 }
334
335 }
336
337 return 0;
338
339 }
340