1 /*
2  * sam2pairwise.cpp
3  *
4  *  Created on: Aug 8, 2014
5  *      Author: lafavemc
6  */
7 
8 #include <iostream>
9 #include <sstream>
10 #include <string>
11 #include <vector>
12 
13 using std::istream;
14 using std::stringstream;
15 using std::string;
16 using std::getline;
17 using std::cin;
18 using std::cout;
19 using std::cerr;
20 using std::clog;
21 using std::endl;
22 using std::vector;
23 
24 #include "isolate_md.hh"
25 #include "shift_cigar.hh"
26 #include "shift_md.hh"
27 #include "translate_cigar.hh"
28 #include "translate_md.hh"
29 
30 
31 
main(int argc,char ** argv)32 int main(int argc, char** argv)
33 {
34 
35 	if(argc == 2 && string(argv[1]) == "--version")
36 	{
37 		clog << "1.0.0" << endl;
38 		return 0;
39 	}
40 
41 	string input_string;
42 	int pastheader = 0;
43 
44 	while ( getline( cin, input_string ) )
45 	{
46 
47 		// Verify that you aren't in the header
48 		if( !pastheader )
49 		{
50 			if(input_string.substr(0,1) == "@")
51 			{
52 				continue;
53 			}
54 			else
55 			{
56 				pastheader = 1;
57 			}
58 		}
59 
60 		// Split the line into a vector
61 
62 		string buf; // a buffer
63 		stringstream ss(input_string);
64 
65 		vector<string> line;
66 
67 		while (ss >> buf)
68 		{
69 			line.push_back(buf);
70 		}
71 
72 
73 
74 		// Isolate MD tag (quit if no MD tag)
75 		// Already know it can't be the first 11 columns, due to SAM spec.
76 
77 		string md = isolate_md(line);
78 
79 
80 
81 		// Initialize variables for the three substring positions, the content of the
82 		// CIGAR and MD, and the sequences for the new read, the reference, and the matches
83 
84 		string read = line[9];
85 
86 		int subpos = 0;
87 		int cigarpos = 0;
88 		int mdpos = 0;
89 
90 		stringstream cigarstream(line[5]);
91 		int cigar_number;
92 		char cigar_letter;
93 
94 		stringstream mdstream(md);
95 		int md_number;
96 		char md_letter;
97 
98 		// Flags to tell if an integer or character should be next in the MD tag
99 		int mdintnext_flag = 1;
100 
101 		string modseq;
102 		string matches;
103 		string reference;
104 
105 
106 
107 		// While the substring counter is less than the length of the read
108 		while ( subpos < line[9].size() )
109 		{
110 
111 			// If cigar position is 0, shift the first element(s) of the CIGAR vector
112 			// to get the distance and the letter
113 
114 			if( cigarpos == 0 )
115 			{
116 				start:
117 
118 				try {
119 					shift_cigar( cigarstream, cigar_number, cigar_letter );
120 				} catch (int e) {
121 					cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
122 					<< line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
123 					<< line[6] << "\t" << line[7] << "\t" << line[8] << endl
124 					<< "shift_cigar failed. Exiting." << endl;
125 					return 1;
126 				}
127 
128 				if (cigar_letter == 'H')
129 				{
130 
131 					if (mdstream.rdbuf()->in_avail() != 0)
132 					{
133 						// If you find hard clipping at the beginning of the read,
134 						// just skip it. Back up and take the next element.
135 						goto start;
136 					}
137 					else
138 					{
139 						//If there's nothing more in the stream, it's hard clipping
140 						// at the end, and you can just finish up.
141 						break;
142 					}
143 
144 				}
145 
146 			}
147 
148 
149 
150 			// If there's still something in the mdstream, mdposition is 0, CIGAR
151 			// is not S (meaning you're not in the softclip region), and CIGAR is
152 			//not I (meaning you're not in an insertion), shift the elements of
153 			// the MD until it's something other than 0.
154 
155 			if ( (mdstream.rdbuf()->in_avail() != 0) && mdpos == 0 && cigar_letter != 'S' && cigar_letter != 'I' )
156 			{
157 
158 				try{
159 
160 					shift_md( mdstream, mdintnext_flag, md_number, md_letter);
161 
162 				} catch (char e){
163 
164 					cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
165 					<< line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
166 					<< line[6] << "\t" << line[7] << "\t" << line[8] << endl
167 					<< "shift_md failed; didn't grab character. Exiting." << endl;
168 					return 1;
169 
170 				}
171 
172 			}
173 
174 
175 
176 			// Initialize flags for insertions, mismatches, and softclips
177 
178 			int insert_flag = 0;
179 			int nonmatch_flag = 0;
180 			int n_flag = 0;
181 			int pad_flag = 0;
182 
183 			// If the cigar position is less than the current cigar distance number:
184 
185 			if( cigarpos < cigar_number )
186 			{
187 
188 				try{
189 
190 					translate_cigar( modseq, read, cigar_letter, subpos, cigarpos, nonmatch_flag, insert_flag, n_flag, pad_flag );
191 
192 				} catch (int e){
193 
194 					clog << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
195 					<< line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
196 					<< line[6] << "\t" << line[7] << "\t" << line[8] << endl
197 					<< "Unsupported CIGAR character encountered" << endl
198 					<< endl << endl;
199 
200 					break;
201 
202 				}
203 
204 			}
205 
206 
207 
208 			// Now deal with the reference.
209 
210 			if( !md.empty() )
211 			{
212 
213 				try {
214 
215 					translate_md( mdstream,  read, reference, subpos, mdpos, md_number, md_letter, mdintnext_flag, insert_flag, nonmatch_flag, n_flag, pad_flag );
216 
217 				} catch (int e){
218 
219 					cerr << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
220 					<< line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
221 					<< line[6] << "\t" << line[7] << "\t" << line[8] << endl
222 					<< "Character not detected after ^ in MD tag. Exiting" << endl;
223 					return 1;
224 
225 				}
226 
227 			}
228 			else
229 			{
230 
231 				if ( insert_flag )
232 				{
233 					reference += '-';
234 				}
235 				else if ( pad_flag )
236 				{
237 					reference += '*';
238 				}
239 				else if ( n_flag )
240 				{
241 					reference += 'N';
242 				}
243 				else if ( cigar_letter != 'D' )
244 				{
245 					reference += read.substr(subpos, 1);
246 				}
247 				else
248 				{
249 					// If there's no MD tag but the CIGAR has a D, there's no way
250 					// to print this correctly.
251 
252 					clog << line[0] << "\t" << line[1] << "\t" << line[2] << "\t"
253 					<< line[3] << "\t" << line[4] << "\t" << line[5] << "\t"
254 					<< line[6] << "\t" << line[7] << "\t" << line[8] << endl
255 					<< "D encountered in CIGAR without MD tag" << endl
256 					<< endl << endl;
257 
258 					break;
259 
260 				}
261 
262 			}
263 
264 
265 			// If the mismatch flag is set, that line gets a space. If not, a |.
266 
267 			if ( nonmatch_flag )
268 			{
269 				matches += ' ';
270 			}
271 			else
272 			{
273 				matches += '|';
274 			}
275 
276 
277 
278 			// Check if either the cigarpos or mdpos need to be reset to 0.
279 			// If cigar position equals the cigar number, reset to 0.
280 
281 			if ( cigarpos == cigar_number )
282 			{
283 				cigarpos = 0;
284 			}
285 
286 
287 
288 			// Check the mdpos:
289 
290 			if ( cigar_letter != 'S' && !(mdintnext_flag) && mdpos == md_number )
291 			{
292 
293 				// If the CIGAR isn't S (and therefore it's reasonable to assume
294 				// that you're not going to examine md_number before it's actually
295 				// been defined by the contents of the MD tag), you're currently
296 				// looking at an integer, and the mdpos position is equal to that
297 				// integer, reset mdpos to 0.
298 
299 				mdpos = 0;
300 
301 			}
302 			else if ( mdintnext_flag )
303 			{
304 				// If you're currently looking at a character, and for some reason
305 				// mdpos isn't already 0, reset it to 0 now.
306 
307 				mdpos = 0;
308 
309 			}
310 
311 
312 
313 			// If the cigar letter was anything except a D, increment the "subpos"
314 			// counter that tells you where you are in the read.
315 
316 			if ( cigar_letter != 'D' && cigar_letter != 'P' && cigar_letter != 'N' )
317 			{
318 				++subpos;
319 			}
320 
321 		}
322 
323 
324 
325 		// Print the alignment
326 
327 		if (subpos == line[9].size())
328 		{
329 			cout << line[0] << "\t" << line[1] << "\t"<< line[2] << "\t"<< line[3]
330 				 << "\t"<< line[4] << "\t"<< line[5] << "\t"<< line[6] << "\t"
331 				 << line[7] << "\t"<< line[8] << endl << modseq << endl << matches
332 				 << endl << reference << endl;
333 		}
334 
335 	}
336 
337 	return 0;
338 
339 }
340