1 #ifndef SAM_H
2 #define SAM_H 1
3 
4 #include "config.h" // for SAM_SEQ_QUAL
5 #include "IOUtil.h"
6 #include "Alignment.h"
7 #include "ContigID.h" // for g_contigNames
8 #include <algorithm> // for swap
9 #include <cstdlib> // for exit
10 #include <iostream>
11 #include <limits> // for numeric_limits
12 #include <sstream>
13 #include <string>
14 
15 namespace opt {
16 	/** The minimal alignment size. */
17 	static unsigned minAlign = 1;
18 }
19 
20 /** A SAM alignment of a single query. */
21 struct SAMAlignment {
22 	std::string rname;
23 	int pos;
24 	unsigned short flag;
25 	unsigned short mapq;
26 	std::string cigar;
27 
28 	/** Flag */
29 	enum {
30 		/** the read is paired in sequencing, no matter whether it is
31 		 * mapped in a pair */
32 		FPAIRED = 1,
33 		/** the read is mapped in a proper pair */
34 		FPROPER_PAIR = 2,
35 		/** the read itself is unmapped; conflictive with FPROPER_PAIR
36 		 */
37 		FUNMAP = 4,
38 		/** the mate is unmapped */
39 		FMUNMAP = 8,
40 		/** the read is mapped to the reverse strand */
41 		FREVERSE = 16,
42 		/** the mate is mapped to the reverse strand */
43 		FMREVERSE = 32,
44 		/** this is read1 */
45 		FREAD1 = 64,
46 		/** this is read2 */
47 		FREAD2 = 128,
48 		/** not primary alignment */
49 		FSECONDARY = 256,
50 		/** QC failure */
51 		FQCFAIL = 512,
52 		/** optical or PCR duplicate */
53 		FDUP = 1024,
54 	};
55 
SAMAlignmentSAMAlignment56 	SAMAlignment() :
57 		rname("*"),
58 		pos(-1),
59 		flag(FUNMAP),
60 		mapq(0) { }
61 
62 	/** Consturct a single-end alignment. */
SAMAlignmentSAMAlignment63 	SAMAlignment(const Alignment& a) :
64 		rname(a.contig),
65 		pos(a.contig_start_pos),
66 		flag(a.isRC ? FREVERSE : 0),
67 		mapq(255)
68 	{
69 		unsigned qend = a.read_start_pos + a.align_length;
70 		int clip0 = a.read_start_pos;
71 		int clip1 = a.read_length - qend;
72 		assert(clip1 >= 0);
73 		if (a.isRC)
74 			std::swap(clip0, clip1);
75 		std::ostringstream s;
76 		if (clip0 > 0)
77 			s << clip0 << 'S';
78 		s << a.align_length << 'M';
79 		if (clip1 > 0)
80 			s << clip1 << 'S';
81 		cigar = s.str();
82 	}
83 
isPairedSAMAlignment84 	bool isPaired() const { return flag & FPAIRED; }
isUnmappedSAMAlignment85 	bool isUnmapped() const { return flag & FUNMAP; }
isMateUnmappedSAMAlignment86 	bool isMateUnmapped() const { return flag & FMUNMAP; }
isReverseSAMAlignment87 	bool isReverse() const { return flag & FREVERSE; }
isMateReverseSAMAlignment88 	bool isMateReverse() const { return flag & FMREVERSE; }
isRead1SAMAlignment89 	bool isRead1() const { return flag & FREAD1; }
isRead2SAMAlignment90 	bool isRead2() const { return flag & FREAD2; }
91 
92 	/** The alignment coordinates of a gapped alignment. */
93 	struct CigarCoord {
94 		/** The length of the query sequence. */
95 		unsigned qlen;
96 		/** The start of the alignment on the query. */
97 		unsigned qstart;
98 		/** The length of the alignment on the query. */
99 		unsigned qspan;
100 		/** The length of the alignment on the target. */
101 		unsigned tspan;
102 
103 		/** Parse the specified CIGAR string. */
CigarCoordSAMAlignment::CigarCoord104 		CigarCoord(const std::string& cigar)
105 			: qlen(0), qstart(0), qspan(0), tspan(0)
106 		{
107 			if (cigar == "*")
108 				return;
109 			std::istringstream in(cigar);
110 			bool first = true;
111 			unsigned len;
112 			char type;
113 			while (in >> len >> type) {
114 				switch (type) {
115 				  case 'H': case 'S':
116 					if (first)
117 						qstart = len;
118 					qlen += len;
119 					break;
120 				  case 'M': case 'X': case '=':
121 					qlen += len;
122 					qspan += len;
123 					tspan += len;
124 					break;
125 				  case 'I':
126 					qlen += len;
127 					qspan += len;
128 					break;
129 				  case 'D': case 'N': case 'P':
130 					tspan += len;
131 					break;
132 				  default:
133 					std::cerr << "error: invalid CIGAR: `"
134 						<< cigar << "'\n";
135 					exit(EXIT_FAILURE);
136 				}
137 				first = false;
138 			}
139 			assert(in.eof());
140 		}
141 	};
142 
143 	/**
144 	 * Return the position of the first base of the query on the
145 	 * target extrapolated from the start of the alignment.
146 	 */
targetAtQueryStartSAMAlignment147 	int targetAtQueryStart() const
148 	{
149 		CigarCoord a(cigar);
150 		assert(a.qstart + a.qspan <= a.qlen);
151 		return isReverse()
152 			? pos + a.tspan + (a.qlen - a.qspan - a.qstart)
153 			: pos - a.qstart;
154 	}
155 
156 	/** Parse the specified CIGAR string.
157 	 * @return an alignment setting the fields read_start_pos,
158 	 * align_length, and read_length. The other fields will be
159 	 * uninitialized.
160 	 */
parseCigarSAMAlignment161 	static Alignment parseCigar(const std::string& cigar, bool isRC) {
162 		Alignment a;
163 		std::istringstream in(cigar);
164 		unsigned len;
165 		char type;
166 		unsigned clip0 = 0;
167 		a.align_length = 0;
168 		unsigned qlen = 0;
169 		unsigned clip1 = 0;
170 		while (in >> len >> type) {
171 			switch (type) {
172 			  case 'I': case 'X': case '=':
173 				qlen += len;
174 				clip1 += len;
175 				// fallthrough
176 			  case 'D': case 'N': case 'P':
177 				if (a.align_length == 0) {
178 					// Ignore a malformatted CIGAR string whose first
179 					// non-clipping operation is not M.
180 					std::cerr << "warning: malformatted CIGAR: "
181 						<< cigar << std::endl;
182 				}
183 				break;
184 			  case 'M':
185 				if ((unsigned)a.align_length < len) {
186 					clip0 += a.align_length + clip1;
187 					a.align_length = len;
188 					qlen += len;
189 					clip1 = 0;
190 					break;
191 				}
192 				// fallthrough
193 			  case 'H': case 'S':
194 				qlen += len;
195 				clip1 += len;
196 				break;
197 			  default:
198 				std::cerr << "error: invalid CIGAR: `"
199 					<< cigar << "'\n";
200 				exit(EXIT_FAILURE);
201 			}
202 		}
203 		a.read_start_pos = isRC ? clip1 : clip0;
204 		a.read_length = qlen;
205 		if (!in.eof()){
206 			std::cerr << "error: invalid CIGAR: `"
207 				<< cigar << "'\n";
208 			exit(EXIT_FAILURE);
209 		}
210 		return a;
211 	}
212 
AlignmentSAMAlignment213 	operator Alignment() const {
214 		assert(~flag & FUNMAP);
215 		bool isRC = flag & FREVERSE; // strand of the query
216 		Alignment a = parseCigar(cigar, isRC);
217 		a.contig = rname;
218 		a.contig_start_pos = pos;
219 		a.isRC = isRC;
220 		return a;
221 	}
222 };
223 
224 /** A SAM alignment of a query and its mate. */
225 struct SAMRecord : SAMAlignment {
226 	std::string qname;
227 	std::string mrnm;
228 	int mpos;
229 	int isize;
230 #if SAM_SEQ_QUAL
231 	std::string seq;
232 	std::string qual;
233 	std::string tags;
234 #endif
235 
236 	/** Consturct a single-end alignment. */
237 	explicit SAMRecord(const SAMAlignment& a = SAMAlignment(),
238 			const std::string& qname = "*",
239 #if SAM_SEQ_QUAL
240 			const std::string& seq = "*",
241 			const std::string& qual = "*",
242 			const std::string& tags = ""
243 #else
244 			const std::string& /*seq*/ = "*",
245 			const std::string& /*qual*/ = "*",
246 			const std::string& /*tags*/ = ""
247 #endif
248 			) :
SAMAlignmentSAMRecord249 		SAMAlignment(a),
250 		qname(qname),
251 		mrnm("*"),
252 		mpos(-1),
253 		isize(0)
254 #if SAM_SEQ_QUAL
255 		,
256 		seq(seq),
257 		qual(qual),
258 		tags(tags)
259 #endif
260 	{
261 	}
262 
263 	/** Construct a paired-end alignment. */
SAMRecordSAMRecord264 	SAMRecord(const Alignment& a0, const Alignment& a1)
265 	{
266 		*this = SAMRecord(a0);
267 		flag |= FPAIRED;
268 		if (a1.isRC)
269 			flag |= FMREVERSE;
270 		mrnm = a1.contig;
271 		mpos = a1.contig_start_pos;
272 		isize = a1.targetAtQueryStart() - a0.targetAtQueryStart();
273 	}
274 
275 	/** Set the mate mapping fields. */
fixMateSAMRecord276 	void fixMate(const SAMAlignment& o)
277 	{
278 		flag &= ~(FPROPER_PAIR | FMUNMAP | FMREVERSE);
279 		flag |= FPAIRED;
280 		if (rname == o.rname && isReverse() != o.isReverse())
281 			flag |= FPROPER_PAIR;
282 		if (o.isUnmapped())
283 			flag |= FMUNMAP;
284 		if (o.isReverse())
285 			flag |= FMREVERSE;
286 		mrnm = o.rname;
287 		mpos = o.pos;
288 		isize = isMateUnmapped() ? 0
289 			: o.targetAtQueryStart() - targetAtQueryStart();
290 
291 		// Fix unaligned mates
292 		if (!o.isUnmapped() && isUnmapped()) {
293 			rname = o.rname;
294 			pos = o.pos;
295 		} else if (o.isUnmapped() && !isUnmapped()) {
296 			mrnm = "=";
297 			mpos = pos;
298 		}
299 	}
300 
noMateSAMRecord301 	void noMate() { flag &= ~FPAIRED; }
302 
303 	/**
304 	 * Return the position of the first base of the mate query on the
305 	 * target extrapolated from the start of the alignment.
306 	 */
mateTargetAtQueryStartSAMRecord307 	int mateTargetAtQueryStart() const
308 	{
309 		return targetAtQueryStart() + isize;
310 	}
311 
312 	friend std::ostream& operator <<(std::ostream& out,
313 			const SAMRecord& o)
314 	{
315 		out << o.qname
316 			<< '\t' << o.flag
317 			<< '\t' << o.rname
318 			<< '\t' << (1 + o.pos)
319 			<< '\t' << o.mapq
320 			<< '\t' << o.cigar
321 			<< '\t' << (o.mrnm == o.rname ? "=" : o.mrnm)
322 			<< '\t' << (1 + o.mpos)
323 			<< '\t' << o.isize;
324 #if SAM_SEQ_QUAL
325 		out << '\t' << o.seq
326 			<< '\t' << o.qual;
327 		if (!o.tags.empty())
328 			out << '\t' << o.tags;
329 #else
330 		out << "\t*\t*";
331 #endif
332 		return out;
333 	}
334 
335 	friend std::istream& operator >>(std::istream& in, SAMRecord& o)
336 	{
337 		in >> o.qname
338 			>> o.flag >> o.rname >> o.pos >> o.mapq
339 			>> o.cigar >> o.mrnm >> o.mpos >> o.isize;
340 #if SAM_SEQ_QUAL
341 		in >> o.seq >> o.qual;
342 		getline(in >> Skip('\t'), o.tags);
343 #else
344 		in.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
345 #endif
346 		if (!in)
347 			return in;
348 		o.pos--;
349 		o.mpos--;
350 		if (o.mrnm == "=")
351 			o.mrnm = o.rname;
352 
353 		// Set the paired flags if qname ends in /1 or /2.
354 		unsigned l = o.qname.length();
355 		if (l >= 2 && o.qname[l-2] == '/') {
356 			switch (o.qname[l-1]) {
357 				case '1': o.flag |= FPAIRED | FREAD1; break;
358 				case '2':
359 				case '3': o.flag |= FPAIRED | FREAD2; break;
360 				default: return in;
361 			}
362 			o.qname.resize(l - 2);
363 			assert(!o.qname.empty());
364 		}
365 
366 		// Set the unmapped flag if the alignment is not long enough.
367 		CigarCoord a(o.cigar);
368 		if (a.qspan < opt::minAlign || a.tspan < opt::minAlign)
369 			o.flag |= FUNMAP;
370 		return in;
371 	}
372 };
373 
374 /** Set the mate mapping fields of a0 and a1. */
fixMate(SAMRecord & a0,SAMRecord & a1)375 static inline void fixMate(SAMRecord& a0, SAMRecord& a1)
376 {
377 	a0.fixMate(a1);
378 	a1.fixMate(a0);
379 }
380 
381 /** Read contig lengths from SAM headers. */
readContigLengths(std::istream & in,std::vector<unsigned> & lengths)382 static inline unsigned readContigLengths(std::istream& in, std::vector<unsigned>& lengths)
383 {
384 	assert(in);
385 	assert(lengths.empty());
386 	assert(g_contigNames.empty());
387 	for (std::string line; in.peek() == '@' && getline(in, line);) {
388 		std::istringstream ss(line);
389 		std::string type;
390 		ss >> type;
391 		if (type != "@SQ")
392 			continue;
393 
394 		std::string s;
395 		unsigned len;
396 		ss >> expect(" SN:") >> s >> expect(" LN:") >> len;
397 		assert(ss);
398 
399 		put(g_contigNames, lengths.size(), s);
400 		lengths.push_back(len);
401 	}
402 	if (lengths.empty()) {
403 		std::cerr << "error: no @SQ records in the SAM header\n";
404 		exit(EXIT_FAILURE);
405 	}
406 	return lengths.size();
407 }
408 
409 #endif
410