1 #ifndef SAM_H
2 #define SAM_H 1
3
4 #include "config.h" // for SAM_SEQ_QUAL
5 #include "IOUtil.h"
6 #include "Alignment.h"
7 #include "ContigID.h" // for g_contigNames
8 #include <algorithm> // for swap
9 #include <cstdlib> // for exit
10 #include <iostream>
11 #include <limits> // for numeric_limits
12 #include <sstream>
13 #include <string>
14
15 namespace opt {
16 /** The minimal alignment size. */
17 static unsigned minAlign = 1;
18 }
19
20 /** A SAM alignment of a single query. */
21 struct SAMAlignment {
22 std::string rname;
23 int pos;
24 unsigned short flag;
25 unsigned short mapq;
26 std::string cigar;
27
28 /** Flag */
29 enum {
30 /** the read is paired in sequencing, no matter whether it is
31 * mapped in a pair */
32 FPAIRED = 1,
33 /** the read is mapped in a proper pair */
34 FPROPER_PAIR = 2,
35 /** the read itself is unmapped; conflictive with FPROPER_PAIR
36 */
37 FUNMAP = 4,
38 /** the mate is unmapped */
39 FMUNMAP = 8,
40 /** the read is mapped to the reverse strand */
41 FREVERSE = 16,
42 /** the mate is mapped to the reverse strand */
43 FMREVERSE = 32,
44 /** this is read1 */
45 FREAD1 = 64,
46 /** this is read2 */
47 FREAD2 = 128,
48 /** not primary alignment */
49 FSECONDARY = 256,
50 /** QC failure */
51 FQCFAIL = 512,
52 /** optical or PCR duplicate */
53 FDUP = 1024,
54 };
55
SAMAlignmentSAMAlignment56 SAMAlignment() :
57 rname("*"),
58 pos(-1),
59 flag(FUNMAP),
60 mapq(0) { }
61
62 /** Consturct a single-end alignment. */
SAMAlignmentSAMAlignment63 SAMAlignment(const Alignment& a) :
64 rname(a.contig),
65 pos(a.contig_start_pos),
66 flag(a.isRC ? FREVERSE : 0),
67 mapq(255)
68 {
69 unsigned qend = a.read_start_pos + a.align_length;
70 int clip0 = a.read_start_pos;
71 int clip1 = a.read_length - qend;
72 assert(clip1 >= 0);
73 if (a.isRC)
74 std::swap(clip0, clip1);
75 std::ostringstream s;
76 if (clip0 > 0)
77 s << clip0 << 'S';
78 s << a.align_length << 'M';
79 if (clip1 > 0)
80 s << clip1 << 'S';
81 cigar = s.str();
82 }
83
isPairedSAMAlignment84 bool isPaired() const { return flag & FPAIRED; }
isUnmappedSAMAlignment85 bool isUnmapped() const { return flag & FUNMAP; }
isMateUnmappedSAMAlignment86 bool isMateUnmapped() const { return flag & FMUNMAP; }
isReverseSAMAlignment87 bool isReverse() const { return flag & FREVERSE; }
isMateReverseSAMAlignment88 bool isMateReverse() const { return flag & FMREVERSE; }
isRead1SAMAlignment89 bool isRead1() const { return flag & FREAD1; }
isRead2SAMAlignment90 bool isRead2() const { return flag & FREAD2; }
91
92 /** The alignment coordinates of a gapped alignment. */
93 struct CigarCoord {
94 /** The length of the query sequence. */
95 unsigned qlen;
96 /** The start of the alignment on the query. */
97 unsigned qstart;
98 /** The length of the alignment on the query. */
99 unsigned qspan;
100 /** The length of the alignment on the target. */
101 unsigned tspan;
102
103 /** Parse the specified CIGAR string. */
CigarCoordSAMAlignment::CigarCoord104 CigarCoord(const std::string& cigar)
105 : qlen(0), qstart(0), qspan(0), tspan(0)
106 {
107 if (cigar == "*")
108 return;
109 std::istringstream in(cigar);
110 bool first = true;
111 unsigned len;
112 char type;
113 while (in >> len >> type) {
114 switch (type) {
115 case 'H': case 'S':
116 if (first)
117 qstart = len;
118 qlen += len;
119 break;
120 case 'M': case 'X': case '=':
121 qlen += len;
122 qspan += len;
123 tspan += len;
124 break;
125 case 'I':
126 qlen += len;
127 qspan += len;
128 break;
129 case 'D': case 'N': case 'P':
130 tspan += len;
131 break;
132 default:
133 std::cerr << "error: invalid CIGAR: `"
134 << cigar << "'\n";
135 exit(EXIT_FAILURE);
136 }
137 first = false;
138 }
139 assert(in.eof());
140 }
141 };
142
143 /**
144 * Return the position of the first base of the query on the
145 * target extrapolated from the start of the alignment.
146 */
targetAtQueryStartSAMAlignment147 int targetAtQueryStart() const
148 {
149 CigarCoord a(cigar);
150 assert(a.qstart + a.qspan <= a.qlen);
151 return isReverse()
152 ? pos + a.tspan + (a.qlen - a.qspan - a.qstart)
153 : pos - a.qstart;
154 }
155
156 /** Parse the specified CIGAR string.
157 * @return an alignment setting the fields read_start_pos,
158 * align_length, and read_length. The other fields will be
159 * uninitialized.
160 */
parseCigarSAMAlignment161 static Alignment parseCigar(const std::string& cigar, bool isRC) {
162 Alignment a;
163 std::istringstream in(cigar);
164 unsigned len;
165 char type;
166 unsigned clip0 = 0;
167 a.align_length = 0;
168 unsigned qlen = 0;
169 unsigned clip1 = 0;
170 while (in >> len >> type) {
171 switch (type) {
172 case 'I': case 'X': case '=':
173 qlen += len;
174 clip1 += len;
175 // fallthrough
176 case 'D': case 'N': case 'P':
177 if (a.align_length == 0) {
178 // Ignore a malformatted CIGAR string whose first
179 // non-clipping operation is not M.
180 std::cerr << "warning: malformatted CIGAR: "
181 << cigar << std::endl;
182 }
183 break;
184 case 'M':
185 if ((unsigned)a.align_length < len) {
186 clip0 += a.align_length + clip1;
187 a.align_length = len;
188 qlen += len;
189 clip1 = 0;
190 break;
191 }
192 // fallthrough
193 case 'H': case 'S':
194 qlen += len;
195 clip1 += len;
196 break;
197 default:
198 std::cerr << "error: invalid CIGAR: `"
199 << cigar << "'\n";
200 exit(EXIT_FAILURE);
201 }
202 }
203 a.read_start_pos = isRC ? clip1 : clip0;
204 a.read_length = qlen;
205 if (!in.eof()){
206 std::cerr << "error: invalid CIGAR: `"
207 << cigar << "'\n";
208 exit(EXIT_FAILURE);
209 }
210 return a;
211 }
212
AlignmentSAMAlignment213 operator Alignment() const {
214 assert(~flag & FUNMAP);
215 bool isRC = flag & FREVERSE; // strand of the query
216 Alignment a = parseCigar(cigar, isRC);
217 a.contig = rname;
218 a.contig_start_pos = pos;
219 a.isRC = isRC;
220 return a;
221 }
222 };
223
224 /** A SAM alignment of a query and its mate. */
225 struct SAMRecord : SAMAlignment {
226 std::string qname;
227 std::string mrnm;
228 int mpos;
229 int isize;
230 #if SAM_SEQ_QUAL
231 std::string seq;
232 std::string qual;
233 std::string tags;
234 #endif
235
236 /** Consturct a single-end alignment. */
237 explicit SAMRecord(const SAMAlignment& a = SAMAlignment(),
238 const std::string& qname = "*",
239 #if SAM_SEQ_QUAL
240 const std::string& seq = "*",
241 const std::string& qual = "*",
242 const std::string& tags = ""
243 #else
244 const std::string& /*seq*/ = "*",
245 const std::string& /*qual*/ = "*",
246 const std::string& /*tags*/ = ""
247 #endif
248 ) :
SAMAlignmentSAMRecord249 SAMAlignment(a),
250 qname(qname),
251 mrnm("*"),
252 mpos(-1),
253 isize(0)
254 #if SAM_SEQ_QUAL
255 ,
256 seq(seq),
257 qual(qual),
258 tags(tags)
259 #endif
260 {
261 }
262
263 /** Construct a paired-end alignment. */
SAMRecordSAMRecord264 SAMRecord(const Alignment& a0, const Alignment& a1)
265 {
266 *this = SAMRecord(a0);
267 flag |= FPAIRED;
268 if (a1.isRC)
269 flag |= FMREVERSE;
270 mrnm = a1.contig;
271 mpos = a1.contig_start_pos;
272 isize = a1.targetAtQueryStart() - a0.targetAtQueryStart();
273 }
274
275 /** Set the mate mapping fields. */
fixMateSAMRecord276 void fixMate(const SAMAlignment& o)
277 {
278 flag &= ~(FPROPER_PAIR | FMUNMAP | FMREVERSE);
279 flag |= FPAIRED;
280 if (rname == o.rname && isReverse() != o.isReverse())
281 flag |= FPROPER_PAIR;
282 if (o.isUnmapped())
283 flag |= FMUNMAP;
284 if (o.isReverse())
285 flag |= FMREVERSE;
286 mrnm = o.rname;
287 mpos = o.pos;
288 isize = isMateUnmapped() ? 0
289 : o.targetAtQueryStart() - targetAtQueryStart();
290
291 // Fix unaligned mates
292 if (!o.isUnmapped() && isUnmapped()) {
293 rname = o.rname;
294 pos = o.pos;
295 } else if (o.isUnmapped() && !isUnmapped()) {
296 mrnm = "=";
297 mpos = pos;
298 }
299 }
300
noMateSAMRecord301 void noMate() { flag &= ~FPAIRED; }
302
303 /**
304 * Return the position of the first base of the mate query on the
305 * target extrapolated from the start of the alignment.
306 */
mateTargetAtQueryStartSAMRecord307 int mateTargetAtQueryStart() const
308 {
309 return targetAtQueryStart() + isize;
310 }
311
312 friend std::ostream& operator <<(std::ostream& out,
313 const SAMRecord& o)
314 {
315 out << o.qname
316 << '\t' << o.flag
317 << '\t' << o.rname
318 << '\t' << (1 + o.pos)
319 << '\t' << o.mapq
320 << '\t' << o.cigar
321 << '\t' << (o.mrnm == o.rname ? "=" : o.mrnm)
322 << '\t' << (1 + o.mpos)
323 << '\t' << o.isize;
324 #if SAM_SEQ_QUAL
325 out << '\t' << o.seq
326 << '\t' << o.qual;
327 if (!o.tags.empty())
328 out << '\t' << o.tags;
329 #else
330 out << "\t*\t*";
331 #endif
332 return out;
333 }
334
335 friend std::istream& operator >>(std::istream& in, SAMRecord& o)
336 {
337 in >> o.qname
338 >> o.flag >> o.rname >> o.pos >> o.mapq
339 >> o.cigar >> o.mrnm >> o.mpos >> o.isize;
340 #if SAM_SEQ_QUAL
341 in >> o.seq >> o.qual;
342 getline(in >> Skip('\t'), o.tags);
343 #else
344 in.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
345 #endif
346 if (!in)
347 return in;
348 o.pos--;
349 o.mpos--;
350 if (o.mrnm == "=")
351 o.mrnm = o.rname;
352
353 // Set the paired flags if qname ends in /1 or /2.
354 unsigned l = o.qname.length();
355 if (l >= 2 && o.qname[l-2] == '/') {
356 switch (o.qname[l-1]) {
357 case '1': o.flag |= FPAIRED | FREAD1; break;
358 case '2':
359 case '3': o.flag |= FPAIRED | FREAD2; break;
360 default: return in;
361 }
362 o.qname.resize(l - 2);
363 assert(!o.qname.empty());
364 }
365
366 // Set the unmapped flag if the alignment is not long enough.
367 CigarCoord a(o.cigar);
368 if (a.qspan < opt::minAlign || a.tspan < opt::minAlign)
369 o.flag |= FUNMAP;
370 return in;
371 }
372 };
373
374 /** Set the mate mapping fields of a0 and a1. */
fixMate(SAMRecord & a0,SAMRecord & a1)375 static inline void fixMate(SAMRecord& a0, SAMRecord& a1)
376 {
377 a0.fixMate(a1);
378 a1.fixMate(a0);
379 }
380
381 /** Read contig lengths from SAM headers. */
readContigLengths(std::istream & in,std::vector<unsigned> & lengths)382 static inline unsigned readContigLengths(std::istream& in, std::vector<unsigned>& lengths)
383 {
384 assert(in);
385 assert(lengths.empty());
386 assert(g_contigNames.empty());
387 for (std::string line; in.peek() == '@' && getline(in, line);) {
388 std::istringstream ss(line);
389 std::string type;
390 ss >> type;
391 if (type != "@SQ")
392 continue;
393
394 std::string s;
395 unsigned len;
396 ss >> expect(" SN:") >> s >> expect(" LN:") >> len;
397 assert(ss);
398
399 put(g_contigNames, lengths.size(), s);
400 lengths.push_back(len);
401 }
402 if (lengths.empty()) {
403 std::cerr << "error: no @SQ records in the SAM header\n";
404 exit(EXIT_FAILURE);
405 }
406 return lengths.size();
407 }
408
409 #endif
410