1 #include <iostream>
2 #include <map>
3
4 #include <seqan/basic.h>
5 #include <seqan/sequence.h>
6 #include <seqan/stream.h>
7 #include <seqan/bam_io.h>
8
9 // USAGE: error_rate_from_sam IN.sam > OUT.txt
10
11 // There must only be one alignment per read.
12 //
13 // We ignore unaligned reads, and just count them.
14
main(int argc,char const ** argv)15 int main(int argc, char const ** argv)
16 {
17 using namespace seqan;
18
19 // Checking command line parameters.
20
21 if (argc != 2)
22 {
23 std::cerr << "USAGE: error_rate IN.sam > OUT.txt\n";
24 return 1;
25 }
26
27 // Opening file and record reader.
28
29 std::fstream in(argv[1], std::ios::in | std::ios::binary);
30 RecordReader<std::fstream, SinglePass<> > reader(in);
31
32 // Allocating BAM header, reference name store, and cache.
33
34 typedef StringSet<CharString> TNameStore;
35 typedef NameStoreCache<TNameStore> TNameStoreCache;
36
37 TNameStore refNameStore;
38 TNameStoreCache refNameStoreCache(refNameStore);
39 BamIOContext<TNameStore> context(refNameStore, refNameStoreCache);
40
41 // Read header.
42
43 BamHeader header;
44 int res = 0;
45 res = readRecord(header, context, reader, Sam());
46 if (res != 0)
47 {
48 std::cerr << "Could not read SAM header!\n";
49 return 1;
50 }
51
52 // Check that the SAM file is sorted by QNAME.
53 CharString sortOrder;
54 for (unsigned i = 0; i < length(header.records); ++i)
55 {
56 if (header.records[i].type != BAM_HEADER_FIRST)
57 continue;
58 unsigned idx = 0;
59 if (findTagKey(idx, "SO", header.records[i]))
60 sortOrder = header.records[i].tags[idx].i2;
61 }
62 if (sortOrder != "queryname")
63 {
64 std::cerr << "SAM file not sorted by 'queryname'!\n";
65 return 1;
66 }
67
68 // Allocate data structures for counting / histogram building.
69 unsigned totalBaseCount = 0; // Number of bases read.
70 unsigned totalErrorCount = 0; // Number of errors read.
71 unsigned totalReadCount = 0; // Number of reads read.
72 unsigned totalErrorneousReadCount = 0; // Number of reads with errors read, excluding unaligned reads.
73 unsigned totalUnalignedReadCount = 0; // Number of reads without alignments.
74 std::map<unsigned, unsigned> histo; // Histogram error count -> num occurrences.
75
76 // Read records
77
78 BamAlignmentRecord record;
79 CharString previousQName;
80 bool previousIsFirst = false;
81
82 while (!atEnd(reader))
83 {
84 res = readRecord(record, context, reader, Sam());
85 if (res != 0)
86 {
87 std::cerr << "Error reading SAM record!\n";
88 return 1;
89 }
90
91 // Skip secondary records.
92 if (hasFlagSecondary(record))
93 continue;
94
95 // Skip non-aligning records.
96 if (hasFlagUnaligned(record))
97 continue;
98
99 // Check that this is not a duplicate non-secondary record.
100 if (record.qName == previousQName && hasFlagFirst(record) == previousIsFirst)
101 {
102 std::cerr << "ERROR: Duplicate non-secondary record for " << record.qName << "\n";
103 return 1;
104 }
105
106 BamTagsDict bamTags(record.tags);
107
108 // Counter: Total reads, unaligned reads.
109 if (record.rId == BamAlignmentRecord::INVALID_REFID)
110 {
111 totalUnalignedReadCount += 1;
112 continue;
113 }
114 totalReadCount += 1;
115
116 // Get tag with edit distance, must be present for aligned reads.
117 unsigned idx = 0;
118 if (!findTagKey(idx, bamTags, "NM"))
119 {
120 std::cerr << "ERROR: Could not find NM tag!\n";
121 return 1;
122 }
123 int editDistance = 0;
124 if (!extractTagValue(editDistance, bamTags, idx))
125 {
126 std::cerr << "ERROR: Could not cast NM tag to int!\n";
127 return 1;
128 }
129
130 // Count: Reads with errors.
131 totalErrorneousReadCount += (editDistance != 0);
132
133 // Count: Bases.
134 totalBaseCount += length(record.seq);
135 totalErrorCount += editDistance;
136
137 // Register with histogram.
138 histo[editDistance] += 1;
139
140 // Update previous QNAME and is-first flag.
141 previousQName = record.qName;
142 previousIsFirst = hasFlagFirst(record);
143 }
144
145 // Print results.
146
147 // The quick stats are what we need for the table in the paper.
148 std::cout << "QUICK STATS\n"
149 << "by base %\tby read %\n";
150 fprintf(stdout, "%5.2f\t\t%5.2f\n\n", 100.0 * totalErrorCount / totalBaseCount, 100.0 * totalErrorneousReadCount / totalReadCount);
151
152 // Print detailed statistics and histogram.
153 std::cout << "STATISTICS\n"
154 << "total read count " << totalReadCount << "\t\t(excludes unaligned reads)\n"
155 << "unaligned read count " << totalUnalignedReadCount << "\n"
156 << "erroneous read count " << totalErrorneousReadCount << "\n"
157 << "per read error rate " << 100.0 * totalErrorneousReadCount / totalReadCount << "\n"
158 << "\n"
159 << "total bases " << totalBaseCount << "\n"
160 << "total errors " << totalErrorCount << "\n"
161 << "per base error rate " << 100.0 * totalErrorCount / totalBaseCount << "\n"
162 << "\n"
163 << "HISTOGRAM\n"
164 << "errors\tcount\t\tpercent\n";
165 for (std::map<unsigned, unsigned>::const_iterator it = histo.begin(); it != histo.end(); ++it)
166 {
167 fprintf(stdout, "%3d\t%12u\t%5.2f\n", it->first, it->second, 100.0 * it->second / totalReadCount);
168 }
169
170 return 0;
171 }
172