1 #include <iostream>
2 #include <map>
3 
4 #include <seqan/basic.h>
5 #include <seqan/sequence.h>
6 #include <seqan/stream.h>
7 #include <seqan/bam_io.h>
8 
9 // USAGE: error_rate_from_sam IN.sam > OUT.txt
10 
11 // There must only be one alignment per read.
12 //
13 // We ignore unaligned reads, and just count them.
14 
main(int argc,char const ** argv)15 int main(int argc, char const ** argv)
16 {
17     using namespace seqan;
18 
19     // Checking command line parameters.
20 
21     if (argc != 2)
22     {
23         std::cerr << "USAGE: error_rate IN.sam > OUT.txt\n";
24         return 1;
25     }
26 
27     // Opening file and record reader.
28 
29     std::fstream in(argv[1], std::ios::in | std::ios::binary);
30     RecordReader<std::fstream, SinglePass<> > reader(in);
31 
32     // Allocating BAM header, reference name store, and cache.
33 
34     typedef StringSet<CharString>      TNameStore;
35     typedef NameStoreCache<TNameStore> TNameStoreCache;
36 
37     TNameStore refNameStore;
38     TNameStoreCache refNameStoreCache(refNameStore);
39     BamIOContext<TNameStore> context(refNameStore, refNameStoreCache);
40 
41     // Read header.
42 
43     BamHeader header;
44     int res = 0;
45     res = readRecord(header, context, reader, Sam());
46     if (res != 0)
47     {
48         std::cerr << "Could not read SAM header!\n";
49         return 1;
50     }
51 
52     // Check that the SAM file is sorted by QNAME.
53     CharString sortOrder;
54     for (unsigned i = 0; i < length(header.records); ++i)
55     {
56         if (header.records[i].type != BAM_HEADER_FIRST)
57             continue;
58         unsigned idx = 0;
59         if (findTagKey(idx, "SO", header.records[i]))
60             sortOrder = header.records[i].tags[idx].i2;
61     }
62     if (sortOrder != "queryname")
63     {
64         std::cerr << "SAM file not sorted by 'queryname'!\n";
65         return 1;
66     }
67 
68     // Allocate data structures for counting / histogram building.
69     unsigned totalBaseCount = 0;            // Number of bases read.
70     unsigned totalErrorCount = 0;           // Number of errors read.
71     unsigned totalReadCount = 0;            // Number of reads read.
72     unsigned totalErrorneousReadCount = 0;  // Number of reads with errors read, excluding unaligned reads.
73     unsigned totalUnalignedReadCount = 0;   // Number of reads without alignments.
74     std::map<unsigned, unsigned> histo;     // Histogram error count -> num occurrences.
75 
76     // Read records
77 
78     BamAlignmentRecord record;
79     CharString previousQName;
80     bool previousIsFirst = false;
81 
82     while (!atEnd(reader))
83     {
84         res = readRecord(record, context, reader, Sam());
85         if (res != 0)
86         {
87             std::cerr << "Error reading SAM record!\n";
88             return 1;
89         }
90 
91         // Skip secondary records.
92         if (hasFlagSecondary(record))
93             continue;
94 
95         // Skip non-aligning records.
96         if (hasFlagUnaligned(record))
97             continue;
98 
99         // Check that this is not a duplicate non-secondary record.
100         if (record.qName == previousQName && hasFlagFirst(record) == previousIsFirst)
101         {
102             std::cerr << "ERROR: Duplicate non-secondary record for " << record.qName << "\n";
103             return 1;
104         }
105 
106         BamTagsDict bamTags(record.tags);
107 
108         // Counter: Total reads, unaligned reads.
109         if (record.rId == BamAlignmentRecord::INVALID_REFID)
110         {
111             totalUnalignedReadCount += 1;
112             continue;
113         }
114         totalReadCount += 1;
115 
116         // Get tag with edit distance, must be present for aligned reads.
117         unsigned idx = 0;
118         if (!findTagKey(idx, bamTags, "NM"))
119         {
120             std::cerr << "ERROR: Could not find NM tag!\n";
121             return 1;
122         }
123         int editDistance = 0;
124         if (!extractTagValue(editDistance, bamTags, idx))
125         {
126             std::cerr << "ERROR: Could not cast NM tag to int!\n";
127             return 1;
128         }
129 
130         // Count: Reads with errors.
131         totalErrorneousReadCount += (editDistance != 0);
132 
133         // Count: Bases.
134         totalBaseCount += length(record.seq);
135         totalErrorCount += editDistance;
136 
137         // Register with histogram.
138         histo[editDistance] += 1;
139 
140         // Update previous QNAME and is-first flag.
141         previousQName = record.qName;
142         previousIsFirst = hasFlagFirst(record);
143     }
144 
145     // Print results.
146 
147     // The quick stats are what we need for the table in the paper.
148     std::cout << "QUICK STATS\n"
149               << "by base %\tby read %\n";
150     fprintf(stdout, "%5.2f\t\t%5.2f\n\n", 100.0 * totalErrorCount / totalBaseCount, 100.0 * totalErrorneousReadCount / totalReadCount);
151 
152     // Print detailed statistics and histogram.
153     std::cout << "STATISTICS\n"
154               << "total read count      " << totalReadCount << "\t\t(excludes unaligned reads)\n"
155               << "unaligned read count  " << totalUnalignedReadCount << "\n"
156               << "erroneous read count " << totalErrorneousReadCount << "\n"
157               << "per read error rate   " << 100.0 * totalErrorneousReadCount / totalReadCount << "\n"
158               << "\n"
159               << "total bases           " << totalBaseCount << "\n"
160               << "total errors          " << totalErrorCount << "\n"
161               << "per base error rate   " << 100.0 * totalErrorCount / totalBaseCount << "\n"
162               << "\n"
163               << "HISTOGRAM\n"
164               << "errors\tcount\t\tpercent\n";
165     for (std::map<unsigned, unsigned>::const_iterator it = histo.begin(); it != histo.end(); ++it)
166     {
167         fprintf(stdout, "%3d\t%12u\t%5.2f\n", it->first, it->second, 100.0 * it->second / totalReadCount);
168     }
169 
170     return 0;
171 }
172