1 #include "Aligner.h"
2 #include "Iterator.h"
3 #include "SAM.h"
4 #include "Sequence.h"
5 #include <algorithm>
6 #include <cassert>
7 #include <cstdlib>
8 #include <iterator>
9 #include <utility>
10
11 using namespace std;
12
13 namespace opt {
14 /** For a duplicate k-mer in the target
15 * ERROR: report an error and exit
16 * MULTIMAP: report all alignments
17 * IGNORE: do not report any alignments
18 */
19 int multimap;
20 }
21
22 template <>
addReferenceSequence(const Kmer & kmer,Position pos)23 void Aligner<SeqPosHashMultiMap>::addReferenceSequence(
24 const Kmer& kmer, Position pos)
25 {
26 assert(opt::multimap == opt::MULTIMAP);
27 m_target.insert(make_pair(kmer, pos));
28 }
29
30 template <class SeqPosHashMap>
addReferenceSequence(const Kmer & kmer,Position pos)31 void Aligner<SeqPosHashMap>::addReferenceSequence(
32 const Kmer& kmer, Position pos)
33 {
34 assert(opt::multimap != opt::MULTIMAP);
35 Kmer rc_kmer = reverseComplement(kmer);
36 map_iterator findIt = m_target.find(rc_kmer);
37
38 if (findIt != m_target.end()) {
39 if (!findIt->second.isDuplicate())
40 findIt->second.setDuplicate(
41 contigIndexToID(findIt->second.contig),
42 contigIndexToID(pos.contig), kmer.str());
43 return;
44 }
45
46 pair<map_iterator, bool> inserted
47 = m_target.insert(make_pair(kmer, pos));
48
49 if (!inserted.second)
50 if (!inserted.first->second.isDuplicate())
51 inserted.first->second.setDuplicate(
52 contigIndexToID(inserted.first->second.contig),
53 contigIndexToID(pos.contig), kmer.str());
54 }
55
56 /** Create an index of the target sequence. */
57 template <class SeqPosHashMap>
addReferenceSequence(const StringID & idString,const Sequence & seq)58 void Aligner<SeqPosHashMap>::addReferenceSequence(
59 const StringID& idString, const Sequence& seq)
60 {
61 unsigned id = contigIDToIndex(idString);
62 int size = seq.length();
63 for(int i = 0; i < (size - m_hashSize + 1); ++i)
64 {
65 Sequence subseq(seq, i, m_hashSize);
66 if (subseq.find_first_not_of("ACGT0123") != string::npos)
67 continue;
68 addReferenceSequence(Kmer(subseq), Position(id, i));
69 }
70 }
71
72 template <class SeqPosHashMap>
73 template <class oiterator>
alignRead(const string & qid,const Sequence & seq,oiterator dest)74 void Aligner<SeqPosHashMap>::alignRead(
75 const string& qid, const Sequence& seq,
76 oiterator dest)
77 {
78 coalesceAlignments(qid, seq,
79 getAlignmentsInternal(seq, false), dest);
80 Sequence seqrc = reverseComplement(seq);
81 coalesceAlignments(qid, seqrc,
82 getAlignmentsInternal(seqrc, true), dest);
83 }
84
85 /** Store all alignments for a given Kmer in the parameter aligns.
86 * @param[out] aligns Map of contig IDs to alignment vectors.
87 */
88 template <class SeqPosHashMap>
alignKmer(AlignmentSet & aligns,const Sequence & seq,bool isRC,bool good,int read_ind,int seqLen)89 void Aligner<SeqPosHashMap>::alignKmer(
90 AlignmentSet& aligns, const Sequence& seq,
91 bool isRC, bool good, int read_ind, int seqLen)
92 {
93 assert(read_ind >= 0);
94 Sequence kmer(seq, read_ind, m_hashSize);
95 if (!good && kmer.find_first_not_of("ACGT0123") != string::npos)
96 return;
97
98 pair<map_const_iterator, map_const_iterator> range
99 = m_target.equal_range(Kmer(kmer));
100
101 if (range.first != range.second
102 && opt::multimap == opt::IGNORE
103 && range.first->second.isDuplicate())
104 return;
105
106 for (map_const_iterator resultIter = range.first;
107 resultIter != range.second; ++resultIter) {
108 assert(opt::multimap != opt::IGNORE
109 || !resultIter->second.isDuplicate());
110
111 int read_pos = !isRC ? read_ind
112 : Alignment::calculateReverseReadStart(
113 read_ind, seqLen, m_hashSize);
114 unsigned ctgIndex = resultIter->second.contig;
115 Alignment align(string(),
116 resultIter->second.pos, read_pos, m_hashSize,
117 seqLen, isRC);
118 aligns[ctgIndex].push_back(align);
119 }
120 }
121
122 template <class SeqPosHashMap>
123 typename Aligner<SeqPosHashMap>::AlignmentSet
getAlignmentsInternal(const Sequence & seq,bool isRC)124 Aligner<SeqPosHashMap>::getAlignmentsInternal(
125 const Sequence& seq, bool isRC)
126 {
127 // The results
128 AlignmentSet aligns;
129
130 bool good = seq.find_first_not_of("ACGT0123") == string::npos;
131 int seqLen = seq.length();
132 int last_kmer = seqLen - m_hashSize;
133
134 if (last_kmer < 0)
135 return aligns;
136
137 // Align the first kmer
138 alignKmer(aligns, seq, isRC, good, 0, seqLen);
139
140 if (last_kmer == 0)
141 return aligns;
142
143 // Align the last kmer
144 alignKmer(aligns, seq, isRC, good, last_kmer, seqLen);
145
146 // Short-cut logic ignoring the middle alignments if the first
147 // and last kmers overlap, and align to the same contig
148 if (good && seqLen <= 2 * m_hashSize && aligns.size() == 1) {
149 AlignmentSet::const_iterator ctgIter = aligns.begin();
150 const AlignmentVector& a = ctgIter->second;
151 if (ctgIter->second.size() == 2) {
152 int qstep = isRC
153 ? a[0].read_start_pos - a[1].read_start_pos
154 : a[1].read_start_pos - a[0].read_start_pos;
155 assert(qstep >= 0);
156
157 // Verify this isn't a kmer aligning to two parts of a
158 // contig, and that the alignments are coalescable.
159 if (qstep == last_kmer &&
160 a[1].contig_start_pos
161 == a[0].contig_start_pos + qstep)
162 return aligns;
163 }
164 }
165
166 // Align middle kmers
167 for(int i = 1; i < last_kmer; ++i)
168 alignKmer(aligns, seq, isRC, good, i, seqLen);
169
170 return aligns;
171 }
172
compareQueryPos(const Alignment & a1,const Alignment & a2)173 static int compareQueryPos(const Alignment& a1, const Alignment& a2)
174 {
175 return a1.read_start_pos < a2.read_start_pos;
176 }
177
178 /** Coalesce the k-mer alignments into a read alignment. */
179 template <class SeqPosHashMap>
180 template <class oiterator>
coalesceAlignments(const string & qid,const string & seq,const AlignmentSet & alignSet,oiterator & dest)181 void Aligner<SeqPosHashMap>::coalesceAlignments(
182 const string& qid, const string& seq,
183 const AlignmentSet& alignSet,
184 oiterator& dest)
185 {
186 typedef typename output_iterator_traits<oiterator>::value_type
187 value_type;
188 for (AlignmentSet::const_iterator ctgIter = alignSet.begin();
189 ctgIter != alignSet.end(); ++ctgIter) {
190 AlignmentVector alignVec = ctgIter->second;
191 assert(!alignVec.empty());
192
193 sort(alignVec.begin(), alignVec.end(), compareQueryPos);
194
195 AlignmentVector::iterator prevIter = alignVec.begin();
196 AlignmentVector::iterator currIter = alignVec.begin() + 1;
197 Alignment currAlign = *prevIter;
198 while (currIter != alignVec.end()) {
199 int qstep = currIter->read_start_pos -
200 prevIter->read_start_pos;
201 assert(qstep >= 0);
202 int tstep = currIter->isRC ? -qstep : qstep;
203 if (currIter->contig_start_pos
204 == prevIter->contig_start_pos + tstep
205 && qstep <= m_hashSize) {
206 currAlign.align_length += qstep;
207 if (currAlign.isRC)
208 currAlign.contig_start_pos -= qstep;
209 } else {
210 currAlign.contig = contigIndexToID(ctgIter->first);
211 *dest++ = value_type(currAlign, qid, seq);
212 currAlign = *currIter;
213 }
214
215 prevIter = currIter;
216 currIter++;
217 }
218
219 currAlign.contig = contigIndexToID(ctgIter->first);
220 *dest++ = value_type(currAlign, qid, seq);
221 }
222 }
223
224 // Explicit instantiation.
225 template void Aligner<SeqPosHashMultiMap>::addReferenceSequence(
226 const StringID& id, const Sequence& seq);
227
228 template void Aligner<SeqPosHashUniqueMap>::addReferenceSequence(
229 const StringID& id, const Sequence& seq);
230
231 template void Aligner<SeqPosHashMultiMap>::
232 alignRead<affix_ostream_iterator<Alignment> >(
233 const string& qid, const Sequence& seq,
234 affix_ostream_iterator<Alignment> dest);
235
236 template void Aligner<SeqPosHashUniqueMap>::
237 alignRead<affix_ostream_iterator<Alignment> >(
238 const string& qid, const Sequence& seq,
239 affix_ostream_iterator<Alignment> dest);
240
241 template void Aligner<SeqPosHashMultiMap>::
242 alignRead<ostream_iterator<SAMRecord> >(
243 const string& qid, const Sequence& seq,
244 ostream_iterator<SAMRecord> dest);
245
246 template void Aligner<SeqPosHashUniqueMap>::
247 alignRead<ostream_iterator<SAMRecord> >(
248 const string& qid, const Sequence& seq,
249 ostream_iterator<SAMRecord> dest);
250