1 #include "Aligner.h"
2 #include "Iterator.h"
3 #include "SAM.h"
4 #include "Sequence.h"
5 #include <algorithm>
6 #include <cassert>
7 #include <cstdlib>
8 #include <iterator>
9 #include <utility>
10 
11 using namespace std;
12 
13 namespace opt {
14 	/** For a duplicate k-mer in the target
15 	 * ERROR: report an error and exit
16 	 * MULTIMAP: report all alignments
17 	 * IGNORE: do not report any alignments
18 	 */
19 	int multimap;
20 }
21 
22 template <>
addReferenceSequence(const Kmer & kmer,Position pos)23 void Aligner<SeqPosHashMultiMap>::addReferenceSequence(
24 		const Kmer& kmer, Position pos)
25 {
26 	assert(opt::multimap == opt::MULTIMAP);
27 	m_target.insert(make_pair(kmer, pos));
28 }
29 
30 template <class SeqPosHashMap>
addReferenceSequence(const Kmer & kmer,Position pos)31 void Aligner<SeqPosHashMap>::addReferenceSequence(
32 		const Kmer& kmer, Position pos)
33 {
34 	assert(opt::multimap != opt::MULTIMAP);
35 	Kmer rc_kmer = reverseComplement(kmer);
36 	map_iterator findIt = m_target.find(rc_kmer);
37 
38 	if (findIt != m_target.end()) {
39 		if (!findIt->second.isDuplicate())
40 			findIt->second.setDuplicate(
41 					contigIndexToID(findIt->second.contig),
42 					contigIndexToID(pos.contig), kmer.str());
43 		return;
44 	}
45 
46 	pair<map_iterator, bool> inserted
47 		= m_target.insert(make_pair(kmer, pos));
48 
49 	if (!inserted.second)
50 		if (!inserted.first->second.isDuplicate())
51 			inserted.first->second.setDuplicate(
52 					contigIndexToID(inserted.first->second.contig),
53 					contigIndexToID(pos.contig), kmer.str());
54 }
55 
56 /** Create an index of the target sequence. */
57 template <class SeqPosHashMap>
addReferenceSequence(const StringID & idString,const Sequence & seq)58 void Aligner<SeqPosHashMap>::addReferenceSequence(
59 		const StringID& idString, const Sequence& seq)
60 {
61 	unsigned id = contigIDToIndex(idString);
62 	int size = seq.length();
63 	for(int i = 0; i < (size - m_hashSize + 1); ++i)
64 	{
65 		Sequence subseq(seq, i, m_hashSize);
66 		if (subseq.find_first_not_of("ACGT0123") != string::npos)
67 			continue;
68 		addReferenceSequence(Kmer(subseq), Position(id, i));
69 	}
70 }
71 
72 template <class SeqPosHashMap>
73 template <class oiterator>
alignRead(const string & qid,const Sequence & seq,oiterator dest)74 void Aligner<SeqPosHashMap>::alignRead(
75 		const string& qid, const Sequence& seq,
76 		oiterator dest)
77 {
78 	coalesceAlignments(qid, seq,
79 			getAlignmentsInternal(seq, false), dest);
80 	Sequence seqrc = reverseComplement(seq);
81 	coalesceAlignments(qid, seqrc,
82 			getAlignmentsInternal(seqrc, true), dest);
83 }
84 
85 /** Store all alignments for a given Kmer in the parameter aligns.
86  *  @param[out] aligns Map of contig IDs to alignment vectors.
87  */
88 template <class SeqPosHashMap>
alignKmer(AlignmentSet & aligns,const Sequence & seq,bool isRC,bool good,int read_ind,int seqLen)89 void Aligner<SeqPosHashMap>::alignKmer(
90 		AlignmentSet& aligns, const Sequence& seq,
91 		bool isRC, bool good, int read_ind, int seqLen)
92 {
93 	assert(read_ind >= 0);
94 	Sequence kmer(seq, read_ind, m_hashSize);
95 	if (!good && kmer.find_first_not_of("ACGT0123") != string::npos)
96 		return;
97 
98 	pair<map_const_iterator, map_const_iterator> range
99 		= m_target.equal_range(Kmer(kmer));
100 
101 	if (range.first != range.second
102 				&& opt::multimap == opt::IGNORE
103 				&& range.first->second.isDuplicate())
104 		return;
105 
106 	for (map_const_iterator resultIter = range.first;
107 			resultIter != range.second; ++resultIter) {
108 		assert(opt::multimap != opt::IGNORE
109 				|| !resultIter->second.isDuplicate());
110 
111 		int read_pos = !isRC ? read_ind
112 			: Alignment::calculateReverseReadStart(
113 					read_ind, seqLen, m_hashSize);
114 		unsigned ctgIndex = resultIter->second.contig;
115 		Alignment align(string(),
116 				resultIter->second.pos, read_pos, m_hashSize,
117 				seqLen, isRC);
118 		aligns[ctgIndex].push_back(align);
119 	}
120 }
121 
122 template <class SeqPosHashMap>
123 typename Aligner<SeqPosHashMap>::AlignmentSet
getAlignmentsInternal(const Sequence & seq,bool isRC)124 Aligner<SeqPosHashMap>::getAlignmentsInternal(
125 		const Sequence& seq, bool isRC)
126 {
127 	// The results
128 	AlignmentSet aligns;
129 
130 	bool good = seq.find_first_not_of("ACGT0123") == string::npos;
131 	int seqLen = seq.length();
132 	int last_kmer = seqLen - m_hashSize;
133 
134 	if (last_kmer < 0)
135 		return aligns;
136 
137 	// Align the first kmer
138 	alignKmer(aligns, seq, isRC, good, 0, seqLen);
139 
140 	if (last_kmer == 0)
141 		return aligns;
142 
143 	// Align the last kmer
144 	alignKmer(aligns, seq, isRC, good, last_kmer, seqLen);
145 
146 	// Short-cut logic ignoring the middle alignments if the first
147 	// and last kmers overlap, and align to the same contig
148 	if (good && seqLen <= 2 * m_hashSize && aligns.size() == 1) {
149 		AlignmentSet::const_iterator ctgIter = aligns.begin();
150 		const AlignmentVector& a = ctgIter->second;
151 		if (ctgIter->second.size() == 2) {
152 			int qstep = isRC
153 					? a[0].read_start_pos - a[1].read_start_pos
154 					: a[1].read_start_pos - a[0].read_start_pos;
155 			assert(qstep >= 0);
156 
157 			// Verify this isn't a kmer aligning to two parts of a
158 			// contig, and that the alignments are coalescable.
159 			if (qstep == last_kmer &&
160 					a[1].contig_start_pos
161 						== a[0].contig_start_pos + qstep)
162 				return aligns;
163 		}
164 	}
165 
166 	// Align middle kmers
167 	for(int i = 1; i < last_kmer; ++i)
168 		alignKmer(aligns, seq, isRC, good, i, seqLen);
169 
170 	return aligns;
171 }
172 
compareQueryPos(const Alignment & a1,const Alignment & a2)173 static int compareQueryPos(const Alignment& a1, const Alignment& a2)
174 {
175 	return a1.read_start_pos < a2.read_start_pos;
176 }
177 
178 /** Coalesce the k-mer alignments into a read alignment. */
179 template <class SeqPosHashMap>
180 template <class oiterator>
coalesceAlignments(const string & qid,const string & seq,const AlignmentSet & alignSet,oiterator & dest)181 void Aligner<SeqPosHashMap>::coalesceAlignments(
182 		const string& qid, const string& seq,
183 		const AlignmentSet& alignSet,
184 		oiterator& dest)
185 {
186 	typedef typename output_iterator_traits<oiterator>::value_type
187 		value_type;
188 	for (AlignmentSet::const_iterator ctgIter = alignSet.begin();
189 			ctgIter != alignSet.end(); ++ctgIter) {
190 		AlignmentVector alignVec = ctgIter->second;
191 		assert(!alignVec.empty());
192 
193 		sort(alignVec.begin(), alignVec.end(), compareQueryPos);
194 
195 		AlignmentVector::iterator prevIter = alignVec.begin();
196 		AlignmentVector::iterator currIter = alignVec.begin() + 1;
197 		Alignment currAlign = *prevIter;
198 		while (currIter != alignVec.end()) {
199 			int qstep = currIter->read_start_pos -
200 						prevIter->read_start_pos;
201 			assert(qstep >= 0);
202 			int tstep = currIter->isRC ? -qstep : qstep;
203 			if (currIter->contig_start_pos
204 						== prevIter->contig_start_pos + tstep
205 					&& qstep <= m_hashSize) {
206 				currAlign.align_length += qstep;
207 				if (currAlign.isRC)
208 					currAlign.contig_start_pos -= qstep;
209 			} else {
210 				currAlign.contig = contigIndexToID(ctgIter->first);
211 				*dest++ = value_type(currAlign, qid, seq);
212 				currAlign = *currIter;
213 			}
214 
215 			prevIter = currIter;
216 			currIter++;
217 		}
218 
219 		currAlign.contig = contigIndexToID(ctgIter->first);
220 		*dest++ = value_type(currAlign, qid, seq);
221 	}
222 }
223 
224 // Explicit instantiation.
225 template void Aligner<SeqPosHashMultiMap>::addReferenceSequence(
226 		const StringID& id, const Sequence& seq);
227 
228 template void Aligner<SeqPosHashUniqueMap>::addReferenceSequence(
229 		const StringID& id, const Sequence& seq);
230 
231 template void Aligner<SeqPosHashMultiMap>::
232 alignRead<affix_ostream_iterator<Alignment> >(
233 		const string& qid, const Sequence& seq,
234 		affix_ostream_iterator<Alignment> dest);
235 
236 template void Aligner<SeqPosHashUniqueMap>::
237 alignRead<affix_ostream_iterator<Alignment> >(
238 		const string& qid, const Sequence& seq,
239 		affix_ostream_iterator<Alignment> dest);
240 
241 template void Aligner<SeqPosHashMultiMap>::
242 alignRead<ostream_iterator<SAMRecord> >(
243 		const string& qid, const Sequence& seq,
244 		ostream_iterator<SAMRecord> dest);
245 
246 template void Aligner<SeqPosHashUniqueMap>::
247 alignRead<ostream_iterator<SAMRecord> >(
248 		const string& qid, const Sequence& seq,
249 		ostream_iterator<SAMRecord> dest);
250