1 /*
2 * Copyright (C) 2010 Regents of the University of Michigan
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <stdexcept>
19
20 #include "SamQuerySeqWithRefHelper.h"
21 #include "BaseUtilities.h"
22 #include "SamFlag.h"
23
SamQuerySeqWithRefIter(SamRecord & record,GenomeSequence & refSequence,bool forward)24 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
25 GenomeSequence& refSequence,
26 bool forward)
27 : myRecord(record),
28 myRefSequence(refSequence),
29 myCigar(NULL),
30 myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
31 myQueryIndex(0),
32 myForward(forward)
33 {
34 myCigar = myRecord.getCigarInfo();
35 myStartOfReadOnRefIndex =
36 refSequence.getGenomePosition(myRecord.getReferenceName());
37
38 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
39 {
40 // This reference name was found in the reference file, so
41 // add the start position.
42 myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
43 }
44
45 if(!forward)
46 {
47 myQueryIndex = myRecord.getReadLength() - 1;
48 }
49 }
50
51
~SamQuerySeqWithRefIter()52 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
53 {
54 }
55
56
57
reset(bool forward)58 bool SamQuerySeqWithRefIter::reset(bool forward)
59 {
60 myCigar = myRecord.getCigarInfo();
61 if(myCigar == NULL)
62 {
63 // Failed to get Cigar.
64 return(false);
65 }
66
67 // Get where the position of where this read starts as mapped to the
68 // reference.
69 myStartOfReadOnRefIndex =
70 myRefSequence.getGenomePosition(myRecord.getReferenceName());
71
72 if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
73 {
74 // This reference name was found in the reference file, so
75 // add the start position.
76 myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
77 }
78
79 myForward = forward;
80
81 if(myForward)
82 {
83 myQueryIndex = 0;
84 }
85 else
86 {
87 // reverse, so start at the last entry.
88 myQueryIndex = myRecord.getReadLength() - 1;
89 }
90 return(true);
91 }
92
93
94 // Returns information for the next position where the query and the
95 // reference match or mismatch. To be a match or mismatch, both the query
96 // and reference must have a base that is not 'N'.
97 // This means:
98 // insertions and deletions are not mismatches or matches.
99 // 'N' bases are not matches or mismatches
100 // Returns true if an entry was found, false if there are no more matches or
101 // mismatches.
getNextMatchMismatch(SamSingleBaseMatchInfo & matchMismatchInfo)102 bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo)
103 {
104 // Check whether or not this read is mapped.
105 // If the read is not mapped, return no matches.
106 if(!SamFlag::isMapped(myRecord.getFlag()))
107 {
108 // Not mapped.
109 return(false);
110 }
111
112 // Check that the Cigar is set.
113 if(myCigar == NULL)
114 {
115 // Error.
116 throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
117 return(false);
118 }
119
120 // If myStartOfReadOnRefIndex is the default (unset) value, then
121 // the reference was not found, so return false, no matches/mismatches.
122 if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
123 {
124 // This reference name was not found in the reference file, so just
125 // return no matches/mismatches.
126 return(false);
127 }
128
129
130 // Repull the read length from the record to check just in case the
131 // record has changed length.
132 // Loop until a match or mismatch is found as long as query index
133 // is still within the read (Loop is broken by a return).
134 while((myQueryIndex < myRecord.getReadLength()) &&
135 (myQueryIndex >= 0))
136 {
137 // Still more bases, look for a match/mismatch.
138
139 // Get the reference offset for this read position.
140 int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
141 if(refOffset == Cigar::INDEX_NA)
142 {
143 // This is either a softclip or an insertion
144 // which do not count as a match or a mismatch, so
145 // go to the next index.
146 nextIndex();
147 continue;
148 }
149
150 // Both the reference and the read have a base, so get the bases.
151 char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
152 char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
153
154 // If either the read or the reference bases are unknown, then
155 // it does not count as a match or a mismatch.
156 if(BaseUtilities::isAmbiguous(readBase) ||
157 BaseUtilities::isAmbiguous(refBase))
158 {
159 // Either the reference base or the read base are unknown,
160 // so skip this position.
161 nextIndex();
162 continue;
163 }
164
165 // Both the read & the reference have a known base, so it is either
166 // a match or a mismatch.
167 matchMismatchInfo.setQueryIndex(myQueryIndex);
168
169 // Check if they are equal.
170 if(BaseUtilities::areEqual(readBase, refBase))
171 {
172 // Match.
173 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
174 // Increment the query index to the next position.
175 nextIndex();
176 return(true);
177 }
178 else
179 {
180 // Mismatch
181 matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
182 // Increment the query index to the next position.
183 nextIndex();
184 return(true);
185 }
186 }
187
188 // No matches or mismatches were found, so return false.
189 return(false);
190 }
191
192
nextIndex()193 void SamQuerySeqWithRefIter::nextIndex()
194 {
195 if(myForward)
196 {
197 ++myQueryIndex;
198 }
199 else
200 {
201 --myQueryIndex;
202 }
203 }
204
205
SamSingleBaseMatchInfo()206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
207 : myType(UNKNOWN),
208 myQueryIndex(0)
209 {
210 }
211
212
~SamSingleBaseMatchInfo()213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
214 {
215 }
216
217
getType()218 SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType()
219 {
220 return(myType);
221 }
222
223
getQueryIndex()224 int32_t SamSingleBaseMatchInfo::getQueryIndex()
225 {
226 return(myQueryIndex);
227 }
228
229
setType(Type newType)230 void SamSingleBaseMatchInfo::setType(Type newType)
231 {
232 myType = newType;
233 }
234
235
setQueryIndex(int32_t queryIndex)236 void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex)
237 {
238 myQueryIndex = queryIndex;
239 }
240
241
242 ///////////////////////////////////////////////////////////////////////////
seqWithEquals(const char * currentSeq,int32_t seq0BasedPos,Cigar & cigar,const char * referenceName,const GenomeSequence & refSequence,std::string & updatedSeq)243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
244 int32_t seq0BasedPos,
245 Cigar& cigar,
246 const char* referenceName,
247 const GenomeSequence& refSequence,
248 std::string& updatedSeq)
249 {
250 updatedSeq = currentSeq;
251
252 int32_t seqLength = updatedSeq.length();
253 int32_t queryIndex = 0;
254
255 uint32_t startOfReadOnRefIndex =
256 refSequence.getGenomePosition(referenceName);
257
258 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
259 {
260 // This reference name was not found in the reference file, so just
261 // return.
262 return;
263 }
264 startOfReadOnRefIndex += seq0BasedPos;
265
266 // Loop until the entire sequence has been updated.
267 while(queryIndex < seqLength)
268 {
269 // Still more bases, look for matches.
270
271 // Get the reference offset for this read position.
272 int32_t refOffset = cigar.getRefOffset(queryIndex);
273 if(refOffset != Cigar::INDEX_NA)
274 {
275 // Both the reference and the read have a base, so get the bases.
276 char readBase = currentSeq[queryIndex];
277 char refBase = refSequence[startOfReadOnRefIndex + refOffset];
278
279 // If neither base is unknown and they are the same, count it
280 // as a match.
281 if(!BaseUtilities::isAmbiguous(readBase) &&
282 !BaseUtilities::isAmbiguous(refBase) &&
283 (BaseUtilities::areEqual(readBase, refBase)))
284 {
285 // Match.
286 updatedSeq[queryIndex] = '=';
287 }
288 }
289 // Increment the query index to the next position.
290 ++queryIndex;
291 continue;
292 }
293 }
294
295
seqWithoutEquals(const char * currentSeq,int32_t seq0BasedPos,Cigar & cigar,const char * referenceName,const GenomeSequence & refSequence,std::string & updatedSeq)296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
297 int32_t seq0BasedPos,
298 Cigar& cigar,
299 const char* referenceName,
300 const GenomeSequence& refSequence,
301 std::string& updatedSeq)
302 {
303 updatedSeq = currentSeq;
304
305 int32_t seqLength = updatedSeq.length();
306 int32_t queryIndex = 0;
307
308 uint32_t startOfReadOnRefIndex =
309 refSequence.getGenomePosition(referenceName);
310
311 if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
312 {
313 // This reference name was not found in the reference file, so just
314 // return.
315 return;
316 }
317 startOfReadOnRefIndex += seq0BasedPos;
318
319 // Loop until the entire sequence has been updated.
320 while(queryIndex < seqLength)
321 {
322 // Still more bases, look for matches.
323
324 // Get the reference offset for this read position.
325 int32_t refOffset = cigar.getRefOffset(queryIndex);
326 if(refOffset != Cigar::INDEX_NA)
327 {
328 // Both the reference and the read have a base, so get the bases.
329 char readBase = currentSeq[queryIndex];
330 char refBase = refSequence[startOfReadOnRefIndex + refOffset];
331
332 // If the bases are equal, set the sequence to the reference
333 // base. (Skips the check for ambiguous to catch a case where
334 // ambiguous had been converted to a '=', and if both are ambiguous,
335 // it will still be set to ambiguous.)
336 if(BaseUtilities::areEqual(readBase, refBase))
337 {
338 // Match.
339 updatedSeq[queryIndex] = refBase;
340 }
341 }
342
343 // Increment the query index to the next position.
344 ++queryIndex;
345 continue;
346 }
347 }
348