1 /*
2  *  Copyright (C) 2010  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <stdexcept>
19 
20 #include "SamQuerySeqWithRefHelper.h"
21 #include "BaseUtilities.h"
22 #include "SamFlag.h"
23 
SamQuerySeqWithRefIter(SamRecord & record,GenomeSequence & refSequence,bool forward)24 SamQuerySeqWithRefIter::SamQuerySeqWithRefIter(SamRecord& record,
25                                                GenomeSequence& refSequence,
26                                                bool forward)
27     : myRecord(record),
28       myRefSequence(refSequence),
29       myCigar(NULL),
30       myStartOfReadOnRefIndex(INVALID_GENOME_INDEX),
31       myQueryIndex(0),
32       myForward(forward)
33 {
34     myCigar = myRecord.getCigarInfo();
35     myStartOfReadOnRefIndex =
36         refSequence.getGenomePosition(myRecord.getReferenceName());
37 
38     if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
39     {
40         // This reference name was found in the reference file, so
41         // add the start position.
42         myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
43     }
44 
45     if(!forward)
46     {
47         myQueryIndex = myRecord.getReadLength() - 1;
48     }
49 }
50 
51 
~SamQuerySeqWithRefIter()52 SamQuerySeqWithRefIter::~SamQuerySeqWithRefIter()
53 {
54 }
55 
56 
57 
reset(bool forward)58 bool SamQuerySeqWithRefIter::reset(bool forward)
59 {
60     myCigar = myRecord.getCigarInfo();
61     if(myCigar == NULL)
62     {
63         // Failed to get Cigar.
64         return(false);
65     }
66 
67     // Get where the position of where this read starts as mapped to the
68     // reference.
69     myStartOfReadOnRefIndex =
70         myRefSequence.getGenomePosition(myRecord.getReferenceName());
71 
72     if(myStartOfReadOnRefIndex != INVALID_GENOME_INDEX)
73     {
74         // This reference name was found in the reference file, so
75         // add the start position.
76         myStartOfReadOnRefIndex += myRecord.get0BasedPosition();
77     }
78 
79     myForward = forward;
80 
81     if(myForward)
82     {
83         myQueryIndex = 0;
84     }
85     else
86     {
87         // reverse, so start at the last entry.
88         myQueryIndex = myRecord.getReadLength() - 1;
89     }
90     return(true);
91 }
92 
93 
94 // Returns information for the next position where the query and the
95 // reference match or mismatch.  To be a match or mismatch, both the query
96 // and reference must have a base that is not 'N'.
97 // This means:
98 //    insertions and deletions are not mismatches or matches.
99 //    'N' bases are not matches or mismatches
100 // Returns true if an entry was found, false if there are no more matches or
101 // mismatches.
getNextMatchMismatch(SamSingleBaseMatchInfo & matchMismatchInfo)102 bool SamQuerySeqWithRefIter::getNextMatchMismatch(SamSingleBaseMatchInfo& matchMismatchInfo)
103 {
104     // Check whether or not this read is mapped.
105     // If the read is not mapped, return no matches.
106     if(!SamFlag::isMapped(myRecord.getFlag()))
107     {
108         // Not mapped.
109         return(false);
110     }
111 
112     // Check that the Cigar is set.
113     if(myCigar == NULL)
114     {
115         // Error.
116         throw(std::runtime_error("Cannot determine matches/mismatches since failed to retrieve the cigar"));
117         return(false);
118     }
119 
120     // If myStartOfReadOnRefIndex is the default (unset) value, then
121     // the reference was not found, so return false, no matches/mismatches.
122     if(myStartOfReadOnRefIndex == INVALID_GENOME_INDEX)
123     {
124         // This reference name was not found in the reference file, so just
125         // return no matches/mismatches.
126         return(false);
127     }
128 
129 
130     // Repull the read length from the record to check just in case the
131     // record has changed length.
132     // Loop until a match or mismatch is found as long as query index
133     // is still within the read  (Loop is broken by a return).
134     while((myQueryIndex < myRecord.getReadLength()) &&
135           (myQueryIndex >= 0))
136     {
137         // Still more bases, look for a match/mismatch.
138 
139         // Get the reference offset for this read position.
140         int32_t refOffset = myCigar->getRefOffset(myQueryIndex);
141         if(refOffset == Cigar::INDEX_NA)
142         {
143             // This is either a softclip or an insertion
144             // which do not count as a match or a mismatch, so
145             // go to the next index.
146             nextIndex();
147             continue;
148         }
149 
150         // Both the reference and the read have a base, so get the bases.
151         char readBase = myRecord.getSequence(myQueryIndex, SamRecord::NONE);
152         char refBase = myRefSequence[myStartOfReadOnRefIndex + refOffset];
153 
154         // If either the read or the reference bases are unknown, then
155         // it does not count as a match or a mismatch.
156         if(BaseUtilities::isAmbiguous(readBase) ||
157            BaseUtilities::isAmbiguous(refBase))
158         {
159             // Either the reference base or the read base are unknown,
160             // so skip this position.
161             nextIndex();
162             continue;
163         }
164 
165         // Both the read & the reference have a known base, so it is either
166         // a match or a mismatch.
167         matchMismatchInfo.setQueryIndex(myQueryIndex);
168 
169         // Check if they are equal.
170         if(BaseUtilities::areEqual(readBase, refBase))
171         {
172             // Match.
173             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MATCH);
174             // Increment the query index to the next position.
175             nextIndex();
176             return(true);
177         }
178         else
179         {
180             // Mismatch
181             matchMismatchInfo.setType(SamSingleBaseMatchInfo::MISMATCH);
182             // Increment the query index to the next position.
183             nextIndex();
184             return(true);
185         }
186     }
187 
188     // No matches or mismatches were found, so return false.
189     return(false);
190 }
191 
192 
nextIndex()193 void SamQuerySeqWithRefIter::nextIndex()
194 {
195     if(myForward)
196     {
197         ++myQueryIndex;
198     }
199     else
200     {
201         --myQueryIndex;
202     }
203 }
204 
205 
SamSingleBaseMatchInfo()206 SamSingleBaseMatchInfo::SamSingleBaseMatchInfo()
207     : myType(UNKNOWN),
208       myQueryIndex(0)
209 {
210 }
211 
212 
~SamSingleBaseMatchInfo()213 SamSingleBaseMatchInfo::~SamSingleBaseMatchInfo()
214 {
215 }
216 
217 
getType()218 SamSingleBaseMatchInfo::Type SamSingleBaseMatchInfo::getType()
219 {
220     return(myType);
221 }
222 
223 
getQueryIndex()224 int32_t SamSingleBaseMatchInfo::getQueryIndex()
225 {
226     return(myQueryIndex);
227 }
228 
229 
setType(Type newType)230 void SamSingleBaseMatchInfo::setType(Type newType)
231 {
232     myType = newType;
233 }
234 
235 
setQueryIndex(int32_t queryIndex)236 void SamSingleBaseMatchInfo::setQueryIndex(int32_t queryIndex)
237 {
238     myQueryIndex = queryIndex;
239 }
240 
241 
242 ///////////////////////////////////////////////////////////////////////////
seqWithEquals(const char * currentSeq,int32_t seq0BasedPos,Cigar & cigar,const char * referenceName,const GenomeSequence & refSequence,std::string & updatedSeq)243 void SamQuerySeqWithRef::seqWithEquals(const char* currentSeq,
244                                        int32_t seq0BasedPos,
245                                        Cigar& cigar,
246                                        const char* referenceName,
247                                        const GenomeSequence& refSequence,
248                                        std::string& updatedSeq)
249 {
250     updatedSeq = currentSeq;
251 
252     int32_t seqLength = updatedSeq.length();
253     int32_t queryIndex = 0;
254 
255     uint32_t startOfReadOnRefIndex =
256         refSequence.getGenomePosition(referenceName);
257 
258     if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
259     {
260         // This reference name was not found in the reference file, so just
261         // return.
262         return;
263     }
264     startOfReadOnRefIndex += seq0BasedPos;
265 
266     // Loop until the entire sequence has been updated.
267     while(queryIndex < seqLength)
268     {
269         // Still more bases, look for matches.
270 
271         // Get the reference offset for this read position.
272         int32_t refOffset = cigar.getRefOffset(queryIndex);
273         if(refOffset != Cigar::INDEX_NA)
274         {
275             // Both the reference and the read have a base, so get the bases.
276             char readBase = currentSeq[queryIndex];
277             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
278 
279             // If neither base is unknown and they are the same, count it
280             // as a match.
281             if(!BaseUtilities::isAmbiguous(readBase) &&
282                !BaseUtilities::isAmbiguous(refBase) &&
283                (BaseUtilities::areEqual(readBase, refBase)))
284             {
285                 // Match.
286                 updatedSeq[queryIndex] = '=';
287             }
288         }
289         // Increment the query index to the next position.
290         ++queryIndex;
291         continue;
292     }
293 }
294 
295 
seqWithoutEquals(const char * currentSeq,int32_t seq0BasedPos,Cigar & cigar,const char * referenceName,const GenomeSequence & refSequence,std::string & updatedSeq)296 void SamQuerySeqWithRef::seqWithoutEquals(const char* currentSeq,
297                                           int32_t seq0BasedPos,
298                                           Cigar& cigar,
299                                           const char* referenceName,
300                                           const GenomeSequence& refSequence,
301                                           std::string& updatedSeq)
302 {
303     updatedSeq = currentSeq;
304 
305     int32_t seqLength = updatedSeq.length();
306     int32_t queryIndex = 0;
307 
308     uint32_t startOfReadOnRefIndex =
309         refSequence.getGenomePosition(referenceName);
310 
311     if(startOfReadOnRefIndex == INVALID_GENOME_INDEX)
312     {
313         // This reference name was not found in the reference file, so just
314         // return.
315         return;
316     }
317     startOfReadOnRefIndex += seq0BasedPos;
318 
319     // Loop until the entire sequence has been updated.
320     while(queryIndex < seqLength)
321     {
322         // Still more bases, look for matches.
323 
324         // Get the reference offset for this read position.
325         int32_t refOffset = cigar.getRefOffset(queryIndex);
326         if(refOffset != Cigar::INDEX_NA)
327         {
328             // Both the reference and the read have a base, so get the bases.
329             char readBase = currentSeq[queryIndex];
330             char refBase = refSequence[startOfReadOnRefIndex + refOffset];
331 
332             // If the bases are equal, set the sequence to the reference
333             // base. (Skips the check for ambiguous to catch a case where
334             // ambiguous had been converted to a '=', and if both are ambiguous,
335             // it will still be set to ambiguous.)
336             if(BaseUtilities::areEqual(readBase, refBase))
337             {
338                 // Match.
339                 updatedSeq[queryIndex] = refBase;
340             }
341         }
342 
343         // Increment the query index to the next position.
344         ++queryIndex;
345         continue;
346     }
347 }
348