1 /*
2  *  Copyright (C) 2010-2011  Regents of the University of Michigan
3  *
4  *   This program is free software: you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation, either version 3 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *   GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #if !defined(_CIGAR_ROLLER_H)
19 #define _CIGAR_ROLLER_H
20 
21 #include "Cigar.h"
22 
23 /// The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object. It is a child class of Cigar.
24 
25 ///
26 /// Docs from Sam1.pdf:
27 ///
28 /// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
/// Subsequences at the ends may be clipped off. We introduce operation 'S' to describe (softly) clipped alignment. Here is
30 /// an example. Suppose the clipped alignment is:
31 /// REF:  AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
32 /// READ:        gggGTGTAACC-GACTAGgggg
33 /// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
34 /// this alignment is: 3S8M1D6M4S.
35 ///
36 ///
37 /// If the mapping position of the query is not available, RNAME and
38 /// CIGAR are set as “*”
39 ///
40 /// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
41 /// for three types of operations: M for match or mismatch, I for insertion and D for deletion. The extended CIGAR format
42 /// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing:
43 ///
44 /// op   Description
45 /// --   -----------
46 /// M    Match or mismatch
47 /// I    Insertion to the reference
48 /// D    Deletion from the reference
49 /// N    Skipped region from the reference
50 /// S    Soft clip on the read (clipped sequence present in <seq>)
51 /// H    Hard clip on the read (clipped sequence NOT present in <seq>)
52 /// P    Padding (silent deletion from the padded reference sequence)
53 ///
54 
55 
56 
57 ////////////////////////////////////////////////////////////////////////
58 ///
59 /// CigarRoller is an aid to correctly generating the CIGAR strings
60 /// necessary to represent how a read maps to the reference.
61 ///
/// It is called once a particular match candidate is being written
/// out, so it is far less performance sensitive than the Smith-Waterman
/// alignment code.
65 ///
66 class CigarRoller : public Cigar
67 {
68 public:
69 
70     ////////////////////////////////////////////////////////////////////////
71     //
72     // Cigar Roller Class
73     //
74     /// Writes all of the cigar operations contained in this roller to the
75     /// passed in stream.
76     friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller);
77 
78     /// Default constructor initializes as a CIGAR with no operations.
CigarRoller()79     CigarRoller()
80     {
81         clearQueryAndReferenceIndexes();
82     }
83 
84     /// Constructor that initializes the object with the specified cigarString.
CigarRoller(const char * cigarString)85     CigarRoller(const char *cigarString)
86     {
87         Set(cigarString);
88     }
89 
90     /// Add the contents of the specified CigarRoller to this object.
91     CigarRoller & operator += (CigarRoller &rhs);
92 
93     /// Append the specified operator to this object.
94     CigarRoller & operator += (const CigarOperator &rhs);
95 
96     /// Set this object to be equal to the specified CigarRoller.
97     CigarRoller & operator = (CigarRoller &rhs);
98 
99     /// Append the specified operation with the specified count to this object.
100     void Add(Operation operation, int count);
101 
102     /// Append the specified operation with the specified count to this object.
103     void Add(char operation, int count);
104 
105     /// Append the specified cigarString to this object.
106     void Add(const char *cigarString);
107 
108     /// Append the specified Cigar object to this object.
Add(CigarRoller & rhs)109     void Add(CigarRoller &rhs)
110     {
111         (*this) += rhs;
112     }
113 
114     /// Remove the operation at the specified index.
115     /// \return true if successfully removed, false if not.
116     bool Remove(int index);
117 
118     /// Increments the count for the operation at the specified index
119     /// by the specified value, specify a negative value to decrement.
120     /// \return true if it is successfully incremented, false if not.
121     bool IncrementCount(int index, int increment);
122 
123     /// Updates the operation at the specified index to be the specified
124     /// operation and have the specified count.
125     /// \return true if it is successfully updated, false if not.
126     bool Update(int index, Operation op, int count);
127 
128     /// Sets this object to the specified cigarString.
129     void Set(const char *cigarString);
130 
131     /// Sets this object to the BAM formatted cigar found at the beginning
132     /// of the specified buffer which is bufferLen long.
133     void Set(const uint32_t* cigarBuffer, uint16_t bufferLen);
134 
135     //
136     // when we examine CIGAR strings, we need to know how
137     // many cumulative insert and delete positions there are
138     // so that we can adjust the read location appropriately.
139     //
140     // Here, we iterate over the vector of CIGAR operations,
141     // summaring the count for each insert or delete (insert
142     // increases the offset, delete decreases it).
143     //
144     // The use case for this is when we have a genome match
145     // position based on an index word other than the first one,
146     // and there is also a insert or delete between the beginning
147     // of the read and the index word.  We can't simply report
148     // the match position without taking into account the indels,
149     // otherwise we'll be off by N where N is the sum of this
150     // indel count.
151     //
152     /// DEPRECATED - do not use, there are better ways to accomplish that by
153     /// using read lengths, reference lengths, span of the read, etc.
154     int getMatchPositionOffset();
155 
156     /// Get the string reprentation of the Cigar operations in this object,
157     /// caller must delete the returned value.
158     const char *getString();
159 
160     /// Clear this object so that it has no Cigar Operations.
161     void clear();
162 
163 private:
164 };
165 
166 
167 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller)
168 {
169     stream << roller.cigarOperations;
170     return stream;
171 }
172 
173 #endif
174