/*
 *  Copyright (C) 2010-2011  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#if !defined(_CIGAR_ROLLER_H)
#define _CIGAR_ROLLER_H

#include "Cigar.h"

/// The purpose of this class is to provide accessors for setting, updating, and modifying the CIGAR object. It is a child class of Cigar.

///
/// Docs from Sam1.pdf:
///
/// Clipped alignment. In Smith-Waterman alignment, a sequence may not be aligned from the first residue to the last one.
/// Subsequences at the ends may be clipped off. We introduce operation 'S' to describe (softly) clipped alignment. Here is
/// an example. Suppose the clipped alignment is:
/// REF: AGCTAGCATCGTGTCGCCCGTCTAGCATACGCATGATCGACTGTCAGCTAGTCAGACTAGTCGATCGATGTG
/// READ: gggGTGTAACC-GACTAGgggg
/// where on the read sequence, bases in uppercase are matches and bases in lowercase are clipped off. The CIGAR for
/// this alignment is: 3S8M1D6M4S.
///
///
/// If the mapping position of the query is not available, RNAME and
/// CIGAR are set as "*"
///
/// A CIGAR string is comprised of a series of operation lengths plus the operations. The conventional CIGAR format allows
/// for three types of operations: M for match or mismatch, I for insertion and D for deletion.
The extended CIGAR format 42 /// further allows four more operations, as is shown in the following table, to describe clipping, padding and splicing: 43 /// 44 /// op Description 45 /// -- ----------- 46 /// M Match or mismatch 47 /// I Insertion to the reference 48 /// D Deletion from the reference 49 /// N Skipped region from the reference 50 /// S Soft clip on the read (clipped sequence present in <seq>) 51 /// H Hard clip on the read (clipped sequence NOT present in <seq>) 52 /// P Padding (silent deletion from the padded reference sequence) 53 /// 54 55 56 57 //////////////////////////////////////////////////////////////////////// 58 /// 59 /// CigarRoller is an aid to correctly generating the CIGAR strings 60 /// necessary to represent how a read maps to the reference. 61 /// 62 /// It is called once a particular match candidate is being written 63 /// out, so it is far less performance sensitive than the Smith Waterman 64 /// code below. 65 /// 66 class CigarRoller : public Cigar 67 { 68 public: 69 70 //////////////////////////////////////////////////////////////////////// 71 // 72 // Cigar Roller Class 73 // 74 /// Writes all of the cigar operations contained in this roller to the 75 /// passed in stream. 76 friend std::ostream &operator << (std::ostream &stream, const CigarRoller& roller); 77 78 /// Default constructor initializes as a CIGAR with no operations. CigarRoller()79 CigarRoller() 80 { 81 clearQueryAndReferenceIndexes(); 82 } 83 84 /// Constructor that initializes the object with the specified cigarString. CigarRoller(const char * cigarString)85 CigarRoller(const char *cigarString) 86 { 87 Set(cigarString); 88 } 89 90 /// Add the contents of the specified CigarRoller to this object. 91 CigarRoller & operator += (CigarRoller &rhs); 92 93 /// Append the specified operator to this object. 94 CigarRoller & operator += (const CigarOperator &rhs); 95 96 /// Set this object to be equal to the specified CigarRoller. 
97 CigarRoller & operator = (CigarRoller &rhs); 98 99 /// Append the specified operation with the specified count to this object. 100 void Add(Operation operation, int count); 101 102 /// Append the specified operation with the specified count to this object. 103 void Add(char operation, int count); 104 105 /// Append the specified cigarString to this object. 106 void Add(const char *cigarString); 107 108 /// Append the specified Cigar object to this object. Add(CigarRoller & rhs)109 void Add(CigarRoller &rhs) 110 { 111 (*this) += rhs; 112 } 113 114 /// Remove the operation at the specified index. 115 /// \return true if successfully removed, false if not. 116 bool Remove(int index); 117 118 /// Increments the count for the operation at the specified index 119 /// by the specified value, specify a negative value to decrement. 120 /// \return true if it is successfully incremented, false if not. 121 bool IncrementCount(int index, int increment); 122 123 /// Updates the operation at the specified index to be the specified 124 /// operation and have the specified count. 125 /// \return true if it is successfully updated, false if not. 126 bool Update(int index, Operation op, int count); 127 128 /// Sets this object to the specified cigarString. 129 void Set(const char *cigarString); 130 131 /// Sets this object to the BAM formatted cigar found at the beginning 132 /// of the specified buffer which is bufferLen long. 133 void Set(const uint32_t* cigarBuffer, uint16_t bufferLen); 134 135 // 136 // when we examine CIGAR strings, we need to know how 137 // many cumulative insert and delete positions there are 138 // so that we can adjust the read location appropriately. 139 // 140 // Here, we iterate over the vector of CIGAR operations, 141 // summaring the count for each insert or delete (insert 142 // increases the offset, delete decreases it). 
143 // 144 // The use case for this is when we have a genome match 145 // position based on an index word other than the first one, 146 // and there is also a insert or delete between the beginning 147 // of the read and the index word. We can't simply report 148 // the match position without taking into account the indels, 149 // otherwise we'll be off by N where N is the sum of this 150 // indel count. 151 // 152 /// DEPRECATED - do not use, there are better ways to accomplish that by 153 /// using read lengths, reference lengths, span of the read, etc. 154 int getMatchPositionOffset(); 155 156 /// Get the string reprentation of the Cigar operations in this object, 157 /// caller must delete the returned value. 158 const char *getString(); 159 160 /// Clear this object so that it has no Cigar Operations. 161 void clear(); 162 163 private: 164 }; 165 166 167 inline std::ostream &operator << (std::ostream &stream, const CigarRoller& roller) 168 { 169 stream << roller.cigarOperations; 170 return stream; 171 } 172 173 #endif 174