1 /** 2 * UGENE - Integrated Bioinformatics Tools. 3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru> 4 * http://ugene.net 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 2 9 * of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 * MA 02110-1301, USA. 20 */ 21 22 #ifndef _U2_ASSEMBLY_H_ 23 #define _U2_ASSEMBLY_H_ 24 25 #include <QSharedData> 26 27 #include <U2Core/U2Range.h> 28 #include <U2Core/U2Sequence.h> 29 30 namespace U2 { 31 32 /** 33 * CIGAR string operation. 34 * CIGAR string represents complex ins/del model for short-reads. 35 * 36 * Meanings of the operations are the following: 37 * 38 * - M - 'alignment match' 39 * Either match or mismatch to reference. 40 * 41 * - I - 'insertion' 42 * Insertion to the reference. Residues marked as 'I' must be skipped when 43 * counting 'real' read length and must be skipped when read is aligned to 44 * reference. 45 * 46 * - D - 'deletion' 47 * Deletion from the reference. Gaps must be inserted to the read when read 48 * is aligned to reference. Deleted regions must be added to the 'real' 49 * read length. 50 * 51 * - N - 'skip' 52 * Skipped region from the reference. Skips behave exactly as deletions, 53 * however have different biological meaning: they make sense _only_ in 54 * mRNA-to-genome alignment where represent an intron. 55 * 56 * - S - 'soft clipping' 57 * Regions which do not match to the reference, behave exactly as insertions. 58 * Must be located at the start or the end of the read (see 59 * SAM spec and CigarValidator) 60 * 61 * - H - 'hard clipping' 62 * Regions which do not match to the reference, skipped by hardware (not 63 * present in read sequence). Hard clipping does not affects read length or 64 * visualization 65 * 66 * - P - 'padding' (TODO) 67 * Silent Deletion from padded reference. Someday we should find out how to 68 * handle this. Padding does not affect read length. 69 * 70 * - = - 'sequence match' 71 * Exact match to reference. 72 * 73 * - X - 'sequence mismatch' 74 * Mismatch to reference. 75 * 76 * See also: U2AssemblyUtils::getCigarExtraLength(), ShortReadIterator. 77 */ 78 enum U2CigarOp { 79 U2CigarOp_Invalid = 0, 80 U2CigarOp_D = 1, // deleted 81 U2CigarOp_I = 2, // inserted 82 U2CigarOp_H = 3, // hard-clipped 83 U2CigarOp_M = 4, // alignment match 84 U2CigarOp_N = 5, // skipped 85 U2CigarOp_P = 6, // padded 86 U2CigarOp_S = 7, // soft-clipped 87 U2CigarOp_EQ = 8, // sequence match 88 U2CigarOp_X = 9 // sequence mismatch 89 }; 90 91 /** 92 CIGAR token: operation + count 93 */ 94 class U2CORE_EXPORT U2CigarToken { 95 public: U2CigarToken()96 U2CigarToken() 97 : op(U2CigarOp_M), count(1) { 98 } U2CigarToken(U2CigarOp _op,int _count)99 U2CigarToken(U2CigarOp _op, int _count) 100 : op(_op), count(_count) { 101 } 102 103 U2CigarOp op; 104 int count; 105 }; 106 107 /** 108 assembly read flags 109 */ 110 enum ReadFlag { 111 None = 0, 112 Fragmented = 1 << 0, 113 FragmentsAligned = 1 << 1, 114 Unmapped = 1 << 2, 115 NextUnmapped = 1 << 3, 116 Reverse = 1 << 4, 117 NextReverse = 1 << 5, 118 FirstInTemplate = 1 << 6, 119 LastInTemplate = 1 << 7, 120 SecondaryAlignment = 1 << 8, 121 FailsChecks = 1 << 9, 122 Duplicate = 1 << 10, 123 DnaExtAlphabet = 1 << 16 124 }; 125 126 /** 127 Utility class to work with flags 128 */ 129 class ReadFlagsUtils { 130 public: isExtendedAlphabet(qint64 flags)131 static bool isExtendedAlphabet(qint64 flags) { 132 return flags & DnaExtAlphabet; 133 } 134 isComplementaryRead(qint64 flags)135 static bool isComplementaryRead(qint64 flags) { 136 return flags & Reverse; 137 } 138 isPairedRead(qint64 flags)139 static bool isPairedRead(qint64 flags) { 140 return flags & Fragmented; 141 } 142 isUnmappedRead(qint64 flags)143 static bool isUnmappedRead(qint64 flags) { 144 return flags & Unmapped; 145 } 146 }; 147 148 /** 149 * Auxiliary data from BAM/SAM. 150 */ 151 class U2CORE_EXPORT U2AuxData { 152 public: U2AuxData()153 U2AuxData() 154 : type(0), subType(0) { 155 } 156 /** Two bytes for tag */ 157 char tag[2]; 158 /** One byte for type: AcCsSiIfZHB */ 159 char type; 160 /** Value size depends on the type */ 161 QByteArray value; 162 /** Type of array data. Only for array auxes */ 163 char subType; 164 }; 165 166 /** 167 Row of assembly: sequence, leftmost position and CIGAR 168 */ 169 class U2CORE_EXPORT U2AssemblyReadData : public U2Entity, public QSharedData { 170 public: U2AssemblyReadData()171 U2AssemblyReadData() 172 : leftmostPos(0), effectiveLen(0), 173 packedViewRow(0), mappingQuality(255), flags(0), rnext("*"), pnext(0) { 174 } 175 176 /** Name of the read, ASCII string */ 177 QByteArray name; 178 179 /** 180 Zero-based left-most position of the read 181 */ 182 qint64 leftmostPos; 183 184 /** Length of the read with CIGAR affect applied */ 185 qint64 effectiveLen; 186 187 /** 188 Position of the read in packed view 189 */ 190 qint64 packedViewRow; 191 192 /** 193 CIGAR info for the read 194 */ 195 QList<U2CigarToken> cigar; 196 197 /** 198 Sequence of the read. 199 The array is not empty only if sequence is embedded into the read 200 */ 201 QByteArray readSequence; 202 203 /** Quality string */ 204 QByteArray quality; 205 206 /** Mapping quality */ 207 quint8 mappingQuality; 208 209 /** Read flags */ 210 qint64 flags; 211 212 /** Reference sequence name of the next mate read */ 213 QByteArray rnext; 214 215 /** Left-most position of the next mate read */ 216 qint64 pnext; 217 218 /** The list of auxiliary data of BAM/SAM formats */ 219 QList<U2AuxData> aux; 220 }; 221 222 typedef QSharedDataPointer<U2AssemblyReadData> U2AssemblyRead; 223 224 /** 225 Assembly representation 226 */ 227 class U2CORE_EXPORT U2Assembly : public U2Object { 228 public: U2Assembly()229 U2Assembly() { 230 } U2Assembly(U2DataId id,QString dbId,qint64 version)231 U2Assembly(U2DataId id, QString dbId, qint64 version) 232 : U2Object(id, dbId, version) { 233 } 234 235 U2DataId referenceId; 236 getType()237 U2DataType getType() const { 238 return U2Type::Assembly; 239 } 240 }; 241 242 /** Statistics information collected during the reads packing algorithm */ 243 class U2AssemblyPackStat { 244 public: U2AssemblyPackStat()245 U2AssemblyPackStat() 246 : maxProw(0), readsCount(0) { 247 } 248 /** Maximum packed row value after pack algorithm */ 249 int maxProw; 250 /** Number of reads packed */ 251 qint64 readsCount; 252 }; 253 254 /** Statistics information collected during the reads packing algorithm */ 255 typedef QVector<qint32> U2AssemblyCoverageStat; 256 257 } // namespace U2 258 259 #endif 260