1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #ifndef _U2_ASSEMBLY_H_
23 #define _U2_ASSEMBLY_H_
24 
25 #include <QSharedData>
26 
27 #include <U2Core/U2Range.h>
28 #include <U2Core/U2Sequence.h>
29 
30 namespace U2 {
31 
32 /**
33  * CIGAR string operation.
34  * CIGAR string represents complex ins/del model for short-reads.
35  *
36  * Meanings of the operations are the following:
37  *
38  * - M - 'alignment match'
39  *   Either match or mismatch to reference.
40  *
41  * - I - 'insertion'
42  *   Insertion to the reference. Residues marked as 'I' must be skipped when
43  *   counting 'real' read length and must be skipped when read is aligned to
44  *   reference.
45  *
46  * - D - 'deletion'
47  *   Deletion from the reference. Gaps must be inserted to the read when read
48  *   is aligned to reference. Deleted regions must be added to the 'real'
49  *   read length.
50  *
 * - N - 'skip'
 *   Skipped region from the reference. Skips behave exactly as deletions,
 *   however they have a different biological meaning: they make sense _only_
 *   in mRNA-to-genome alignment, where they represent an intron.
55  *
56  * - S - 'soft clipping'
57  *   Regions which do not match to the reference, behave exactly as insertions.
58  *   Must be located at the start or the end of the read (see
59  *   SAM spec and CigarValidator)
60  *
 * - H - 'hard clipping'
 *   Regions which do not match to the reference, skipped by hardware (not
 *   present in read sequence). Hard clipping does not affect read length or
 *   visualization.
65  *
66  * - P - 'padding' (TODO)
67  *   Silent Deletion from padded reference. Someday we should find out how to
68  *   handle this. Padding does not affect read length.
69  *
70  * - = - 'sequence match'
71  *   Exact match to reference.
72  *
73  * - X - 'sequence mismatch'
74  *   Mismatch to reference.
75  *
76  * See also: U2AssemblyUtils::getCigarExtraLength(), ShortReadIterator.
77  */
enum U2CigarOp {
    /** Placeholder for an unknown / unparsed operation. */
    U2CigarOp_Invalid = 0,
    /** 'D': deleted from the reference. */
    U2CigarOp_D = 1,
    /** 'I': inserted to the reference. */
    U2CigarOp_I = 2,
    /** 'H': hard-clipped. */
    U2CigarOp_H = 3,
    /** 'M': alignment match (either match or mismatch). */
    U2CigarOp_M = 4,
    /** 'N': skipped region of the reference. */
    U2CigarOp_N = 5,
    /** 'P': padded. */
    U2CigarOp_P = 6,
    /** 'S': soft-clipped. */
    U2CigarOp_S = 7,
    /** '=': sequence match. */
    U2CigarOp_EQ = 8,
    /** 'X': sequence mismatch. */
    U2CigarOp_X = 9
};
90 
91 /**
92     CIGAR token: operation + count
93 */
94 class U2CORE_EXPORT U2CigarToken {
95 public:
U2CigarToken()96     U2CigarToken()
97         : op(U2CigarOp_M), count(1) {
98     }
U2CigarToken(U2CigarOp _op,int _count)99     U2CigarToken(U2CigarOp _op, int _count)
100         : op(_op), count(_count) {
101     }
102 
103     U2CigarOp op;
104     int count;
105 };
106 
107 /**
108     assembly read flags
109 */
enum ReadFlag {
    // No flag set.
    None = 0x0,

    // Single-bit masks, bits 0..10.
    // NOTE(review): these values coincide with the FLAG bits of the SAM format — confirm that this is the intended contract.
    Fragmented = 0x1,  // bit 0
    FragmentsAligned = 0x2,  // bit 1
    Unmapped = 0x4,  // bit 2
    NextUnmapped = 0x8,  // bit 3
    Reverse = 0x10,  // bit 4
    NextReverse = 0x20,  // bit 5
    FirstInTemplate = 0x40,  // bit 6
    LastInTemplate = 0x80,  // bit 7
    SecondaryAlignment = 0x100,  // bit 8
    FailsChecks = 0x200,  // bit 9
    Duplicate = 0x400,  // bit 10

    // Bit 16, kept apart from the contiguous group above.
    DnaExtAlphabet = 0x10000
};
125 
126 /**
127     Utility class to work with flags
128  */
129 class ReadFlagsUtils {
130 public:
isExtendedAlphabet(qint64 flags)131     static bool isExtendedAlphabet(qint64 flags) {
132         return flags & DnaExtAlphabet;
133     }
134 
isComplementaryRead(qint64 flags)135     static bool isComplementaryRead(qint64 flags) {
136         return flags & Reverse;
137     }
138 
isPairedRead(qint64 flags)139     static bool isPairedRead(qint64 flags) {
140         return flags & Fragmented;
141     }
142 
isUnmappedRead(qint64 flags)143     static bool isUnmappedRead(qint64 flags) {
144         return flags & Unmapped;
145     }
146 };
147 
148 /**
149  * Auxiliary data from BAM/SAM.
150  */
151 class U2CORE_EXPORT U2AuxData {
152 public:
U2AuxData()153     U2AuxData()
154         : type(0), subType(0) {
155     }
156     /** Two bytes for tag */
157     char tag[2];
158     /** One byte for type: AcCsSiIfZHB */
159     char type;
160     /** Value size depends on the type */
161     QByteArray value;
162     /** Type of array data. Only for array auxes */
163     char subType;
164 };
165 
166 /**
167     Row of assembly: sequence, leftmost position and CIGAR
168 */
169 class U2CORE_EXPORT U2AssemblyReadData : public U2Entity, public QSharedData {
170 public:
U2AssemblyReadData()171     U2AssemblyReadData()
172         : leftmostPos(0), effectiveLen(0),
173           packedViewRow(0), mappingQuality(255), flags(0), rnext("*"), pnext(0) {
174     }
175 
176     /** Name of the read, ASCII string */
177     QByteArray name;
178 
179     /**
180         Zero-based left-most position of the read
181     */
182     qint64 leftmostPos;
183 
184     /** Length of the read with CIGAR affect applied */
185     qint64 effectiveLen;
186 
187     /**
188         Position of the read in packed view
189     */
190     qint64 packedViewRow;
191 
192     /**
193         CIGAR info for the read
194     */
195     QList<U2CigarToken> cigar;
196 
197     /**
198         Sequence of the read.
199         The array is not empty only if sequence is embedded into the read
200     */
201     QByteArray readSequence;
202 
203     /** Quality string */
204     QByteArray quality;
205 
206     /** Mapping quality */
207     quint8 mappingQuality;
208 
209     /** Read flags */
210     qint64 flags;
211 
212     /** Reference sequence name of the next mate read */
213     QByteArray rnext;
214 
215     /** Left-most position of the next mate read */
216     qint64 pnext;
217 
218     /** The list of auxiliary data of BAM/SAM formats */
219     QList<U2AuxData> aux;
220 };
221 
/** Handle to a U2AssemblyReadData instance, held through QSharedDataPointer. */
typedef QSharedDataPointer<U2AssemblyReadData> U2AssemblyRead;
223 
224 /**
225     Assembly representation
226 */
227 class U2CORE_EXPORT U2Assembly : public U2Object {
228 public:
U2Assembly()229     U2Assembly() {
230     }
U2Assembly(U2DataId id,QString dbId,qint64 version)231     U2Assembly(U2DataId id, QString dbId, qint64 version)
232         : U2Object(id, dbId, version) {
233     }
234 
235     U2DataId referenceId;
236 
getType()237     U2DataType getType() const {
238         return U2Type::Assembly;
239     }
240 };
241 
242 /** Statistics information collected during the reads packing algorithm */
243 class U2AssemblyPackStat {
244 public:
U2AssemblyPackStat()245     U2AssemblyPackStat()
246         : maxProw(0), readsCount(0) {
247     }
248     /** Maximum packed row value after pack algorithm */
249     int maxProw;
250     /** Number of reads packed */
251     qint64 readsCount;
252 };
253 
/**
 * Coverage statistics of an assembly, stored as a vector of 32-bit integers.
 * NOTE(review): what a single element covers (one position or a whole region)
 * is not visible in this header — confirm against the code that fills it.
 */
typedef QVector<qint32> U2AssemblyCoverageStat;
256 
257 }  // namespace U2
258 
259 #endif
260