1 #ifndef OBJECTS_ALNMGR___ALNVEC__HPP
2 #define OBJECTS_ALNMGR___ALNVEC__HPP
3
4 /* $Id: alnvec.hpp 550451 2017-11-02 15:21:07Z falkrb $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Kamen Todorov, NCBI
30 *
31 * File Description:
32 * Access to the actual aligned residues
33 *
34 */
35
36
37 #include <objtools/alnmgr/alnmap.hpp>
38 #include <objmgr/seq_vector.hpp>
39
40 BEGIN_NCBI_SCOPE
41 BEGIN_objects_SCOPE
42
43
44 // forward declarations
45 class CScope;
46
47 class NCBI_XALNMGR_EXPORT CAlnVec : public CAlnMap
48 {
49 typedef CAlnMap Tparent;
50 typedef map<TNumrow, CBioseq_Handle> TBioseqHandleCache;
51 typedef map<TNumrow, CRef<CSeqVector> > TSeqVectorCache;
52
53 public:
54 typedef CSeqVector::TResidue TResidue;
55 typedef vector<int> TResidueCount;
56
57 // constructor
58 CAlnVec(const CDense_seg& ds, CScope& scope);
59 CAlnVec(const CDense_seg& ds, TNumrow anchor, CScope& scope);
60
61 // destructor
62 ~CAlnVec(void);
63
64 CScope& GetScope(void) const;
65
66
67 // GetSeqString methods:
68
69 // raw seq string (in seq coords)
70 string& GetSeqString (string& buffer,
71 TNumrow row,
72 TSeqPos seq_from, TSeqPos seq_to) const;
73 string& GetSeqString (string& buffer,
74 TNumrow row,
75 const CAlnMap::TRange& seq_rng) const;
76 string& GetSegSeqString(string& buffer,
77 TNumrow row,
78 TNumseg seg, TNumseg offset = 0) const;
79
80 // alignment (seq + gaps) string (in aln coords)
81 string& GetAlnSeqString(string& buffer,
82 TNumrow row,
83 const CAlnMap::TSignedRange& aln_rng) const;
84
85 // creates a vertical string of residues for a given aln pos
86 // NB: buffer will be resized to GetNumRows()
87 // optionally, returns a distribution of residues
88 // optionally, counts the gaps in this distribution
89 string& GetColumnVector(string& buffer,
90 TSeqPos aln_pos,
91 TResidueCount * residue_count = 0,
92 bool gaps_in_count = false) const;
93
94 // get the seq string for the whole alignment (seq + gaps)
95 // optionally, get the inserts and screen limit coords
96 string& GetWholeAlnSeqString(TNumrow row,
97 string& buffer,
98 TSeqPosList * insert_aln_starts = 0,
99 TSeqPosList * insert_starts = 0,
100 TSeqPosList * insert_lens = 0,
101 unsigned int scrn_width = 0,
102 TSeqPosList * scrn_lefts = 0,
103 TSeqPosList * scrn_rights = 0) const;
104
105 const CBioseq_Handle& GetBioseqHandle(TNumrow row) const;
106 TResidue GetResidue (TNumrow row, TSeqPos aln_pos) const;
107
108 // Sequence coding. If not set (default), Iupac[na/aa] coding is used.
109 // If set to a value conflicting with the sequence type
110 typedef CSeq_data::E_Choice TCoding;
GetNaCoding(void) const111 TCoding GetNaCoding(void) const { return m_NaCoding; }
GetAaCoding(void) const112 TCoding GetAaCoding(void) const { return m_AaCoding; }
SetNaCoding(TCoding coding)113 void SetNaCoding(TCoding coding) { m_NaCoding = coding; }
SetAaCoding(TCoding coding)114 void SetAaCoding(TCoding coding) { m_AaCoding = coding; }
115
116 // gap character could be explicitely set otherwise taken from seqvector
117 void SetGapChar(TResidue gap_char);
118 void UnsetGapChar();
119 bool IsSetGapChar() const;
120 TResidue GetGapChar(TNumrow row) const;
121
122 // end character is ' ' by default
123 void SetEndChar(TResidue gap_char);
124 void UnsetEndChar();
125 bool IsSetEndChar() const;
126 TResidue GetEndChar() const;
127
128 // genetic code
129 enum EConstants {
130 kDefaultGenCode = 1
131 };
132 void SetGenCode(int gen_code,
133 TNumrow row = -1);
134 void UnsetGenCode();
135 bool IsSetGenCode() const;
136 int GetGenCode(TNumrow row) const;
137
138 // Functions for obtaining a consensus sequence
139 // These versions add the consensus Bioseq to the scope.
140 CRef<CDense_seg> CreateConsensus(int& consensus_row) const;
141 CRef<CDense_seg> CreateConsensus(int& consensus_row,
142 const CSeq_id& consensus_id) const;
143 // This version returns the consensus Bioseq (in a parameter)
144 // without adding it to the scope.
145 CRef<CDense_seg> CreateConsensus(int& consensus_row,
146 CBioseq& consensus_seq,
147 const CSeq_id& consensus_id,
148 vector<string>* consens = NULL) const;
149
150 // utilities
151 int CalculateScore (TNumrow row1, TNumrow row2) const;
152 int CalculatePercentIdentity(TSeqPos aln_pos) const;
153
154 // static utilities
155 static void TranslateNAToAA(const string& na, string& aa,
156 int gen_code = kDefaultGenCode);
157 // gen_code per
158 // http://www.ncbi.nlm.nih.gov/collab/FT/#7.5.5
159
160 static int CalculateScore (const string& s1, const string& s2,
161 bool s1_is_prot, bool s2_is_prot,
162 int gen_code1 = kDefaultGenCode,
163 int gen_code2 = kDefaultGenCode);
164
165 // temporaries for conversion (see note below)
166 static unsigned char FromIupac(unsigned char c);
167 static unsigned char ToIupac (unsigned char c);
168
169 static void TransposeSequences(vector<string>& segs);
170 static void CollectNucleotideFrequences(const string& col, int base_count[], int numBases);
171 static void CollectProteinFrequences(const string& col, int base_count[], int numBases);
172
173 void RetrieveSegmentSequences(size_t segment, vector<string>& segs) const;
174
175 protected:
176
177 CSeqVector& x_GetSeqVector (TNumrow row) const;
178 CSeqVector& x_GetConsensusSeqVector(void) const;
179
180 void CreateConsensus(vector<string>& consens) const;
181
182 mutable CRef<CScope> m_Scope;
183 mutable TBioseqHandleCache m_BioseqHandlesCache;
184 mutable TSeqVectorCache m_SeqVectorCache;
185
186 private:
187 // Prohibit copy constructor and assignment operator
188 CAlnVec(const CAlnVec&);
189 CAlnVec& operator=(const CAlnVec&);
190
191 TResidue m_GapChar;
192 bool m_set_GapChar;
193 TResidue m_EndChar;
194 bool m_set_EndChar;
195 vector<int> m_GenCodes;
196 TCoding m_NaCoding;
197 TCoding m_AaCoding;
198 };
199
200
201
202 class NCBI_XALNMGR_EXPORT CAlnVecPrinter : public CAlnMapPrinter
203 {
204 public:
205 /// Constructor
206 CAlnVecPrinter(const CAlnVec& aln_vec,
207 CNcbiOstream& out);
208
209
210 /// which algorithm to choose
211 enum EAlgorithm {
212 eUseSeqString, /// memory ineficient
213 eUseAlnSeqString, /// memory efficient, recommended for large alns
214 eUseWholeAlnSeqString /// memory ineficient, but very fast
215 };
216
217 /// Printing methods
218 void PopsetStyle (int scrn_width = 70,
219 EAlgorithm algorithm = eUseAlnSeqString);
220
221 void ClustalStyle(int scrn_width = 50,
222 EAlgorithm algorithm = eUseAlnSeqString);
223
224 private:
225 void x_SetChars();
226 void x_UnsetChars();
227
228 const CAlnVec& m_AlnVec;
229
230 typedef CSeqVector::TResidue TResidue;
231
232 bool m_OrigSetGapChar;
233 TResidue m_OrigGapChar;
234
235 bool m_OrigSetEndChar;
236 TResidue m_OrigEndChar;
237 };
238
239
240
241 /////////////////////////////////////////////////////////////////////////////
242 // IMPLEMENTATION of INLINE functions
243 /////////////////////////////////////////////////////////////////////////////
244
245
246 inline
GetScope(void) const247 CScope& CAlnVec::GetScope(void) const
248 {
249 return *m_Scope;
250 }
251
252
253 inline
GetResidue(TNumrow row,TSeqPos aln_pos) const254 CSeqVector::TResidue CAlnVec::GetResidue(TNumrow row, TSeqPos aln_pos) const
255 {
256 if (aln_pos > GetAlnStop()) {
257 return (TResidue) 0; // out of range
258 }
259 TSegTypeFlags type = GetSegType(row, GetSeg(aln_pos));
260 if (type & fSeq) {
261 CSeqVector& seq_vec = x_GetSeqVector(row);
262 TSignedSeqPos pos = GetSeqPosFromAlnPos(row, aln_pos);
263 if (GetWidth(row) == 3) {
264 string na_buff, aa_buff;
265 if (IsPositiveStrand(row)) {
266 seq_vec.GetSeqData(pos, pos + 3, na_buff);
267 } else {
268 TSeqPos size = seq_vec.size();
269 seq_vec.GetSeqData(size - pos - 3, size - pos, na_buff);
270 }
271 TranslateNAToAA(na_buff, aa_buff, GetGenCode(row));
272 return aa_buff[0];
273 } else {
274 return seq_vec[IsPositiveStrand(row) ?
275 pos : seq_vec.size() - pos - 1];
276 }
277 } else {
278 if (type & fNoSeqOnLeft || type & fNoSeqOnRight) {
279 return GetEndChar();
280 } else {
281 return GetGapChar(row);
282 }
283 }
284 }
285
286
287 inline
GetSeqString(string & buffer,TNumrow row,TSeqPos seq_from,TSeqPos seq_to) const288 string& CAlnVec::GetSeqString(string& buffer,
289 TNumrow row,
290 TSeqPos seq_from, TSeqPos seq_to) const
291 {
292 if (GetWidth(row) == 3) {
293 string buff;
294 buffer.erase();
295 if (IsPositiveStrand(row)) {
296 x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buff);
297 } else {
298 CSeqVector& seq_vec = x_GetSeqVector(row);
299 TSeqPos size = seq_vec.size();
300 seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buff);
301 }
302 TranslateNAToAA(buff, buffer, GetGenCode(row));
303 } else {
304 if (IsPositiveStrand(row)) {
305 x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buffer);
306 } else {
307 CSeqVector& seq_vec = x_GetSeqVector(row);
308 TSeqPos size = seq_vec.size();
309 seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buffer);
310 }
311 }
312 return buffer;
313 }
314
315
316 inline
GetSegSeqString(string & buffer,TNumrow row,TNumseg seg,int offset) const317 string& CAlnVec::GetSegSeqString(string& buffer,
318 TNumrow row,
319 TNumseg seg, int offset) const
320 {
321 return GetSeqString(buffer, row,
322 GetStart(row, seg, offset),
323 GetStop (row, seg, offset));
324 }
325
326
327 inline
GetSeqString(string & buffer,TNumrow row,const CAlnMap::TRange & seq_rng) const328 string& CAlnVec::GetSeqString(string& buffer,
329 TNumrow row,
330 const CAlnMap::TRange& seq_rng) const
331 {
332 return GetSeqString(buffer, row,
333 seq_rng.GetFrom(),
334 seq_rng.GetTo());
335 }
336
337
338 inline
SetGapChar(TResidue gap_char)339 void CAlnVec::SetGapChar(TResidue gap_char)
340 {
341 m_GapChar = gap_char;
342 m_set_GapChar = true;
343 }
344
345 inline
UnsetGapChar()346 void CAlnVec::UnsetGapChar()
347 {
348 m_set_GapChar = false;
349 }
350
351 inline
IsSetGapChar() const352 bool CAlnVec::IsSetGapChar() const
353 {
354 return m_set_GapChar;
355 }
356
357 inline
GetGapChar(TNumrow row) const358 CSeqVector::TResidue CAlnVec::GetGapChar(TNumrow row) const
359 {
360 if (IsSetGapChar()) {
361 return m_GapChar;
362 } else {
363 return x_GetSeqVector(row).GetGapChar();
364 }
365 }
366
367 inline
SetEndChar(TResidue end_char)368 void CAlnVec::SetEndChar(TResidue end_char)
369 {
370 m_EndChar = end_char;
371 m_set_EndChar = true;
372 }
373
374 inline
UnsetEndChar()375 void CAlnVec::UnsetEndChar()
376 {
377 m_set_EndChar = false;
378 }
379
380 inline
IsSetEndChar() const381 bool CAlnVec::IsSetEndChar() const
382 {
383 return m_set_EndChar;
384 }
385
386 inline
GetEndChar() const387 CSeqVector::TResidue CAlnVec::GetEndChar() const
388 {
389 if (IsSetEndChar()) {
390 return m_EndChar;
391 } else {
392 return ' ';
393 }
394 }
395
396 inline
SetGenCode(int gen_code,TNumrow row)397 void CAlnVec::SetGenCode(int gen_code, TNumrow row)
398 {
399 if (row == -1) {
400 if (IsSetGenCode()) {
401 UnsetGenCode();
402 }
403 m_GenCodes.resize(GetNumRows(), gen_code);
404 } else {
405 if ( !IsSetGenCode() ) {
406 m_GenCodes.resize(GetNumRows(), kDefaultGenCode);
407 }
408 m_GenCodes[row] = gen_code;
409 }
410 }
411
412 inline
UnsetGenCode()413 void CAlnVec::UnsetGenCode()
414 {
415 m_GenCodes.clear();
416 }
417
418 inline
IsSetGenCode() const419 bool CAlnVec::IsSetGenCode() const
420 {
421 return !m_GenCodes.empty();
422 }
423
424 inline
GetGenCode(TNumrow row) const425 int CAlnVec::GetGenCode(TNumrow row) const
426 {
427 if (IsSetGenCode()) {
428 return m_GenCodes[row];
429 } else {
430 return kDefaultGenCode;
431 }
432 }
433
434
435 //
436 // these are a temporary work-around
437 // CSeqportUtil contains routines for converting sequence data from one
438 // format to another, but it places a requirement on the data: it must in
439 // a CSeq_data class. I can get this for my data, but it is a bit of work
440 // (much more work than calling CSeqVector::GetSeqdata(), which, if I use the
441 // internal sequence vector, is guaranteed to be in IUPAC notation)
442 //
443 inline
FromIupac(unsigned char c)444 unsigned char CAlnVec::FromIupac(unsigned char c)
445 {
446 switch (c)
447 {
448 case 'A': return 0x01;
449 case 'C': return 0x02;
450 case 'M': return 0x03;
451 case 'G': return 0x04;
452 case 'R': return 0x05;
453 case 'S': return 0x06;
454 case 'V': return 0x07;
455 case 'T': return 0x08;
456 case 'W': return 0x09;
457 case 'Y': return 0x0a;
458 case 'H': return 0x0b;
459 case 'K': return 0x0c;
460 case 'D': return 0x0d;
461 case 'B': return 0x0e;
462 case 'N': return 0x0f;
463 }
464
465 return 0x00;
466 }
467
ToIupac(unsigned char c)468 inline unsigned char CAlnVec::ToIupac(unsigned char c)
469 {
470 const char *data = "-ACMGRSVTWYHKDBN";
471 return ((c < 16) ? data[c] : 0);
472 }
473
474
475 ///////////////////////////////////////////////////////////
476 ////////////////// end of inline methods //////////////////
477 ///////////////////////////////////////////////////////////
478
479 END_objects_SCOPE // namespace ncbi::objects::
480 END_NCBI_SCOPE
481
482 #endif /* OBJECTS_ALNMGR___ALNVEC__HPP */
483