1 #ifndef OBJECTS_ALNMGR___ALNVEC__HPP
2 #define OBJECTS_ALNMGR___ALNVEC__HPP
3 
4 /*  $Id: alnvec.hpp 550451 2017-11-02 15:21:07Z falkrb $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kamen Todorov, NCBI
30  *
31  * File Description:
32  *   Access to the actual aligned residues
33  *
34  */
35 
36 
37 #include <objtools/alnmgr/alnmap.hpp>
38 #include <objmgr/seq_vector.hpp>
39 
40 BEGIN_NCBI_SCOPE
41 BEGIN_objects_SCOPE
42 
43 
44 // forward declarations
45 class CScope;
46 
47 class NCBI_XALNMGR_EXPORT CAlnVec : public CAlnMap
48 {
49     typedef CAlnMap                         Tparent;
50     typedef map<TNumrow, CBioseq_Handle>    TBioseqHandleCache;
51     typedef map<TNumrow, CRef<CSeqVector> > TSeqVectorCache;
52 
53 public:
54     typedef CSeqVector::TResidue            TResidue;
55     typedef vector<int>                     TResidueCount;
56 
57     // constructor
58     CAlnVec(const CDense_seg& ds, CScope& scope);
59     CAlnVec(const CDense_seg& ds, TNumrow anchor, CScope& scope);
60 
61     // destructor
62     ~CAlnVec(void);
63 
64     CScope& GetScope(void) const;
65 
66 
67     // GetSeqString methods:
68 
69     // raw seq string (in seq coords)
70     string& GetSeqString   (string& buffer,
71                             TNumrow row,
72                             TSeqPos seq_from, TSeqPos seq_to)             const;
73     string& GetSeqString   (string& buffer,
74                             TNumrow row,
75                             const CAlnMap::TRange& seq_rng)               const;
76     string& GetSegSeqString(string& buffer,
77                             TNumrow row,
78                             TNumseg seg, TNumseg offset = 0)              const;
79 
80    // alignment (seq + gaps) string (in aln coords)
81     string& GetAlnSeqString(string& buffer,
82                             TNumrow row,
83                             const CAlnMap::TSignedRange& aln_rng)         const;
84 
85     // creates a vertical string of residues for a given aln pos
86     // NB: buffer will be resized to GetNumRows()
87     // optionally, returns a distribution of residues
88     // optionally, counts the gaps in this distribution
89     string& GetColumnVector(string& buffer,
90                             TSeqPos aln_pos,
91                             TResidueCount * residue_count = 0,
92                             bool gaps_in_count = false)                   const;
93 
94     // get the seq string for the whole alignment (seq + gaps)
95     // optionally, get the inserts and screen limit coords
96     string& GetWholeAlnSeqString(TNumrow       row,
97                                  string&       buffer,
98                                  TSeqPosList * insert_aln_starts = 0,
99                                  TSeqPosList * insert_starts = 0,
100                                  TSeqPosList * insert_lens = 0,
101                                  unsigned int  scrn_width = 0,
102                                  TSeqPosList * scrn_lefts = 0,
103                                  TSeqPosList * scrn_rights = 0) const;
104 
105     const CBioseq_Handle& GetBioseqHandle(TNumrow row)                  const;
106     TResidue              GetResidue     (TNumrow row, TSeqPos aln_pos) const;
107 
108     // Sequence coding. If not set (default), Iupac[na/aa] coding is used.
109     // If set to a value conflicting with the sequence type
110     typedef CSeq_data::E_Choice TCoding;
GetNaCoding(void) const111     TCoding GetNaCoding(void) const { return m_NaCoding; }
GetAaCoding(void) const112     TCoding GetAaCoding(void) const { return m_AaCoding; }
SetNaCoding(TCoding coding)113     void SetNaCoding(TCoding coding) { m_NaCoding = coding; }
SetAaCoding(TCoding coding)114     void SetAaCoding(TCoding coding) { m_AaCoding = coding; }
115 
116     // gap character could be explicitely set otherwise taken from seqvector
117     void     SetGapChar(TResidue gap_char);
118     void     UnsetGapChar();
119     bool     IsSetGapChar()                const;
120     TResidue GetGapChar(TNumrow row)       const;
121 
122     // end character is ' ' by default
123     void     SetEndChar(TResidue gap_char);
124     void     UnsetEndChar();
125     bool     IsSetEndChar()                const;
126     TResidue GetEndChar()                  const;
127 
128     // genetic code
129     enum EConstants {
130         kDefaultGenCode = 1
131     };
132     void     SetGenCode(int gen_code,
133                         TNumrow row = -1);
134     void     UnsetGenCode();
135     bool     IsSetGenCode()                const;
136     int      GetGenCode(TNumrow row)       const;
137 
138     // Functions for obtaining a consensus sequence
139     // These versions add the consensus Bioseq to the scope.
140     CRef<CDense_seg> CreateConsensus(int& consensus_row) const;
141     CRef<CDense_seg> CreateConsensus(int& consensus_row,
142                                      const CSeq_id& consensus_id) const;
143     // This version returns the consensus Bioseq (in a parameter)
144     // without adding it to the scope.
145     CRef<CDense_seg> CreateConsensus(int& consensus_row,
146                                      CBioseq& consensus_seq,
147      	                             const CSeq_id& consensus_id,
148                                      vector<string>* consens = NULL) const;
149 
150     // utilities
151     int CalculateScore          (TNumrow row1, TNumrow row2) const;
152     int CalculatePercentIdentity(TSeqPos aln_pos)            const;
153 
154     // static utilities
155     static void TranslateNAToAA(const string& na, string& aa,
156                                 int gen_code = kDefaultGenCode);
157     //                          gen_code per
158     //                          http://www.ncbi.nlm.nih.gov/collab/FT/#7.5.5
159 
160     static int  CalculateScore (const string& s1, const string& s2,
161                                 bool s1_is_prot, bool s2_is_prot,
162                                 int gen_code1 = kDefaultGenCode,
163                                 int gen_code2 = kDefaultGenCode);
164 
165     // temporaries for conversion (see note below)
166     static unsigned char FromIupac(unsigned char c);
167     static unsigned char ToIupac  (unsigned char c);
168 
169     static void TransposeSequences(vector<string>& segs);
170     static void CollectNucleotideFrequences(const string& col, int base_count[], int numBases);
171     static void CollectProteinFrequences(const string& col, int base_count[], int numBases);
172 
173     void RetrieveSegmentSequences(size_t segment, vector<string>& segs) const;
174 
175 protected:
176 
177     CSeqVector& x_GetSeqVector         (TNumrow row)       const;
178     CSeqVector& x_GetConsensusSeqVector(void)              const;
179 
180     void CreateConsensus(vector<string>& consens) const;
181 
182     mutable CRef<CScope>            m_Scope;
183     mutable TBioseqHandleCache      m_BioseqHandlesCache;
184     mutable TSeqVectorCache         m_SeqVectorCache;
185 
186 private:
187     // Prohibit copy constructor and assignment operator
188     CAlnVec(const CAlnVec&);
189     CAlnVec& operator=(const CAlnVec&);
190 
191     TResidue    m_GapChar;
192     bool        m_set_GapChar;
193     TResidue    m_EndChar;
194     bool        m_set_EndChar;
195     vector<int> m_GenCodes;
196     TCoding     m_NaCoding;
197     TCoding     m_AaCoding;
198 };
199 
200 
201 
202 class NCBI_XALNMGR_EXPORT CAlnVecPrinter : public CAlnMapPrinter
203 {
204 public:
205     /// Constructor
206     CAlnVecPrinter(const CAlnVec& aln_vec,
207                    CNcbiOstream&  out);
208 
209 
210     /// which algorithm to choose
211     enum EAlgorithm {
212         eUseSeqString,         /// memory ineficient
213         eUseAlnSeqString,      /// memory efficient, recommended for large alns
214         eUseWholeAlnSeqString  /// memory ineficient, but very fast
215     };
216 
217     /// Printing methods
218     void PopsetStyle (int        scrn_width = 70,
219                       EAlgorithm algorithm  = eUseAlnSeqString);
220 
221     void ClustalStyle(int        scrn_width = 50,
222                       EAlgorithm algorithm  = eUseAlnSeqString);
223 
224 private:
225     void x_SetChars();
226     void x_UnsetChars();
227 
228     const CAlnVec& m_AlnVec;
229 
230     typedef CSeqVector::TResidue            TResidue;
231 
232     bool     m_OrigSetGapChar;
233     TResidue m_OrigGapChar;
234 
235     bool     m_OrigSetEndChar;
236     TResidue m_OrigEndChar;
237 };
238 
239 
240 
241 /////////////////////////////////////////////////////////////////////////////
242 //  IMPLEMENTATION of INLINE functions
243 /////////////////////////////////////////////////////////////////////////////
244 
245 
246 inline
GetScope(void) const247 CScope& CAlnVec::GetScope(void) const
248 {
249     return *m_Scope;
250 }
251 
252 
253 inline
GetResidue(TNumrow row,TSeqPos aln_pos) const254 CSeqVector::TResidue CAlnVec::GetResidue(TNumrow row, TSeqPos aln_pos) const
255 {
256     if (aln_pos > GetAlnStop()) {
257         return (TResidue) 0; // out of range
258     }
259     TSegTypeFlags type = GetSegType(row, GetSeg(aln_pos));
260     if (type & fSeq) {
261         CSeqVector& seq_vec = x_GetSeqVector(row);
262         TSignedSeqPos pos = GetSeqPosFromAlnPos(row, aln_pos);
263         if (GetWidth(row) == 3) {
264             string na_buff, aa_buff;
265             if (IsPositiveStrand(row)) {
266                 seq_vec.GetSeqData(pos, pos + 3, na_buff);
267             } else {
268                 TSeqPos size = seq_vec.size();
269                 seq_vec.GetSeqData(size - pos - 3, size - pos, na_buff);
270             }
271             TranslateNAToAA(na_buff, aa_buff, GetGenCode(row));
272             return aa_buff[0];
273         } else {
274             return seq_vec[IsPositiveStrand(row) ?
275                           pos : seq_vec.size() - pos - 1];
276         }
277     } else {
278         if (type & fNoSeqOnLeft  ||  type & fNoSeqOnRight) {
279             return GetEndChar();
280         } else {
281             return GetGapChar(row);
282         }
283     }
284 }
285 
286 
287 inline
GetSeqString(string & buffer,TNumrow row,TSeqPos seq_from,TSeqPos seq_to) const288 string& CAlnVec::GetSeqString(string& buffer,
289                               TNumrow row,
290                               TSeqPos seq_from, TSeqPos seq_to) const
291 {
292     if (GetWidth(row) == 3) {
293         string buff;
294         buffer.erase();
295         if (IsPositiveStrand(row)) {
296             x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buff);
297         } else {
298             CSeqVector& seq_vec = x_GetSeqVector(row);
299             TSeqPos size = seq_vec.size();
300             seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buff);
301         }
302         TranslateNAToAA(buff, buffer, GetGenCode(row));
303     } else {
304         if (IsPositiveStrand(row)) {
305             x_GetSeqVector(row).GetSeqData(seq_from, seq_to + 1, buffer);
306         } else {
307             CSeqVector& seq_vec = x_GetSeqVector(row);
308             TSeqPos size = seq_vec.size();
309             seq_vec.GetSeqData(size - seq_to - 1, size - seq_from, buffer);
310         }
311     }
312     return buffer;
313 }
314 
315 
316 inline
GetSegSeqString(string & buffer,TNumrow row,TNumseg seg,int offset) const317 string& CAlnVec::GetSegSeqString(string& buffer,
318                                  TNumrow row,
319                                  TNumseg seg, int offset) const
320 {
321     return GetSeqString(buffer, row,
322                         GetStart(row, seg, offset),
323                         GetStop (row, seg, offset));
324 }
325 
326 
327 inline
GetSeqString(string & buffer,TNumrow row,const CAlnMap::TRange & seq_rng) const328 string& CAlnVec::GetSeqString(string& buffer,
329                               TNumrow row,
330                               const CAlnMap::TRange& seq_rng) const
331 {
332     return GetSeqString(buffer, row,
333                         seq_rng.GetFrom(),
334                         seq_rng.GetTo());
335 }
336 
337 
338 inline
SetGapChar(TResidue gap_char)339 void CAlnVec::SetGapChar(TResidue gap_char)
340 {
341     m_GapChar = gap_char;
342     m_set_GapChar = true;
343 }
344 
345 inline
UnsetGapChar()346 void CAlnVec::UnsetGapChar()
347 {
348     m_set_GapChar = false;
349 }
350 
351 inline
IsSetGapChar() const352 bool CAlnVec::IsSetGapChar() const
353 {
354     return m_set_GapChar;
355 }
356 
357 inline
GetGapChar(TNumrow row) const358 CSeqVector::TResidue CAlnVec::GetGapChar(TNumrow row) const
359 {
360     if (IsSetGapChar()) {
361         return m_GapChar;
362     } else {
363         return x_GetSeqVector(row).GetGapChar();
364     }
365 }
366 
367 inline
SetEndChar(TResidue end_char)368 void CAlnVec::SetEndChar(TResidue end_char)
369 {
370     m_EndChar = end_char;
371     m_set_EndChar = true;
372 }
373 
374 inline
UnsetEndChar()375 void CAlnVec::UnsetEndChar()
376 {
377     m_set_EndChar = false;
378 }
379 
380 inline
IsSetEndChar() const381 bool CAlnVec::IsSetEndChar() const
382 {
383     return m_set_EndChar;
384 }
385 
386 inline
GetEndChar() const387 CSeqVector::TResidue CAlnVec::GetEndChar() const
388 {
389     if (IsSetEndChar()) {
390         return m_EndChar;
391     } else {
392         return ' ';
393     }
394 }
395 
396 inline
SetGenCode(int gen_code,TNumrow row)397 void CAlnVec::SetGenCode(int gen_code, TNumrow row)
398 {
399     if (row == -1) {
400         if (IsSetGenCode()) {
401             UnsetGenCode();
402         }
403         m_GenCodes.resize(GetNumRows(), gen_code);
404     } else {
405         if ( !IsSetGenCode() ) {
406             m_GenCodes.resize(GetNumRows(), kDefaultGenCode);
407         }
408         m_GenCodes[row] = gen_code;
409     }
410 }
411 
412 inline
UnsetGenCode()413 void CAlnVec::UnsetGenCode()
414 {
415     m_GenCodes.clear();
416 }
417 
418 inline
IsSetGenCode() const419 bool CAlnVec::IsSetGenCode() const
420 {
421     return !m_GenCodes.empty();
422 }
423 
424 inline
GetGenCode(TNumrow row) const425 int CAlnVec::GetGenCode(TNumrow row) const
426 {
427     if (IsSetGenCode()) {
428         return m_GenCodes[row];
429     } else {
430         return kDefaultGenCode;
431     }
432 }
433 
434 
435 //
436 // these are a temporary work-around
437 // CSeqportUtil contains routines for converting sequence data from one
438 // format to another, but it places a requirement on the data: it must in
439 // a CSeq_data class.  I can get this for my data, but it is a bit of work
440 // (much more work than calling CSeqVector::GetSeqdata(), which, if I use the
441 // internal sequence vector, is guaranteed to be in IUPAC notation)
442 //
443 inline
FromIupac(unsigned char c)444 unsigned char CAlnVec::FromIupac(unsigned char c)
445 {
446     switch (c)
447     {
448     case 'A': return 0x01;
449     case 'C': return 0x02;
450     case 'M': return 0x03;
451     case 'G': return 0x04;
452     case 'R': return 0x05;
453     case 'S': return 0x06;
454     case 'V': return 0x07;
455     case 'T': return 0x08;
456     case 'W': return 0x09;
457     case 'Y': return 0x0a;
458     case 'H': return 0x0b;
459     case 'K': return 0x0c;
460     case 'D': return 0x0d;
461     case 'B': return 0x0e;
462     case 'N': return 0x0f;
463     }
464 
465     return 0x00;
466 }
467 
ToIupac(unsigned char c)468 inline unsigned char CAlnVec::ToIupac(unsigned char c)
469 {
470     const char *data = "-ACMGRSVTWYHKDBN";
471     return ((c < 16) ? data[c] : 0);
472 }
473 
474 
475 ///////////////////////////////////////////////////////////
476 ////////////////// end of inline methods //////////////////
477 ///////////////////////////////////////////////////////////
478 
479 END_objects_SCOPE // namespace ncbi::objects::
480 END_NCBI_SCOPE
481 
482 #endif  /* OBJECTS_ALNMGR___ALNVEC__HPP */
483