1 #ifndef OBJECTS_SEQ___SEQPORT_UTIL__HPP
2 #define OBJECTS_SEQ___SEQPORT_UTIL__HPP
3 
4 /*  $Id: seqport_util.hpp 392960 2013-03-21 12:46:54Z vasilche $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Clifford Clausen
30  *          (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko)
31  *
32  * File Description:
33  */
34 #include <corelib/ncbi_limits.hpp>
35 #include <objects/seq/Seq_data.hpp>
36 #include <objects/seqcode/Seq_code_type.hpp>
37 #include <memory>
38 #include <vector>
39 
40 
41 BEGIN_NCBI_SCOPE
42 BEGIN_objects_SCOPE
43 
44 
45 // CSeqportUtil is a wrapper for a hidden object of class
46 // CSeqportUtil_implementation.
47 
48 class CSeqportUtil_implementation;
49 
50 
51 class NCBI_SEQ_EXPORT CSeqportUtil
52 {
53 public:
54 
55     // TypeDefs
56     typedef unsigned int TIndex;
57     typedef pair<TIndex, TIndex> TPair;
58 
59     // Classes thrown as errors
60     struct NCBI_SEQ_EXPORT CBadIndex : public runtime_error
61     {
CBadIndexCSeqportUtil::CBadIndex62         CBadIndex(TIndex idx, string method)
63             : runtime_error("CSeqportUtil::" + method +
64             " -- bad index specified: " + NStr::UIntToString(idx)) {}
65     };
66     struct NCBI_SEQ_EXPORT CBadSymbol : public runtime_error
67     {
CBadSymbolCSeqportUtil::CBadSymbol68         CBadSymbol(string code, string method)
69             : runtime_error("CSeqportUtil::" + method +
70             " -- bad symbol specified: " + code) {}
71     };
72     struct NCBI_SEQ_EXPORT CBadType : public runtime_error
73     {
CBadTypeCSeqportUtil::CBadType74         CBadType(string method)
75             : runtime_error("CSeqportUtil::" + method +
76             " -- specified code or code combination not supported") {}
77     };
78 
79     // Alphabet conversion function. Function returns the
80     // number of converted codes.
81     static TSeqPos Convert(const CSeq_data&       in_seq,
82                            CSeq_data*             out_seq,
83                            CSeq_data::E_Choice    to_code,
84                            TSeqPos                uBeginIdx = 0,
85                            TSeqPos                uLength   = 0,
86                            bool                   bAmbig    = false,
87                            Uint4                  seed      = 17734276);
88 
89     // Alphabet conversion function with ambiguities in BLAST form, for DNA only.
90     // Target encoding is ncbi2na. Ambiguities are returned in a separate vector.
91     // This function can be called in a loop, in which the returned data length
92     // and ambiguity vector are accumulated.
93     // @param in_seq Input sequence [in]
94     // @param out_seq Output sequence [out]
95     // @param uBeginIdx Starting offset in input sequence [in]
96     // @param uLength Ending offset in input sequence [in]
97     // @param total_length Total length of sequences to be converted. The BLAST
98     //        ambiguity encoding depends on whether total length is >= 1<<24.
99     // @param out_seq_length Number of bytes in the output sequence data.
100     //        Incremented if != 0 on input. [in|out]
101     // @param blast_ambig Ambiguities encoded in BLAST form. The vector is
102     //        appended, so this function can be called in a loop. [in|out]
103     // @return Number of converted codes.
104     static TSeqPos
105     ConvertWithBlastAmbig(const CSeq_data& in_seq,
106                           CSeq_data*       out_seq,
107                           TSeqPos          uBeginIdx,
108                           TSeqPos          uLength,
109                           TSeqPos          total_length,
110                           TSeqPos*         out_seq_length,
111                           vector<Uint4>*   blast_ambig);
112 
113     // Function to provide maximum in-place packing of na
114     // sequences without loss of information. Iupacna
115     // can always be packed to ncbi4na without loss. Iupacna
116     // can sometimes be packed to ncbi2na. Ncbi4na can
117     // sometimes be packed to ncbi2na. Returns number of
118     // residues packed. If in_seq cannot be packed, the
119     // original in_seq is returned unchanged and the return value
120     // from Pack is 0
121     static TSeqPos Pack(CSeq_data*   in_seq,
122         TSeqPos uLength = ncbi::numeric_limits<TSeqPos>::max());
123 
124     // Performs fast validation of CSeq_data. If all data in the
125     // sequence represent valid elements of a biological sequence, then
126     // FastValidate returns true. Otherwise it returns false
127     static bool FastValidate(const CSeq_data&   in_seq,
128                              TSeqPos            uBeginIdx = 0,
129                              TSeqPos            uLength   = 0);
130 
131     // Performs validation of CSeq_data. Returns a list of indices
132     // corresponding to data that does not represent a valid element
133     // of a biological sequence.
134     static void Validate(const CSeq_data&   in_seq,
135                          vector<TSeqPos>*   badIdx,
136                          TSeqPos            uBeginIdx = 0,
137                          TSeqPos            uLength   = 0);
138 
139     // Get ambiguous bases. out_indices returns
140     // the indices relative to in_seq of ambiguous bases.
141     // out_seq returns the ambiguous bases. Note, there are
142     // only ambiguous bases for iupacna->ncib2na and
143     // ncib4na->ncbi2na coversions.
144     static TSeqPos GetAmbigs(const CSeq_data&    in_seq,
145                              CSeq_data*          out_seq,
146                              vector<TSeqPos>*    out_indices,
147                              CSeq_data::E_Choice to_code = CSeq_data::e_Ncbi2na,
148                              TSeqPos             uBeginIdx = 0,
149                              TSeqPos             uLength   = 0);
150 
151     // Get a copy of CSeq_data. No conversion is done. uBeginIdx of the
152     // biological sequence in in_seq will be in position
153     // 0 of out_seq. Usually, uLength bases will be copied
154     // from in_seq to out_seq. If uLength goes beyond the end of
155     // in_seq, it will be shortened to go to the end of in_seq.
156     // For packed sequence formats (ncbi2na and ncbi4na),
157     // only uLength bases are valid copies. For example,
158     // in an ncbi4na encoded sequence, if uLength is odd, the last
159     // sequence returned will be uLength+1 because 2 bases are encoded
160     // per byte in ncbi4na. However, in this case, uLength will be returned
161     // unchanged (it will remain odd unless it goes beyond the end
162     // of in_seq). If uLength=0, then a copy from uBeginIdx to the end
163     // of in_seq is returned.
164     static TSeqPos GetCopy(const CSeq_data&   in_seq,
165                            CSeq_data*         out_seq,
166                            TSeqPos            uBeginIdx = 0,
167                            TSeqPos            uLength   = 0);
168 
169     // Method to keep only a contiguous piece of a sequence beginning
170     // at uBeginIdx and uLength residues long. Does bit shifting as
171     // needed to put uBeginIdx of original sequence at position zero on output.
172     // Similar to GetCopy(), but done in place.  Returns length of
173     // kept sequence.
174     static TSeqPos Keep(CSeq_data*   in_seq,
175                         TSeqPos      uBeginIdx = 0,
176                         TSeqPos      uLength   = 0);
177 
178     // Append in_seq2 to to end of in_seq1. Both in seqs must be
179     // in the same alphabet or this method will throw a runtime_error.
180     // The result of the append will be put into out_seq.
181     // For packed sequences ncbi2na and ncbi4na, Append will shift and
182     // append so as to remove any jaggedness at the append point.
183     static TSeqPos Append(CSeq_data*         out_seq,
184                           const CSeq_data&   in_seq1,
185                           TSeqPos            uBeginIdx1,
186                           TSeqPos            uLength1,
187                           const CSeq_data&   in_seq2,
188                           TSeqPos            uBeginIdx2,
189                           TSeqPos            uLength2);
190 
191     // Create a biological complement of an na sequence.
192     // Attempts to complement an aa sequence will throw
193     // a runtime_error. Returns length of complemented sequence.
194 
195     // Complement the input sequence in place
196     static TSeqPos Complement(CSeq_data*   in_seq,
197                               TSeqPos      uBeginIdx = 0,
198                               TSeqPos      uLength   = 0);
199 
200     // Complement the input sequence and put the result in
201     // the output sequence
202     static TSeqPos Complement(const CSeq_data&   in_seq,
203                               CSeq_data*         out_seq,
204                               TSeqPos            uBeginIdx = 0,
205                               TSeqPos            uLength   = 0);
206 
207     // Create a biological sequence that is the reversse
208     // of an input sequence. Attempts to reverse an aa
209     // sequence will throw a runtime_error. Returns length of
210     // reversed sequence.
211 
212     // Reverse a sequence in place
213     static TSeqPos Reverse(CSeq_data*   in_seq,
214                            TSeqPos      uBeginIdx = 0,
215                            TSeqPos      uLength   = 0);
216 
217     // Reverse an input sequence and put result in output sequence.
218     // Reverses packed bytes as necessary.
219     static TSeqPos Reverse(const CSeq_data&   in_seq,
220                            CSeq_data*         out_seq,
221                            TSeqPos            uBeginIdx = 0,
222                            TSeqPos            uLength   = 0);
223 
224 
225     // Create the reverse complement of an input sequence. Attempts
226     // to reverse-complement an aa sequence will throw a
227     // runtime_error.
228 
229     // Reverse complement a sequence in place
230     static TSeqPos ReverseComplement(CSeq_data*   in_seq,
231                                      TSeqPos      uBeginIdx = 0,
232                                      TSeqPos      uLength   = 0);
233 
234     // Reverse complmenet a sequence and put result in output sequence
235     static TSeqPos ReverseComplement(const CSeq_data&   in_seq,
236                                      CSeq_data*         out_seq,
237                                      TSeqPos            uBeginIdx = 0,
238                                      TSeqPos            uLength   = 0);
239 
240     // Given an Ncbistdaa input code index, returns the 3 letter Iupacaa3 code
241     // Implementation is efficient
242     static const string& GetIupacaa3(TIndex ncbistdaa);
243 
244     // Given a code type expressible as enum CSeq_data::E_Choice, returns
245     // true if the code type is available. See Seq_data_.hpp for definition
246     // of enum CSeq_data::E_Choice.
247     static bool IsCodeAvailable(CSeq_data::E_Choice code_type);
248 
249     // Same as above, but for code types expressible as enum ESeq_code_type.
250     // See Seq_code_type_.hpp for definition of enum ESeq_code_type.
251     static bool IsCodeAvailable(ESeq_code_type code_type);
252 
253     // Gets the first and last indices of a code type (e.g., for iupacna,
254     // the first index is 65 and the last index is 89). Throws CBadType
255     // if code_type is not available
256     static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type);
257 
258     // Same as above but takes code type expressed as an ESeq_code_type
259     static TPair GetCodeIndexFromTo(ESeq_code_type code_type);
260 
261     // Given an index for any code type expressible as a
262     // CSeq_data::E_Choice, returns the corresponding symbol (code)
263     // (e.g., for iupacna, idx 65 causes "A" to be returned)
264     // if the code type is available (e.g., ncbi8aa is not currently
265     // available). Use IsCodeAvailable() to find out which code types
266     // are available. Throws CBadType if code_type not available. Throws
267     // CBadIndex if idx out of range for code_type.
268     static const string& GetCode(CSeq_data::E_Choice code_type,
269                                  TIndex              idx);
270 
271     // Similar to above, but works for all code types expressible as
272     // a ESeq_code_type (i.e., iupacaa3 is expressible as an
273     // ESeq_code_type but not as a CSeq_data::E_Choice)
274     static const string& GetCode(ESeq_code_type code_type,
275                                  TIndex         idx);
276 
277     // Given an index for any code type expressible as a
278     // CSeq_data::E_Choice, returns the corresponding name
279     // (e.g., for iupacna, idx 65 causes "Adenine" to be returned)
280     // if the code type is available (e.g., ncbi8aa is not currently
281     // available). Use IsCodeAvailable() to find out which code types
282     // are available. Throws CBadType if code_type not available. Throws
283     // CBadIndex if idx out of range for code_type.
284     static const string& GetName(CSeq_data::E_Choice code_type,
285                                  TIndex              idx);
286 
287     // Similar to above, but works for all code types expressible as
288     // a ESeq_code_type (i.e., iupacaa3 is expressible as an
289     // ESeq_code_type but not as a CSeq_data::E_Choice)
290     static const string& GetName(ESeq_code_type code_type,
291                                  TIndex         idx);
292 
293     // Given a code (symbol) for any code type expressible as a
294     // CSeq_data::E_Choice, returns the corresponding index
295     // (e.g., for iupacna, symbol "A" causes 65 to be returned)
296     // if the code type is available (e.g., ncbi8aa is not currently
297     // available). Use IsCodeAvailable() to find out which code types
298     // are available.Throws CBadType if code_type not available. Throws
299     // CBadSymbol if code is not a valid symbol for code_type.
300     static TIndex GetIndex(CSeq_data::E_Choice code_type, const string& code);
301 
302     // Similar to above, but works for all code types expressible as
303     // a ESeq_code_type (i.e., iupacaa3 is expressible as an
304     // ESeq_code_type but not as a CSeq_data::E_Choice)
305     static TIndex GetIndex(ESeq_code_type code_type, const string& code);
306 
307     // Get the index that is the complement of the index for the
308     // input CSeq_data::E_Choice code type (e.g., for iupacna, the
309     // complement of index 66 is 86). Throws CBadType if complements for
310     // code_type not available. Throws CBadIndex if idx out of range for
311     // code_type
312     static TIndex GetIndexComplement(CSeq_data::E_Choice code_type,
313                                      TIndex              idx);
314 
315     // Same as above, but for code type expressible as ESeq_code_type.
316     static TIndex GetIndexComplement(ESeq_code_type code_type,
317                                      TIndex         idx);
318 
319     // Takes an index for a from_type code and returns the index for to_type
320     // (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na,
321     // 68) returns 2). Returns 255 if no map value exists for a legal
322     // index (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na,
323     // 69) returns 255). Throws CBadType if map for from_type to to_type not
324     // available. Throws CBadIndex if from_idx out of range for from_type.
325     static TIndex GetMapToIndex(CSeq_data::E_Choice from_type,
326                              CSeq_data::E_Choice    to_type,
327                              TIndex                 from_idx);
328 
329     // Same as above, but uses ESeq_code_type.
330     static TIndex GetMapToIndex(ESeq_code_type from_type,
331                                 ESeq_code_type to_type,
332                                 TIndex         from_idx);
333 
334 private:
335 
336     // we maintain a singleton internally
337     // the singleton can be retrieved by calling x_GetImplementation()
338     static CSeqportUtil_implementation& x_GetImplementation (void);
339 };
340 
341 
342 END_objects_SCOPE
343 END_NCBI_SCOPE
344 
345 #endif  /* OBJECTS_SEQ___SEQPORT_UTIL__HPP */
346