1 #ifndef OBJECTS_SEQ___SEQPORT_UTIL__HPP 2 #define OBJECTS_SEQ___SEQPORT_UTIL__HPP 3 4 /* $Id: seqport_util.hpp 392960 2013-03-21 12:46:54Z vasilche $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Clifford Clausen 30 * (also reviewed/fixed/groomed by Denis Vakatov and Aaron Ucko) 31 * 32 * File Description: 33 */ 34 #include <corelib/ncbi_limits.hpp> 35 #include <objects/seq/Seq_data.hpp> 36 #include <objects/seqcode/Seq_code_type.hpp> 37 #include <memory> 38 #include <vector> 39 40 41 BEGIN_NCBI_SCOPE 42 BEGIN_objects_SCOPE 43 44 45 // CSeqportUtil is a wrapper for a hidden object of class 46 // CSeqportUtil_implementation. 47 48 class CSeqportUtil_implementation; 49 50 51 class NCBI_SEQ_EXPORT CSeqportUtil 52 { 53 public: 54 55 // TypeDefs 56 typedef unsigned int TIndex; 57 typedef pair<TIndex, TIndex> TPair; 58 59 // Classes thrown as errors 60 struct NCBI_SEQ_EXPORT CBadIndex : public runtime_error 61 { CBadIndexCSeqportUtil::CBadIndex62 CBadIndex(TIndex idx, string method) 63 : runtime_error("CSeqportUtil::" + method + 64 " -- bad index specified: " + NStr::UIntToString(idx)) {} 65 }; 66 struct NCBI_SEQ_EXPORT CBadSymbol : public runtime_error 67 { CBadSymbolCSeqportUtil::CBadSymbol68 CBadSymbol(string code, string method) 69 : runtime_error("CSeqportUtil::" + method + 70 " -- bad symbol specified: " + code) {} 71 }; 72 struct NCBI_SEQ_EXPORT CBadType : public runtime_error 73 { CBadTypeCSeqportUtil::CBadType74 CBadType(string method) 75 : runtime_error("CSeqportUtil::" + method + 76 " -- specified code or code combination not supported") {} 77 }; 78 79 // Alphabet conversion function. Function returns the 80 // number of converted codes. 81 static TSeqPos Convert(const CSeq_data& in_seq, 82 CSeq_data* out_seq, 83 CSeq_data::E_Choice to_code, 84 TSeqPos uBeginIdx = 0, 85 TSeqPos uLength = 0, 86 bool bAmbig = false, 87 Uint4 seed = 17734276); 88 89 // Alphabet conversion function with ambiguities in BLAST form, for DNA only. 90 // Target encoding is ncbi2na. Ambiguities are returned in a separate vector. 91 // This function can be called in a loop, in which the returned data length 92 // and ambiguity vector are accumulated. 93 // @param in_seq Input sequence [in] 94 // @param out_seq Output sequence [out] 95 // @param uBeginIdx Starting offset in input sequence [in] 96 // @param uLength Ending offset in input sequence [in] 97 // @param total_length Total length of sequences to be converted. The BLAST 98 // ambiguity encoding depends on whether total length is >= 1<<24. 99 // @param out_seq_length Number of bytes in the output sequence data. 100 // Incremented if != 0 on input. [in|out] 101 // @param blast_ambig Ambiguities encoded in BLAST form. The vector is 102 // appended, so this function can be called in a loop. [in|out] 103 // @return Number of converted codes. 104 static TSeqPos 105 ConvertWithBlastAmbig(const CSeq_data& in_seq, 106 CSeq_data* out_seq, 107 TSeqPos uBeginIdx, 108 TSeqPos uLength, 109 TSeqPos total_length, 110 TSeqPos* out_seq_length, 111 vector<Uint4>* blast_ambig); 112 113 // Function to provide maximum in-place packing of na 114 // sequences without loss of information. Iupacna 115 // can always be packed to ncbi4na without loss. Iupacna 116 // can sometimes be packed to ncbi2na. Ncbi4na can 117 // sometimes be packed to ncbi2na. Returns number of 118 // residues packed. If in_seq cannot be packed, the 119 // original in_seq is returned unchanged and the return value 120 // from Pack is 0 121 static TSeqPos Pack(CSeq_data* in_seq, 122 TSeqPos uLength = ncbi::numeric_limits<TSeqPos>::max()); 123 124 // Performs fast validation of CSeq_data. If all data in the 125 // sequence represent valid elements of a biological sequence, then 126 // FastValidate returns true. Otherwise it returns false 127 static bool FastValidate(const CSeq_data& in_seq, 128 TSeqPos uBeginIdx = 0, 129 TSeqPos uLength = 0); 130 131 // Performs validation of CSeq_data. Returns a list of indices 132 // corresponding to data that does not represent a valid element 133 // of a biological sequence. 134 static void Validate(const CSeq_data& in_seq, 135 vector<TSeqPos>* badIdx, 136 TSeqPos uBeginIdx = 0, 137 TSeqPos uLength = 0); 138 139 // Get ambiguous bases. out_indices returns 140 // the indices relative to in_seq of ambiguous bases. 141 // out_seq returns the ambiguous bases. Note, there are 142 // only ambiguous bases for iupacna->ncib2na and 143 // ncib4na->ncbi2na coversions. 144 static TSeqPos GetAmbigs(const CSeq_data& in_seq, 145 CSeq_data* out_seq, 146 vector<TSeqPos>* out_indices, 147 CSeq_data::E_Choice to_code = CSeq_data::e_Ncbi2na, 148 TSeqPos uBeginIdx = 0, 149 TSeqPos uLength = 0); 150 151 // Get a copy of CSeq_data. No conversion is done. uBeginIdx of the 152 // biological sequence in in_seq will be in position 153 // 0 of out_seq. Usually, uLength bases will be copied 154 // from in_seq to out_seq. If uLength goes beyond the end of 155 // in_seq, it will be shortened to go to the end of in_seq. 156 // For packed sequence formats (ncbi2na and ncbi4na), 157 // only uLength bases are valid copies. For example, 158 // in an ncbi4na encoded sequence, if uLength is odd, the last 159 // sequence returned will be uLength+1 because 2 bases are encoded 160 // per byte in ncbi4na. However, in this case, uLength will be returned 161 // unchanged (it will remain odd unless it goes beyond the end 162 // of in_seq). If uLength=0, then a copy from uBeginIdx to the end 163 // of in_seq is returned. 164 static TSeqPos GetCopy(const CSeq_data& in_seq, 165 CSeq_data* out_seq, 166 TSeqPos uBeginIdx = 0, 167 TSeqPos uLength = 0); 168 169 // Method to keep only a contiguous piece of a sequence beginning 170 // at uBeginIdx and uLength residues long. Does bit shifting as 171 // needed to put uBeginIdx of original sequence at position zero on output. 172 // Similar to GetCopy(), but done in place. Returns length of 173 // kept sequence. 174 static TSeqPos Keep(CSeq_data* in_seq, 175 TSeqPos uBeginIdx = 0, 176 TSeqPos uLength = 0); 177 178 // Append in_seq2 to to end of in_seq1. Both in seqs must be 179 // in the same alphabet or this method will throw a runtime_error. 180 // The result of the append will be put into out_seq. 181 // For packed sequences ncbi2na and ncbi4na, Append will shift and 182 // append so as to remove any jaggedness at the append point. 183 static TSeqPos Append(CSeq_data* out_seq, 184 const CSeq_data& in_seq1, 185 TSeqPos uBeginIdx1, 186 TSeqPos uLength1, 187 const CSeq_data& in_seq2, 188 TSeqPos uBeginIdx2, 189 TSeqPos uLength2); 190 191 // Create a biological complement of an na sequence. 192 // Attempts to complement an aa sequence will throw 193 // a runtime_error. Returns length of complemented sequence. 194 195 // Complement the input sequence in place 196 static TSeqPos Complement(CSeq_data* in_seq, 197 TSeqPos uBeginIdx = 0, 198 TSeqPos uLength = 0); 199 200 // Complement the input sequence and put the result in 201 // the output sequence 202 static TSeqPos Complement(const CSeq_data& in_seq, 203 CSeq_data* out_seq, 204 TSeqPos uBeginIdx = 0, 205 TSeqPos uLength = 0); 206 207 // Create a biological sequence that is the reversse 208 // of an input sequence. Attempts to reverse an aa 209 // sequence will throw a runtime_error. Returns length of 210 // reversed sequence. 211 212 // Reverse a sequence in place 213 static TSeqPos Reverse(CSeq_data* in_seq, 214 TSeqPos uBeginIdx = 0, 215 TSeqPos uLength = 0); 216 217 // Reverse an input sequence and put result in output sequence. 218 // Reverses packed bytes as necessary. 219 static TSeqPos Reverse(const CSeq_data& in_seq, 220 CSeq_data* out_seq, 221 TSeqPos uBeginIdx = 0, 222 TSeqPos uLength = 0); 223 224 225 // Create the reverse complement of an input sequence. Attempts 226 // to reverse-complement an aa sequence will throw a 227 // runtime_error. 228 229 // Reverse complement a sequence in place 230 static TSeqPos ReverseComplement(CSeq_data* in_seq, 231 TSeqPos uBeginIdx = 0, 232 TSeqPos uLength = 0); 233 234 // Reverse complmenet a sequence and put result in output sequence 235 static TSeqPos ReverseComplement(const CSeq_data& in_seq, 236 CSeq_data* out_seq, 237 TSeqPos uBeginIdx = 0, 238 TSeqPos uLength = 0); 239 240 // Given an Ncbistdaa input code index, returns the 3 letter Iupacaa3 code 241 // Implementation is efficient 242 static const string& GetIupacaa3(TIndex ncbistdaa); 243 244 // Given a code type expressible as enum CSeq_data::E_Choice, returns 245 // true if the code type is available. See Seq_data_.hpp for definition 246 // of enum CSeq_data::E_Choice. 247 static bool IsCodeAvailable(CSeq_data::E_Choice code_type); 248 249 // Same as above, but for code types expressible as enum ESeq_code_type. 250 // See Seq_code_type_.hpp for definition of enum ESeq_code_type. 251 static bool IsCodeAvailable(ESeq_code_type code_type); 252 253 // Gets the first and last indices of a code type (e.g., for iupacna, 254 // the first index is 65 and the last index is 89). Throws CBadType 255 // if code_type is not available 256 static TPair GetCodeIndexFromTo(CSeq_data::E_Choice code_type); 257 258 // Same as above but takes code type expressed as an ESeq_code_type 259 static TPair GetCodeIndexFromTo(ESeq_code_type code_type); 260 261 // Given an index for any code type expressible as a 262 // CSeq_data::E_Choice, returns the corresponding symbol (code) 263 // (e.g., for iupacna, idx 65 causes "A" to be returned) 264 // if the code type is available (e.g., ncbi8aa is not currently 265 // available). Use IsCodeAvailable() to find out which code types 266 // are available. Throws CBadType if code_type not available. Throws 267 // CBadIndex if idx out of range for code_type. 268 static const string& GetCode(CSeq_data::E_Choice code_type, 269 TIndex idx); 270 271 // Similar to above, but works for all code types expressible as 272 // a ESeq_code_type (i.e., iupacaa3 is expressible as an 273 // ESeq_code_type but not as a CSeq_data::E_Choice) 274 static const string& GetCode(ESeq_code_type code_type, 275 TIndex idx); 276 277 // Given an index for any code type expressible as a 278 // CSeq_data::E_Choice, returns the corresponding name 279 // (e.g., for iupacna, idx 65 causes "Adenine" to be returned) 280 // if the code type is available (e.g., ncbi8aa is not currently 281 // available). Use IsCodeAvailable() to find out which code types 282 // are available. Throws CBadType if code_type not available. Throws 283 // CBadIndex if idx out of range for code_type. 284 static const string& GetName(CSeq_data::E_Choice code_type, 285 TIndex idx); 286 287 // Similar to above, but works for all code types expressible as 288 // a ESeq_code_type (i.e., iupacaa3 is expressible as an 289 // ESeq_code_type but not as a CSeq_data::E_Choice) 290 static const string& GetName(ESeq_code_type code_type, 291 TIndex idx); 292 293 // Given a code (symbol) for any code type expressible as a 294 // CSeq_data::E_Choice, returns the corresponding index 295 // (e.g., for iupacna, symbol "A" causes 65 to be returned) 296 // if the code type is available (e.g., ncbi8aa is not currently 297 // available). Use IsCodeAvailable() to find out which code types 298 // are available.Throws CBadType if code_type not available. Throws 299 // CBadSymbol if code is not a valid symbol for code_type. 300 static TIndex GetIndex(CSeq_data::E_Choice code_type, const string& code); 301 302 // Similar to above, but works for all code types expressible as 303 // a ESeq_code_type (i.e., iupacaa3 is expressible as an 304 // ESeq_code_type but not as a CSeq_data::E_Choice) 305 static TIndex GetIndex(ESeq_code_type code_type, const string& code); 306 307 // Get the index that is the complement of the index for the 308 // input CSeq_data::E_Choice code type (e.g., for iupacna, the 309 // complement of index 66 is 86). Throws CBadType if complements for 310 // code_type not available. Throws CBadIndex if idx out of range for 311 // code_type 312 static TIndex GetIndexComplement(CSeq_data::E_Choice code_type, 313 TIndex idx); 314 315 // Same as above, but for code type expressible as ESeq_code_type. 316 static TIndex GetIndexComplement(ESeq_code_type code_type, 317 TIndex idx); 318 319 // Takes an index for a from_type code and returns the index for to_type 320 // (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na, 321 // 68) returns 2). Returns 255 if no map value exists for a legal 322 // index (e.g., GetMapToIndex(CSeq_data::e_Iupacna, CSeq_data::e_Ncbi2na, 323 // 69) returns 255). Throws CBadType if map for from_type to to_type not 324 // available. Throws CBadIndex if from_idx out of range for from_type. 325 static TIndex GetMapToIndex(CSeq_data::E_Choice from_type, 326 CSeq_data::E_Choice to_type, 327 TIndex from_idx); 328 329 // Same as above, but uses ESeq_code_type. 330 static TIndex GetMapToIndex(ESeq_code_type from_type, 331 ESeq_code_type to_type, 332 TIndex from_idx); 333 334 private: 335 336 // we maintain a singleton internally 337 // the singleton can be retrieved by calling x_GetImplementation() 338 static CSeqportUtil_implementation& x_GetImplementation (void); 339 }; 340 341 342 END_objects_SCOPE 343 END_NCBI_SCOPE 344 345 #endif /* OBJECTS_SEQ___SEQPORT_UTIL__HPP */ 346