/* $Id: cuSequence.hpp 609836 2020-06-08 15:56:03Z grichenk $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  Adapted from CDTree1 code by Chris Lanczycki
 *
 * File Description:
 *
 *       Functions for manipulating Bioseqs and other sequence representations
 *
 * ===========================================================================
 */

#ifndef CU_SEQUENCE_HPP
#define CU_SEQUENCE_HPP

// These headers include ncbistd.hpp, ncbiobj.hpp, ncbi_limits.h, and various STL containers.
#include <corelib/ncbiargs.hpp>
#include <corelib/ncbienv.hpp>
#include <corelib/ncbistre.hpp>
#include <objects/seq/Bioseq.hpp>
#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqset/Seq_entry.hpp>
#include <objects/seqblock/PDB_block.hpp>

BEGIN_NCBI_SCOPE
USING_SCOPE(objects);

BEGIN_SCOPE(cd_utils)

// The taxonomy id used for environmental sequences.
const TTaxId ENVIRONMENTAL_SEQUENCE_TAX_ID = TAX_ID_CONST(256318);

// Wraps the CSeq_id.Match(id) method, called as id1.Match(id2).
NCBI_CDUTILS_EXPORT
bool SeqIdsMatch(const CRef< CSeq_id>& id1, const CRef< CSeq_id>& id2);

// Does 'id' match any CSeq_id in 'bioseq'?  (Uses SeqIdsMatch above.)
NCBI_CDUTILS_EXPORT
bool SeqIdHasMatchInBioseq(const CRef< CSeq_id>& id, const CBioseq& bioseq);

// Returns 0 if the Seq_id is not of the proper type (e_General and database 'CDD').
// NOTE(review): otherwise presumably returns the PSSM id carried by the Seq_id -- confirm
// against the implementation.
NCBI_CDUTILS_EXPORT
int GetCDDPssmIdFromSeqId(const CRef< CSeq_id >& id);

// Returns -1 on failure; was FindMMDBIdInBioseq.
NCBI_CDUTILS_EXPORT
int GetMMDBId (const CBioseq& bioseq);

// Consistent with the CTaxon1 class: returns 0 if no tax id was found,
// or -(firstTaxId) if multiple tax ids were found.
NCBI_CDUTILS_EXPORT
TTaxId GetTaxIdInBioseq(const CBioseq& bioseq);

// NOTE(review): no contract is documented here; presumably true when the tax id of 'bioseq'
// is ENVIRONMENTAL_SEQUENCE_TAX_ID -- confirm against the implementation.
NCBI_CDUTILS_EXPORT
bool IsEnvironmentalSeq(const CBioseq& bioseq);

// Returns the species description as a string.
// An empty string is returned on failure; was CCd::GetSpecies(...).
NCBI_CDUTILS_EXPORT
string GetSpeciesFromBioseq(const CBioseq& bioseq);

// The length is 0 if an error condition is detected.
// Incorporates code from cdt_vutils & cdt_manipcd.
// NOTE(review): for the CSeq_entry overload the length is delivered through 'len'; the meaning
// of its bool return is not stated here -- presumably success/failure, confirm.
NCBI_CDUTILS_EXPORT
int GetSeqLength(const CBioseq& bioseq);
NCBI_CDUTILS_EXPORT
bool GetSeqLength(const CRef< CSeq_entry >& seqEntry, int& len);

// Converts Ncbistdaa data in 'vec' to an Ncbieaa string written through 'str' (per the
// function name; the legacy note below is retained from the original).
NCBI_CDUTILS_EXPORT
void NcbistdaaToNcbieaaString(const vector< char >& vec, string* str); // StringFromStdaa(...)
// Returns false if there was an exception trying to convert the input string.
// Returns true otherwise, including for the case of an empty input string.
NCBI_CDUTILS_EXPORT
bool NcbieaaToNcbistdaaString(const std::string& str, vector < char >& vec);
// NOTE(review): the bool results of the three accessors below are undocumented here --
// presumably 'true' on success with the sequence written to the output argument; confirm.
NCBI_CDUTILS_EXPORT
bool GetNcbieaaString(const CBioseq& bioseq, string& str);
NCBI_CDUTILS_EXPORT
bool GetNcbistdSeq(const CBioseq& bioseq, vector<char>& seqData);
NCBI_CDUTILS_EXPORT
bool GetNcbieaaString(const CRef< CSeq_entry >& seqEntry, string& str); // from cdt_manipcd
NCBI_CDUTILS_EXPORT
string GetRawSequenceString(const CBioseq& bioseq);

// On failure, returns the null character ('\0').
// If 'zeroBasedPos' is true, the first letter is at index 0; otherwise residues are numbered from 1.
NCBI_CDUTILS_EXPORT
char GetResidueAtPosition(const CBioseq& bioseq, int pos, bool zeroBasedPos = true);
NCBI_CDUTILS_EXPORT
char GetResidueAtPosition(const CRef< CSeq_entry >& seqEntry, int pos, bool zeroBasedPos = true);

// NOTE(review): the four declarations below carry no documented contract in this header;
// consult the implementation for return-value semantics and for whether the CRef arguments
// passed by value are modified.
NCBI_CDUTILS_EXPORT
bool IsConsensus(const CRef< CSeq_id >& seqId);
NCBI_CDUTILS_EXPORT
bool GetAccAndVersion(const CRef< CBioseq > bioseq, string& acc, int& version, CRef< CSeq_id>& seqId);
NCBI_CDUTILS_EXPORT
bool GetPDBBlockFromSeqEntry(CRef< CSeq_entry > seqEntry, CRef< CPDB_block >& pdbBlock);
NCBI_CDUTILS_EXPORT
bool checkAndFixPdbBioseq(CRef< CBioseq > bioseq);

// Return 'false' if the bioseq doesn't have a gi-typed seq-id.
// Last arg ('nth', default 1) tells which id to use if there are multiple gis.
NCBI_CDUTILS_EXPORT
bool ExtractGi(const CRef<CBioseq>& bioseq, TGi& gi, unsigned int nth = 1);
NCBI_CDUTILS_EXPORT
bool CopyGiSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& giSeqId, unsigned int nth = 1);

// Return 'false' if the bioseq doesn't have a pdb-typed seq-id.
// Last arg ('nth', default 1) tells which id to use if there are multiple pdbs.
NCBI_CDUTILS_EXPORT
bool ExtractPdbMolChain(const CRef<CBioseq>& bioseq, string& pdbMol, string& pdbChain, unsigned int nth = 1);
NCBI_CDUTILS_EXPORT
bool CopyPdbSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& pdbSeqId, unsigned int nth = 1);

// Returns true iff at least one id of the requested type is found.
NCBI_CDUTILS_EXPORT
bool HasSeqIdOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice);
NCBI_CDUTILS_EXPORT
bool HasSeqIdOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice);

// Returns the number of ids of the requested type found.
// The CSeq_id objects returned in 'idsOfType' are copies of those found in the bioseq/seqEntry.
NCBI_CDUTILS_EXPORT
unsigned int CopySeqIdsOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);
NCBI_CDUTILS_EXPORT
unsigned int CopySeqIdsOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);

// Return 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
// The returned CBioseq object is a copy of the one found in the seqEntry.
NCBI_CDUTILS_EXPORT
bool CopyBioseqWithType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;

// Return 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
// Returned CBioseq object is an editable reference to the one in the CSeq_entry passed in.
NCBI_CDUTILS_EXPORT
bool GetBioseqWithType(CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;

// Returns 'false' if the comment was not added.  Empty comment strings are not added.
NCBI_CDUTILS_EXPORT
bool AddCommentToBioseq(CBioseq& bioseq, const string& comment);

// Simplify the CBioseq object to strip out elements not needed in a CD.
// Keep any comment-type CSeqdesc that matches a string in 'keptComments',
// and keep the CPDB_block for PDB CSeqdesc if 'keepPDBBlock' is true.
// Initially used for simplifying CBioseqs in CSeq_entry blobs retrieved from ID1.
NCBI_CDUTILS_EXPORT
void SimplifyBioseqForCD(CBioseq& bioseq, const vector<string>& keptComments, bool keepPDBBlock);

// Simplify the CBioseq objects in a CSeq_entry to strip out elements not needed in a CD.
// Wrapper for SimplifyBioseqForCD.
NCBI_CDUTILS_EXPORT
void SimplifySeqEntryForCD(CRef< CSeq_entry >& seqEntry, const vector<string>& keptComments, bool keepPDBBlock);

// The first two functions are wrappers for the third (GetAccessionAndDatabaseSource), which
// extracts a database source or accession for any Seq-id type.
NCBI_CDUTILS_EXPORT
string GetDbSourceForSeqId(const CRef< CSeq_id >& seqID); // gets the most exact source
NCBI_CDUTILS_EXPORT
string GetAccessionForSeqId(const CRef< CSeq_id >& seqID);

// If the 'getGenericSource' flag is true, only the generic type of the database source is reported;
// when false, a more exact dbSource is returned, where possible: relevant primarily when dealing
// with a refseq.
NCBI_CDUTILS_EXPORT
void GetAccessionAndDatabaseSource(const CRef< CSeq_id >& seqID, string& accession, string& dbSource, bool getGenericSource = true);

// Plain aggregate filled in by extractBioseqInfo (declared below).
struct BioseqInfo
{
    // NOTE(review): the field name is misspelled ("acession"); it is left unchanged because
    // renaming it would break existing users of this struct.
    string acession;
    int version;
    string defline;
    // NOTE(review): the meaning of the 'dbsource' values is not visible in this header --
    // confirm against the implementation.
    short dbsource;
};

// Returns false if no accession is found.
NCBI_CDUTILS_EXPORT
bool extractBioseqInfo(const CRef< CBioseq > bioseq, BioseqInfo&);

END_SCOPE(cd_utils) // namespace cd_utils

END_NCBI_SCOPE


#endif // CU_SEQUENCE_HPP