1 /* $Id: cuSequence.hpp 609836 2020-06-08 15:56:03Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Adapted from CDTree1 code by Chris Lanczycki
27  *
28  * File Description:
29  *
30  *       Functions for manipulating Bioseqs and other sequence representations
31  *
32  * ===========================================================================
33  */
34 
35 #ifndef CU_SEQUENCE_HPP
36 #define CU_SEQUENCE_HPP
37 
38 // include ncbistd.hpp, ncbiobj.hpp, ncbi_limits.h, various stl containers
39 #include <corelib/ncbiargs.hpp>
40 #include <corelib/ncbienv.hpp>
41 #include <corelib/ncbistre.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seqloc/Seq_id.hpp>
44 #include <objects/seqset/Seq_entry.hpp>
45 #include <objects/seqblock/PDB_block.hpp>
46 
47 BEGIN_NCBI_SCOPE
48 USING_SCOPE(objects);
49 
50 BEGIN_SCOPE(cd_utils)
51 
52 //  The taxonomy id (TTaxId) assigned to environmental sequences.
//  NOTE(review): presumably the value IsEnvironmentalSeq() tests against -- confirm in the implementation file.
53 const  TTaxId ENVIRONMENTAL_SEQUENCE_TAX_ID = TAX_ID_CONST(256318);
54 
55 //  Wraps the CSeq_id.Match(id) method, i.e., evaluates id1.Match(id2).
//  NOTE(review): handling of an empty CRef for either argument is not visible in this header -- confirm.
56 NCBI_CDUTILS_EXPORT
57 bool SeqIdsMatch(const CRef< CSeq_id>& id1, const CRef< CSeq_id>& id2);
58 
59 //  Does 'id' match any CSeq_id in 'bioseq'?  (Comparisons use SeqIdsMatch, declared above.)
60 NCBI_CDUTILS_EXPORT
61 bool SeqIdHasMatchInBioseq(const CRef< CSeq_id>& id, const CBioseq& bioseq);
62 
63 //  Returns the CDD PSSM id carried by 'id'.
//  Returns 0 if the Seq_id is not of the proper type (e_General and database 'CDD').
64 NCBI_CDUTILS_EXPORT
65 int  GetCDDPssmIdFromSeqId(const CRef< CSeq_id >& id);
66 
67 //  Returns the MMDB id found in 'bioseq', or -1 on failure; was FindMMDBIdInBioseq.
68 NCBI_CDUTILS_EXPORT
69 int    GetMMDBId (const CBioseq& bioseq);
70 
71 //  Returns the tax id found in 'bioseq'.  Consistent w/ the CTaxon1 class:  returns 0 if no
72 //  tax id was found, or -(firstTaxId) if multiple tax ids were found.
73 NCBI_CDUTILS_EXPORT
74 TTaxId  GetTaxIdInBioseq(const CBioseq& bioseq);
75 
//  NOTE(review): undocumented in this header.  Presumably returns true when the tax id of 'bioseq'
//  equals ENVIRONMENTAL_SEQUENCE_TAX_ID; the actual test lives in the implementation -- confirm there.
76 NCBI_CDUTILS_EXPORT
77 bool IsEnvironmentalSeq(const CBioseq& bioseq);
78 
79 //  Returns the species description for 'bioseq' as a string.
80 //  An empty string is returned on failure; was CCd::GetSpecies(...).
81 NCBI_CDUTILS_EXPORT
82 string GetSpeciesFromBioseq(const CBioseq& bioseq);
83 
84 //  Sequence length; the length is 0 if an error condition is detected.
85 //  Incorporates code from cdt_vutils & cdt_manipcd.
86 NCBI_CDUTILS_EXPORT
87 int    GetSeqLength(const CBioseq& bioseq);
//  CSeq_entry overload:  the length is delivered through the 'len' output argument.
//  NOTE(review): the meaning of the bool return (presumably success/failure) is not stated here -- confirm.
88 NCBI_CDUTILS_EXPORT
89 bool   GetSeqLength(const CRef< CSeq_entry >& seqEntry, int& len);
90 
//  Going by the name, converts the Ncbistdaa residues in 'vec' to an Ncbieaa string written to '*str'.
//  NOTE(review): handling of a null 'str' pointer is not visible in this header -- confirm.
91 NCBI_CDUTILS_EXPORT
92 void   NcbistdaaToNcbieaaString(const vector< char >& vec, string* str);  //  StringFromStdaa(...)
93 //  Returns false if there was an exception trying to convert the input string 'str'.
94 //  Returns true otherwise, including for the case of an empty input string.
//  The converted data is delivered through the 'vec' output argument.
95 NCBI_CDUTILS_EXPORT
96 bool NcbieaaToNcbistdaaString(const std::string& str, vector < char >& vec);
//  The functions below extract sequence data from a CBioseq or a CSeq_entry; results are delivered
//  through the 'str' / 'seqData' output arguments, or as the return value of GetRawSequenceString.
//  NOTE(review): the bool returns presumably signal success; this header does not say -- confirm.
97 NCBI_CDUTILS_EXPORT
98 bool   GetNcbieaaString(const CBioseq& bioseq, string& str);
99 NCBI_CDUTILS_EXPORT
100 bool GetNcbistdSeq(const CBioseq& bioseq, vector<char>& seqData);
101 NCBI_CDUTILS_EXPORT
102 bool   GetNcbieaaString(const CRef< CSeq_entry >& seqEntry, string& str);  //  from cdt_manipcd
103 NCBI_CDUTILS_EXPORT
104 string GetRawSequenceString(const CBioseq& bioseq);
105 
106 //  Returns the residue at position 'pos'.  On failure, returns \0 (i.e., null character).
107 //  If zeroBasedPos == true, the first letter is at index 0; otherwise residues are numbered from 1.
108 NCBI_CDUTILS_EXPORT
109 char   GetResidueAtPosition(const CBioseq& bioseq, int pos, bool zeroBasedPos = true);
110 NCBI_CDUTILS_EXPORT
111 char   GetResidueAtPosition(const CRef< CSeq_entry >& seqEntry, int pos, bool zeroBasedPos = true);
112 
//  NOTE(review): the four functions below are undocumented in this header.  The notes here are read
//  off the names and signatures only; confirm return-value semantics against the implementation.
//  IsConsensus:  presumably tests whether 'seqId' identifies a consensus sequence.
113 NCBI_CDUTILS_EXPORT
114 bool IsConsensus(const CRef< CSeq_id >& seqId);
//  GetAccAndVersion:  'acc', 'version' and 'seqId' are output arguments.
115 NCBI_CDUTILS_EXPORT
116 bool GetAccAndVersion(const CRef< CBioseq > bioseq, string& acc, int& version, CRef< CSeq_id>& seqId);
//  GetPDBBlockFromSeqEntry:  'pdbBlock' is the output argument.
117 NCBI_CDUTILS_EXPORT
118 bool GetPDBBlockFromSeqEntry(CRef< CSeq_entry > seqEntry, CRef< CPDB_block >& pdbBlock);
//  checkAndFixPdbBioseq:  takes a CRef to a non-const CBioseq, so the bioseq may be modified -- TODO confirm.
119 NCBI_CDUTILS_EXPORT
120 bool checkAndFixPdbBioseq(CRef< CBioseq > bioseq);
121 
122 //  Both functions return 'false' if the bioseq doesn't have a gi-typed seq-id.
123 //  The last arg ('nth', default 1) tells which id to use if there are multiple gis.
//  The result is delivered through 'gi' (ExtractGi) or 'giSeqId' (CopyGiSeqId).
124 NCBI_CDUTILS_EXPORT
125 bool ExtractGi(const CRef<CBioseq>& bioseq, TGi& gi, unsigned int nth = 1);
126 NCBI_CDUTILS_EXPORT
127 bool CopyGiSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& giSeqId, unsigned int nth = 1);
128 
129 //  Both functions return 'false' if the bioseq doesn't have a pdb-typed seq-id.
130 //  The last arg ('nth', default 1) tells which id to use if there are multiple pdbs.
//  The result is delivered through 'pdbMol'/'pdbChain' (ExtractPdbMolChain) or 'pdbSeqId' (CopyPdbSeqId).
131 NCBI_CDUTILS_EXPORT
132 bool ExtractPdbMolChain(const CRef<CBioseq>& bioseq, string& pdbMol, string& pdbChain, unsigned int nth = 1);
133 NCBI_CDUTILS_EXPORT
134 bool CopyPdbSeqId(const CRef<CBioseq>& bioseq, CRef<CSeq_id>& pdbSeqId, unsigned int nth = 1);
135 
136 //  Returns true iff at least one id of the requested type ('choice') is found.
137 NCBI_CDUTILS_EXPORT
138 bool HasSeqIdOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice);
139 NCBI_CDUTILS_EXPORT
140 bool HasSeqIdOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice);
141 
142 //  Returns the number of ids of the requested type ('choice') found.
143 //  The CSeq_id objects returned in 'idsOfType' are copies of those found in the bioseq/seqEntry.
//  NOTE(review): whether 'idsOfType' is cleared first or appended to is not stated here -- confirm.
144 NCBI_CDUTILS_EXPORT
145 unsigned int CopySeqIdsOfType(const CBioseq& bioseq, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);
146 NCBI_CDUTILS_EXPORT
147 unsigned int CopySeqIdsOfType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, list< CRef< CSeq_id > >& idsOfType);
148 
149 //  Returns 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
150 //  The CBioseq returned in 'seqEntryBioseq' is a copy of the one found in the seqEntry.
151 NCBI_CDUTILS_EXPORT
152 bool CopyBioseqWithType(const CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;
153 
154 //  Returns 'false' if the seqEntry doesn't have a bioseq containing a seq-id of the requested type.
155 //  The CBioseq returned in 'seqEntryBioseq' is an editable reference to the one in the CSeq_entry
//  passed in (not a copy), so changes made through it affect 'seqEntry'.
156 NCBI_CDUTILS_EXPORT
157 bool GetBioseqWithType(CRef< CSeq_entry >& seqEntry, CSeq_id::E_Choice choice, CRef< CBioseq >& seqEntryBioseq) ;
158 
159 //  Adds 'comment' to 'bioseq'.  Returns 'false' if the comment was not added.
//  Empty comment strings are not added.
160 NCBI_CDUTILS_EXPORT
161 bool AddCommentToBioseq(CBioseq& bioseq, const string& comment);
162 
163 //  Simplifies the CBioseq object (in place) to strip out elements not needed in a CD.
164 //  Keeps any comment-type CSeqdesc that matches a string in 'keptComments',
165 //  and keeps the CPDB_block for PDB CSeqdesc if 'keepPDBBlock' is true.
166 //  Initially used for simplifying CBioseqs in CSeq_entry blobs retrieved from ID1.
167 NCBI_CDUTILS_EXPORT
168 void SimplifyBioseqForCD(CBioseq& bioseq, const vector<string>& keptComments, bool keepPDBBlock);
169 
170 //  Simplifies the CBioseq objects in a CSeq_entry to strip out elements not needed in a CD.
171 //  Wrapper for SimplifyBioseqForCD; 'keptComments' and 'keepPDBBlock' have the same meaning as there.
172 NCBI_CDUTILS_EXPORT
173 void SimplifySeqEntryForCD(CRef< CSeq_entry >& seqEntry, const vector<string>& keptComments, bool keepPDBBlock);
174 
175 //  The first two functions are wrappers for the third (GetAccessionAndDatabaseSource), which
176 //  extracts a database source or accession for any Seq-id type.
177 NCBI_CDUTILS_EXPORT
178 string GetDbSourceForSeqId(const CRef< CSeq_id >& seqID);   //  gets the most exact source
179 NCBI_CDUTILS_EXPORT
180 string GetAccessionForSeqId(const CRef< CSeq_id >& seqID);
181 
182 //  Results are delivered through the 'accession' and 'dbSource' output arguments.
//  If the 'getGenericSource' flag is true, only the generic type of the database source is reported;
183 //  when false, a more exact dbSource is returned, where possible:  relevant primarily when dealing
184 //  with a refseq.
185 NCBI_CDUTILS_EXPORT
186 void GetAccessionAndDatabaseSource(const CRef< CSeq_id >& seqID, string& accession, string& dbSource, bool getGenericSource = true);
187 
//  Plain record describing a bioseq; it is the output argument of extractBioseqInfo().
//  The numeric members carry default initializers so that a default-constructed
//  BioseqInfo never holds indeterminate values.
//  NOTE(review): the encoding of 'dbsource' is not visible in this header -- confirm in the implementation.
struct BioseqInfo
{
	std::string acession;   //  accession string (spelling of the member name kept for source compatibility)
	int version = 0;        //  accession version
	std::string defline;    //  definition line
	short dbsource = 0;     //  database source code
};
195 
196 //  Extracts information about 'bioseq' into the BioseqInfo output argument.
//  Returns false if no accession is found.
197 NCBI_CDUTILS_EXPORT
198 bool extractBioseqInfo(const CRef< CBioseq > bioseq, BioseqInfo&);
199 
200 END_SCOPE(cd_utils) // namespace ncbi::cd_utils
201 
202 END_NCBI_SCOPE
203 
204 
205 #endif // CU_SEQUENCE_HPP
206