1 #ifndef ALGO_BLAST_API___REMOTE_SERVICES__HPP
2 #define ALGO_BLAST_API___REMOTE_SERVICES__HPP
3 
4 /*  $Id: blast_services.hpp 575325 2018-11-27 18:22:00Z ucko $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Authors:  Christiam Camacho, Kevin Bealer
30  *
31  */
32 
33 /// @file blast_services.hpp
34 /// Declares the CBlastServices class.
35 
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbiobj.hpp>
38 #include <objects/seqloc/Seq_interval.hpp>
39 #include <objects/blast/blast__.hpp>
40 #include <objects/blast/names.hpp>
41 #include <objects/scoremat/PssmWithParameters.hpp>
42 
43 /** @addtogroup AlgoBlast
44  *
45  * @{
46  */
47 
48 BEGIN_NCBI_SCOPE
49 
50 BEGIN_SCOPE(objects)
51     /// forward declaration of ASN.1 object containing PSSM (scoremat.asn)
52     class CBioseq_set;
53     class CSeq_loc;
54     class CSeq_id;
55     class CSeq_align_set;
56 END_SCOPE(objects)
57 
58 using namespace ncbi::objects;
59 
60 #ifndef NCBI_MODULE
61 #define NCBI_MODULE NETBLAST
62 #endif
63 
64 /// RemoteServicesException
65 ///
66 
67 class NCBI_XOBJREAD_EXPORT CBlastServicesException : public CException {
68 public:
69     /// Errors are classified into one of two types.
70     enum EErrCode {
71         /// Argument validation failed.
72         eArgErr,
73 
74         /// Files were missing or contents were incorrect.
75         eFileErr,
76 
77         /// Request failed
78         eRequestErr,
79 
80         /// Memory allocation failed.
81         eMemErr
82     };
83 
84     /// Get a message describing the situation leading to the throw.
GetErrCodeString() const85     virtual const char* GetErrCodeString() const override
86     {
87         switch ( GetErrCode() ) {
88         case eArgErr:  return "eArgErr";
89         case eFileErr: return "eFileErr";
90         case eRequestErr: return "eRequestErr";
91         default:       return CException::GetErrCodeString();
92         }
93     }
94 
95     /// Include standard NCBI exception behavior.
96     NCBI_EXCEPTION_DEFAULT(CBlastServicesException, CException);
97 };
98 
99 
100 
101 /// API for Remote Blast Services
102 ///
103 /// Class to obtain information and data from the Remote BLAST service that is
104 /// not associated with a specific BLAST search
105 
106 class NCBI_XOBJREAD_EXPORT CBlastServices : public CObject
107 {
108 public:
109     /// Default constructor
CBlastServices()110     CBlastServices() { m_Verbose = false; }
111 
112     /// Analogous to CRemoteBlast::SetVerbose
SetVerbose(bool value=true)113     void SetVerbose(bool value = true) { m_Verbose = value; }
114 
115     /// Returns true if the BLAST database specified exists in the NCBI servers
116     /// @param dbname BLAST database name [in]
117     /// @param is_protein is this a protein database? [in]
118     bool IsValidBlastDb(const string& dbname, bool is_protein);
119 
120     /// Retrieve detailed information for one BLAST database
121     /// If information about multiple databases is needed, use
122     /// the other GetDatabaseInfo method.
123     ///
124     /// @param blastdb object describing the database for which to get
125     /// detailed information
126     /// @return Detailed information for the requested BLAST database or an
127     /// empty object is the requested database wasn't found
128     CRef<objects::CBlast4_database_info>
129     GetDatabaseInfo(CRef<objects::CBlast4_database> blastdb);
130 
131     /// Retrieve detailed information for databases listed
132     /// in the string.  If more than one database is supplied, it
133     /// they should be separated by spaces (e.g., "nt wgs est").
134     ///
135     /// @param dbname string listing the database(s)
136     /// @param is_protein is a protein for true, otherwise dna
137     /// @param found_all true if all databases were found.
138     /// @param missing_names pointer to an array with missing database(s)
139     /// @return Detailed information for the requested BLAST databases or an
140     /// empty vector if no databases were found.
141     vector< CRef<objects::CBlast4_database_info> >
142     GetDatabaseInfo(const string& dbname, bool is_protein, bool *found_all,
143 	    vector<string> *missing_names = NULL);
144     /// Same as GetDatabaseInfo but  retrieving whole list of database
145     vector< CRef<objects::CBlast4_database_info> >
146     GetDatabaseInfoLegacy(const string& dbname, bool is_protein, bool *found_all,
147 	    vector<string> *missing_names = NULL);
148     /// Retrieve organism specific repeats databases
149     vector< CRef<objects::CBlast4_database_info> >
150     GetOrganismSpecificRepeatsDatabases();
151 
152     /// Retrieve a list of NCBI taxonomy IDs for which there exists
153     /// windowmasker masking data to support an alternative organism specific
154     /// filtering
155     objects::CBlast4_get_windowmasked_taxids_reply::Tdata
156     GetTaxIdWithWindowMaskerSupport();
157 
158     /// Defines a std::vector of CRef<CSeq_id>
159     typedef vector< CRef<objects::CSeq_id> > TSeqIdVector;
160     /// Defines a std::vector of CRef<CBioseq>
161     typedef vector< CRef<objects::CBioseq> > TBioseqVector;
162 
163    /// Get a set of Bioseqs without their sequence data given an input set of
164     /// Seq-ids.
165     ///
166     /// @param seqids   A vector of Seq-ids for which Bioseqs are requested.
167     /// @param database A list of databases from which to get the sequences.
168     /// @param seqtype  The residue type, 'p' from protein, 'n' for nucleotide.
169     /// @param bioseqs  The vector used to return the requested Bioseqs.
170     /// @param errors   A null-separated list of errors.
171     /// @param warnings A null-separated list of warnings.
172     /// @param verbose  Produce verbose output. [in]
173     /// @param target_only Filter the defline to include only the requested id. [in]
174     /// @todo FIXME: Add retry logic in case of transient errors
175     static void
176     GetSequencesInfo(TSeqIdVector& seqids,      // in
177                      const string& database,    // in
178                      char seqtype,              // 'p' or 'n'
179                      TBioseqVector& bioseqs,    // out
180                      string& errors,            // out
181                      string& warnings,          // out
182                      bool verbose = false,      // in
183                      bool target_only = false); // in
184 
185     /// Get a set of Bioseqs given an input set of Seq-ids.
186     ///
187     /// This retrieves the Bioseqs corresponding to the given Seq-ids
188     /// from the blast4 server.  Normally this will be much faster
189     /// than consulting ID1 seperately for each sequence.  Sometimes
190     /// there are multiple sequences for a given Seq-id.  In such
191     /// cases, there are always 'non-ambiguous' ids available.  This
192     /// interface does not currently address this issue, and will
193     /// simply return the Bioseqs corresponding to one of the
194     /// sequences.  Errors will be returned if the operation cannot be
195     /// completed (or started).  In the case of a sequence that cannot
196     /// be found, the error will indicate the index of (and Seq-id of)
197     /// the missing sequence; processing will continue, and the
198     /// sequences that can be found will be returned along with the
199     /// error.
200     ///
201     /// @param seqids   A vector of Seq-ids for which Bioseqs are requested.
202     /// @param database A list of databases from which to get the sequences.
203     /// @param seqtype  The residue type, 'p' from protein, 'n' for nucleotide.
204     /// @param bioseqs  The vector used to return the requested Bioseqs.
205     /// @param errors   A null-separated list of errors.
206     /// @param warnings A null-separated list of warnings.
207     /// @param verbose  Produce verbose output. [in]
208     /// @param target_only Filter the defline to include only the requested id. [in]
209     /// @todo FIXME: Add retry logic in case of transient errors
210     static void
211     GetSequences(TSeqIdVector& seqids,      // in
212                  const string& database,    // in
213                  char seqtype,              // 'p' or 'n'
214                  TBioseqVector& bioseqs,    // out
215                  string& errors,            // out
216                  string& warnings,          // out
217                  bool verbose = false,      // in
218                  bool target_only = false); // in
219     /// Defines a std::vector of CRef<CSeq_interval>
220     typedef vector< CRef<objects::CSeq_interval> > TSeqIntervalVector;
221     /// Defines a std::vector of CRef<CSeq_data>
222     typedef vector< CRef<objects::CSeq_data> > TSeqDataVector;
223 
224     /// This retrieves (partial) sequence data from the remote BLAST server.
225     ///
226     /// @param seqid
227     ///     A vector of Seq-ids for which sequence data are requested. [in]
228     /// @param database
229     ///     A list of databases from which to get the sequences. [in]
230     /// @param seqtype
231     ///     The residue type, 'p' from protein, 'n' for nucleotide. [in]
232     /// @param ids
233     ///     The sequence IDs for those sequences which the seq data was
234     //      obtained successfully [out]
235     /// @param seq_data
236     ///     Sequence data in CSeq_data format. [out]
237     /// @param errors
238     ///     An error message (if any). [out]
239     /// @param warnings
240     ///     A warning (if any). [out]
241     /// @param verbose
242     ///     Produce verbose output. [in]
243     /// @todo FIXME: Add retry logic in case of transient errors
244     static void
245     GetSequenceParts(const TSeqIntervalVector   & seqids,    // in
246                      const string               & database,  // in
247                      char                         seqtype,   // 'p' or 'n'
248                      TSeqIdVector               & ids,       // out
249                      TSeqDataVector             & seq_data,  // out
250                      string                     & errors,    // out
251                      string                     & warnings,  // out
252                      bool                         verbose = false);// in
253 
254 private:
255 
256     /// Retrieve the BLAST databases available for searching
257     void x_GetAvailableDatabases();
258 
259     /// Look for a database matching this method's argument and returned
260     /// detailed information about it.
261     /// @param blastdb database description
262     /// @return detailed information about the database requested or an empty
263     /// CRef<> if the database was not found
264     CRef<objects::CBlast4_database_info>
265     x_FindDbInfoFromAvailableDatabases(CRef<objects::CBlast4_database> blastdb);
266 
267     /// Prohibit copy construction.
268     CBlastServices(const CBlastServices &);
269 
270     /// Prohibit assignment.
271     CBlastServices & operator=(const CBlastServices &);
272 
273 
274     // Data
275 
276     /// BLAST databases available to search
277     objects::CBlast4_get_databases_reply::Tdata m_AvailableDatabases;
278     /// Taxonomy IDs for which there's windowmasker masking data at NCBI
279     objects::CBlast4_get_windowmasked_taxids_reply::Tdata m_WindowMaskedTaxIds;
280     /// Display verbose output to stdout?
281     bool m_Verbose;
282 };
283 
284 #undef NCBI_MODULE
285 
286 END_NCBI_SCOPE
287 
288 /* @} */
289 
290 #endif  /* ALGO_BLAST_API___REMOTE_SERVICES__HPP */
291