1 /* $Id: cuSimpleB2SWrapper.hpp 200108 2010-08-04 17:01:17Z lanczyck $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Chris Lanczycki
27  *
28  * File Description:
29  *
30  *       Simplified API for a single blast-two-sequences call.
31  *       Does not involve CDs, and NOT optimized (or intended) to be called
32  *       in batch mode.  If you need to make many calls, use CdBlaster!
33  *
34  * ===========================================================================
35  */
36 
37 #ifndef CU_SIMPLEB2SWRAPPER_HPP
38 #define CU_SIMPLEB2SWRAPPER_HPP
39 
40 #include <vector>
41 #include <objmgr/object_manager.hpp>
42 #include <algo/blast/api/psibl2seq.hpp>
43 #include <algo/structure/cd_utils/cuMatrix.hpp>
44 #include <algo/structure/cd_utils/cuScoringMatrix.hpp>
45 
46 BEGIN_NCBI_SCOPE
47 USING_SCOPE(blast);
48 BEGIN_SCOPE(cd_utils)
49 
50 class NCBI_CDUTILS_EXPORT CSimpleB2SWrapper
51 {
RemoveAllDataLoaders()52     static void RemoveAllDataLoaders() {
53         int i = 1;
54         CRef<CObjectManager> om = CObjectManager::GetInstance();
55         CObjectManager::TRegisteredNames loader_names;
56         om->GetRegisteredNames(loader_names);
57         ITERATE(CObjectManager::TRegisteredNames, itr, loader_names) {
58             cout << "data loader " << i << ":  " << *itr << endl;
59             om->RevokeDataLoader(*itr);
60             ++i;
61         }
62     }
63 public:
64 	static const unsigned int HITLIST_SIZE_DEFAULT    ;
65 	static const unsigned int MAX_HITLIST_SIZE        ;
66 	static const Int8   CDD_DATABASE_SIZE             ;
67 	static const double E_VAL_DEFAULT                 ; // default e-value threshold
68 	static const double E_VAL_WHEN_NO_SEQ_ALIGN       ; // eval when Blast doesn't return a seq-align
69 	static const double SCORE_WHEN_NO_SEQ_ALIGN       ;
70 	static const string SCORING_MATRIX_DEFAULT        ;
71     static const ECompoAdjustModes COMPOSITION_ADJ_DEF;
72 	static const double DO_NOT_USE_PERC_ID_THRESHOLD  ; // user must provide this; no default is set
73 	static const Int8   DO_NOT_USE_EFF_SEARCH_SPACE   ; // user must provide this; no default is set
74 
75     //  If the default for 'percIdThold' is used, then the %identity filter will be off in B2S.
76     CSimpleB2SWrapper(double percIdThold = DO_NOT_USE_PERC_ID_THRESHOLD, string matrixName = SCORING_MATRIX_DEFAULT);
77 
78     //  Searches using the full-length sequences specified.
79     //  If the default for 'percIdThold' is used, then the %identity filter will be off in B2S.
80     CSimpleB2SWrapper(CRef<CBioseq>& seq1, CRef<CBioseq>& seq2, double percIdThold = DO_NOT_USE_PERC_ID_THRESHOLD, string matrixName = SCORING_MATRIX_DEFAULT);
81 
82     //  If from = to = 0, or from > to, use the full length.
83     //  From/to are not validated vs. sequence data in 'seq'.
SetSeq1(CRef<CBioseq> & seq,unsigned int from=0,unsigned int to=0)84     void SetSeq1(CRef<CBioseq>& seq, unsigned int from = 0, unsigned int to = 0) { SetSeq(seq, true, from, to);}
SetSeq2(CRef<CBioseq> & seq,unsigned int from=0,unsigned int to=0)85     void SetSeq2(CRef<CBioseq>& seq, unsigned int from = 0, unsigned int to = 0) { SetSeq(seq, false, from, to);}
86 
87     //  GENERAL NOTE on Set....() functions:
88     //  the return value is the value of the corresponding member variable on completion
89     //  (which is also the corresponding value in m_options when m_options is valid).
90     //  Invalid values used in these functions are ignored and no changes are made
91     //  to the settings, and return value will therefore differ from the passed argument.
92 
93     //  Must be between 0 and 100; otherwise no change is made.
94     //  NOTE:  this sets the %identity filter in the b2s algorithm, which seems to calculate %identity using
95     //         the smaller of the two sequences.
96     double SetPercIdThreshold(double percIdThold);
97 
98     //  Must be between 1 and MAX_HITLIST_SIZE, otherwise change is not made.
99     unsigned int SetHitlistSize(unsigned int hitlistSize);
100 
101     //  Must be > 0, otherwise change is not made.
102     Int8 SetDbLength(Int8 dbLength);
103 
104     //  Must be > 0, otherwise change is not made.
105     Int8 SetEffSearchSpace(Int8 effSearchSpace);
106 
107     //  Must be > 0, otherwise change is not made.
108     ECompoAdjustModes SetCompoAdjustMode(ECompoAdjustModes caMode);
109 
110     //  Sanity checks that eValueThold is non-negative.
111     double SetEValueThreshold(double eValueThold);
112 
113     //  Set matrix name; sanity checks that matrixName is one of those defined in cuScoringMatrix.hpp
114     string SetMatrixName(string matrixName);
115 
116     //  Expose the options handle object so any parameter can be manipulated.
GetOptionsHandle()117     CRef< CBlastAdvancedProteinOptionsHandle >& GetOptionsHandle() { return m_options; }
118 
119     //  Do all parameter configurations before calling this method.
120     //  E-value threshold is 10.0 unless user has previously called 'SetEValueThreshold'.
121     //  Uses Object Manager enabled Blast interface.
122     bool DoBlast2Seqs();
123 
124     //  Do all parameter configurations before calling this method.
125     //  E-value threshold is 10.0 unless user has previously called 'SetEValueThreshold'.
126     //  Uses Object Manager free Blast interface.
127 //    bool DoBlast2Seqs_OMFree();
128 
129     //  If there are no hits, the returned CRef will be invalid.  Test the CRef before using.
130     CRef<CSeq_align> getBestB2SAlignment(double* score = NULL, double* eval = NULL, double* percIdent = NULL) const;
131 
getAllHits() const132     const vector<CRef<CSeq_align> >& getAllHits() const {return m_alignments;}
getNumHits() const133     unsigned int getNumHits() const {return m_alignments.size();}
134 
135     //  All hitIndex values are zero-based!
136     bool getPairwiseBlastAlignment(unsigned int hitIndex, CRef< CSeq_align >& seqAlign) const;
137     double getPairwiseScore(unsigned int hitIndex) const;
138     double getPairwiseEValue(unsigned int hitIndex) const;
139     double getPairwisePercIdent(unsigned int hitIndex) const;  // %id vs. m_seq1!!!
140 
141 private:
142 //	long m_dbSize;
143 //	int m_dbSeqNum;
144 
145     struct SB2SSeq {
146         bool useFull;
147         unsigned int from;
148         unsigned int to;
149         CRef<CBioseq> bs;
150     };
151 
152     SB2SSeq m_seq1;
153     SB2SSeq m_seq2;
154 
155 	string m_scoringMatrix;
156     unsigned int m_hitlistSize;
157     Int8 m_dbLength;
158     double m_eValueThold;
159     double m_percIdThold;    //  only used if explicitly set by user.
160     Int8 m_effSearchSpace;   //  only used if explicitly set by user.
161     ECompoAdjustModes m_caMode;
162 	vector< CRef<CSeq_align> > m_alignments;
163 
164 	vector< double > m_scores;
165 	vector< double > m_evals;
166 	vector< double > m_percIdents;
167 
168     // Stores all options needed for the query.
169 	CRef<CBlastAdvancedProteinOptionsHandle> m_options;
170 
171     //  Create the options object and set defaults.
172     void InitializeToDefaults();
173 
174 	void SetSeq(CRef<CBioseq>& seq, bool isSeq1, unsigned int from, unsigned int to);
175 
176     //  False if there was a problem (e.g., SB2SSeq couldn't provide a Seq-id).
177     bool FillOutSeqLoc(const SB2SSeq& s, CSeq_loc& seqLoc);
178 
179 	void processBlastHits(ncbi::blast::CSearchResults&);
180 //    void processBlastHits_OMFree(ncbi::blast::CSearchResults&);
181 };
182 
183 END_SCOPE(cd_utils)
184 END_NCBI_SCOPE
185 
186 #endif // CU_SIMPLEB2SWRAPPER_HPP
187