1 
2 /* ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Christiam Camacho
27  *
28  */
29 
30 /// @file psiblast_aux_priv.cpp
31 /// Definitions of auxiliary functions/classes for PSI-BLAST
32 
33 #include <ncbi_pch.hpp>
34 #include "psiblast_aux_priv.hpp"
35 #include <algo/blast/core/blast_stat.h>
36 #include <algo/blast/core/blast_options.h>
37 #include <algo/blast/core/blast_encoding.h>
38 
39 #include <algo/blast/api/pssm_engine.hpp>
40 #include <algo/blast/api/psi_pssm_input.hpp>
41 #include <algo/blast/api/blast_exception.hpp>
42 #include <algo/blast/api/blast_options_handle.hpp>
43 #include <algo/blast/api/objmgrfree_query_data.hpp>
44 #include "blast_aux_priv.hpp"
45 #include "../core/blast_psi_priv.h"
46 
47 // Utility headers
48 #include <util/format_guess.hpp>
49 #include <util/math/matrix.hpp>
50 
51 // Object includes
52 #include <objects/seqset/Seq_entry.hpp>
53 #include <objects/seqloc/Seq_id.hpp>
54 #include <objects/scoremat/Pssm.hpp>
55 #include <objects/scoremat/PssmFinalData.hpp>
56 #include <objects/scoremat/PssmParameters.hpp>
57 #include <objects/scoremat/FormatRpsDbParameters.hpp>
58 #include <objects/scoremat/PssmIntermediateData.hpp>
59 #include <objects/scoremat/PssmWithParameters.hpp>
60 #include <objects/seqalign/Seq_align.hpp>
61 #include <objects/seqalign/Seq_align_set.hpp>
62 #include <objects/general/Object_id.hpp>
63 #include <objects/seqalign/Score.hpp>
64 #include <objects/seqalign/Dense_seg.hpp>
65 #include <sstream>
66 
67 /** @addtogroup AlgoBlast
68  *
69  * @{
70  */
71 
72 BEGIN_NCBI_SCOPE
73 USING_SCOPE(objects);
BEGIN_SCOPE(blast)74 BEGIN_SCOPE(blast)
75 
76 void PsiBlastSetupScoreBlock(BlastScoreBlk* score_blk,
77                              CConstRef<objects::CPssmWithParameters> pssm,
78                              TSearchMessages& messages,
79                              CConstRef<CBlastOptions> options)
80 {
81     _ASSERT(score_blk);
82     _ASSERT(pssm.NotEmpty());
83 
84     if ( !score_blk->protein_alphabet ) {
85         NCBI_THROW(CBlastException, eInvalidArgument,
86                    "BlastScoreBlk is not configured for a protein alphabet");
87     }
88 
89     // Assign the ungapped Karlin-Altschul block
90     if (pssm->GetPssm().GetLambdaUngapped() != CPssm::kInvalidStat) {
91         score_blk->kbp_psi[0]->Lambda = pssm->GetPssm().GetLambdaUngapped();
92     } else if (score_blk->kbp_std[0]->Lambda > 0.0) {
93         score_blk->kbp_psi[0]->Lambda = score_blk->kbp_std[0]->Lambda;
94     }
95 
96     if (pssm->GetPssm().GetKappaUngapped() != CPssm::kInvalidStat) {
97         score_blk->kbp_psi[0]->K = pssm->GetPssm().GetKappaUngapped();
98     } else if (score_blk->kbp_std[0]->K > 0.0) {
99         score_blk->kbp_psi[0]->K = score_blk->kbp_std[0]->K;
100     }
101     score_blk->kbp_psi[0]->logK = log(score_blk->kbp_psi[0]->K);
102 
103     if (pssm->GetPssm().GetHUngapped() != CPssm::kInvalidStat) {
104         score_blk->kbp_psi[0]->H = pssm->GetPssm().GetHUngapped();
105     } else if (score_blk->kbp_std[0]->K > 0.0) {
106         score_blk->kbp_psi[0]->H = score_blk->kbp_std[0]->H;
107     }
108 
109     // Assign the gapped Karlin-Altschul block
110     if (pssm->GetPssm().GetLambda() != CPssm::kInvalidStat) {
111         score_blk->kbp_gap_psi[0]->Lambda = pssm->GetPssm().GetLambda();
112     } else if (score_blk->kbp_gap_std[0]->Lambda > 0.0) {
113         score_blk->kbp_gap_psi[0]->Lambda = score_blk->kbp_gap_std[0]->Lambda;
114     }
115 
116     if (pssm->GetPssm().GetKappa() != CPssm::kInvalidStat) {
117         score_blk->kbp_gap_psi[0]->K = pssm->GetPssm().GetKappa();
118     } else if (score_blk->kbp_gap_std[0]->K > 0.0) {
119         score_blk->kbp_gap_psi[0]->K = score_blk->kbp_gap_std[0]->K;
120     }
121     score_blk->kbp_gap_psi[0]->logK = log(score_blk->kbp_gap_psi[0]->K);
122 
123     if (pssm->GetPssm().GetH() != CPssm::kInvalidStat) {
124         score_blk->kbp_gap_psi[0]->H = pssm->GetPssm().GetH();
125     } else if (score_blk->kbp_gap_std[0]->H > 0.0) {
126         score_blk->kbp_gap_psi[0]->H = score_blk->kbp_gap_std[0]->H;
127     }
128 
129     // Assign the matrix scores/frequency ratios
130     const size_t kQueryLength = pssm->GetPssm().GetNumColumns();
131     score_blk->psi_matrix = SPsiBlastScoreMatrixNew(kQueryLength);
132 
133     // Get the scores
134     bool missing_scores = false;
135     try {
136         unique_ptr< CNcbiMatrix<int> > scores
137             (CScorematPssmConverter::GetScores(*pssm));
138         _ASSERT(score_blk->psi_matrix->pssm->ncols == scores->GetCols());
139         _ASSERT(score_blk->psi_matrix->pssm->nrows == scores->GetRows());
140 
141         for (TSeqPos c = 0; c < scores->GetCols(); c++) {
142             for (TSeqPos r = 0; r < scores->GetRows(); r++) {
143                 score_blk->psi_matrix->pssm->data[c][r] = (*scores)(r, c);
144             }
145         }
146     } catch (const std::runtime_error&) {
147         missing_scores = true;
148     }
149 
150     // Get the frequency ratios
151     bool missing_freq_ratios = false;
152     // are all of the frequency ratios zeros? if so, issue a warning
153     bool freq_ratios_all_zeros = true;
154 
155     try {
156         unique_ptr< CNcbiMatrix<double> > freq_ratios
157             (CScorematPssmConverter::GetFreqRatios(*pssm));
158         _ASSERT(score_blk->psi_matrix->pssm->ncols ==
159                freq_ratios->GetCols());
160         _ASSERT(score_blk->psi_matrix->pssm->nrows ==
161                freq_ratios->GetRows());
162 
163         for (TSeqPos c = 0; c < freq_ratios->GetCols(); c++) {
164             for (TSeqPos r = 0; r < freq_ratios->GetRows(); r++) {
165                 score_blk->psi_matrix->freq_ratios[c][r] =
166                     (*freq_ratios)(r, c);
167                 if ((*freq_ratios)(r,c) > kEpsilon) {
168                     freq_ratios_all_zeros = false;
169                 }
170             }
171         }
172     } catch (const std::runtime_error&) {
173         missing_freq_ratios = true;
174     }
175 
176     if (missing_scores && missing_freq_ratios) {
177         NCBI_THROW(CBlastException, eInvalidArgument,
178                    "Missing scores and frequency ratios in PSSM");
179     }
180 
181     _ASSERT(options->GetCompositionBasedStats() < eNumCompoAdjustModes);
182     // the message below is meaningless for deltablast
183     if (options->GetProgram() != eDeltaBlast &&
184         (options->GetCompositionBasedStats() != eNoCompositionBasedStats) &&
185         freq_ratios_all_zeros) {
186         ostringstream os;
187         os << "Frequency ratios for PSSM are all zeros, frequency ratios for ";
188         os << options->GetMatrixName() << " will be used during traceback ";
189         os << "in composition based statistics";
190         CRef<CSearchMessage> sm(new CSearchMessage(eBlastSevWarning, 0,
191                                                    os.str()));
192         _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
193         messages.front().push_back(sm);
194     }
195 
196     if (options->GetCompositionBasedStats() > eCompositionBasedStats) {
197         // ugly, but necessary
198         const_cast<CBlastOptions*>(&*options)
199             ->SetCompositionBasedStats(eCompositionBasedStats);
200         ostringstream os;
201         os << "Composition-based score adjustment conditioned on "
202            << "sequence properties and unconditional composition-based score "
203            << "adjustment is not supported with PSSMs, resetting to default "
204            << "value of standard composition-based statistics";
205         CRef<CSearchMessage> sm(new CSearchMessage(eBlastSevWarning, 0,
206                                                    os.str()));
207         _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
208         messages.front().push_back(sm);
209     }
210 }
211 
212 /// Convert a list of values into a CNcbiMatrix
213 /// @param source source of data [in]
214 /// @param dest destination of data [out]
215 /// @param by_row is the matrix data stored by row? [in]
216 /// @param num_rows number of rows [in]
217 /// @param num_cols number of columns [in]
218 template <class T>
Convert2Matrix(const list<T> & source,CNcbiMatrix<T> & dest,bool by_row,SIZE_TYPE num_rows,SIZE_TYPE num_columns)219 void Convert2Matrix(const list<T>& source, CNcbiMatrix<T>& dest,
220                     bool by_row, SIZE_TYPE num_rows, SIZE_TYPE num_columns)
221 {
222     typename list<T>::const_iterator itr = source.begin();
223     if (by_row == true) {
224         for (SIZE_TYPE r = 0; r < num_rows; r++) {
225             for (SIZE_TYPE c = 0; c < num_columns; c++) {
226                 dest(r, c) = *itr++;
227             }
228         }
229     } else {
230         for (SIZE_TYPE c = 0; c < num_columns; c++) {
231             for (SIZE_TYPE r = 0; r < num_rows; r++) {
232                 dest(r, c) = *itr++;
233             }
234         }
235     }
236     _ASSERT(itr == source.end());
237 }
238 
239 CNcbiMatrix<int>*
GetScores(const objects::CPssmWithParameters & pssm_asn)240 CScorematPssmConverter::GetScores(const objects::CPssmWithParameters& pssm_asn)
241 {
242     if ( !pssm_asn.GetPssm().CanGetFinalData() ||
243          !pssm_asn.GetPssm().GetFinalData().CanGetScores() ||
244          pssm_asn.GetPssm().GetFinalData().GetScores().empty() ) {
245         throw runtime_error("Cannot obtain scores from ASN.1 PSSM");
246     }
247 
248     const CPssm& pssm = pssm_asn.GetPssm();
249     _ASSERT((size_t)pssm.GetFinalData().GetScores().size() ==
250            (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
251 
252     unique_ptr< CNcbiMatrix<int> > retval
253         (new CNcbiMatrix<int>(BLASTAA_SIZE,
254                               pssm.GetNumColumns(),
255                               BLAST_SCORE_MIN));
256 
257     Convert2Matrix(pssm.GetFinalData().GetScores(),
258                    *retval, pssm.GetByRow(), pssm.GetNumRows(),
259                    pssm.GetNumColumns());
260     return retval.release();
261 }
262 
263 CNcbiMatrix<double>*
GetFreqRatios(const objects::CPssmWithParameters & pssm_asn)264 CScorematPssmConverter::GetFreqRatios(const objects::CPssmWithParameters&
265                                       pssm_asn)
266 {
267     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
268          !pssm_asn.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
269          pssm_asn.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
270         throw runtime_error("Cannot obtain frequency ratios from ASN.1 PSSM");
271     }
272 
273     const CPssm& pssm = pssm_asn.GetPssm();
274     _ASSERT((size_t)pssm.GetIntermediateData().GetFreqRatios().size() ==
275            (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
276 
277     unique_ptr< CNcbiMatrix<double> > retval
278         (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
279 
280     Convert2Matrix(pssm.GetIntermediateData().GetFreqRatios(),
281                    *retval, pssm.GetByRow(), pssm.GetNumRows(),
282                    pssm.GetNumColumns());
283     return retval.release();
284 }
285 
286 CNcbiMatrix<int>*
GetResidueFrequencies(const objects::CPssmWithParameters & pssm_asn)287 CScorematPssmConverter::GetResidueFrequencies
288     (const objects::CPssmWithParameters& pssm_asn)
289 {
290     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
291          !pssm_asn.GetPssm().GetIntermediateData().CanGetResFreqsPerPos() ||
292          pssm_asn.GetPssm().GetIntermediateData().GetResFreqsPerPos().empty() )
293     {
294         return NULL;
295     }
296 
297     const CPssm& pssm = pssm_asn.GetPssm();
298     _ASSERT((size_t)pssm.GetIntermediateData().GetResFreqsPerPos().size() ==
299            (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
300 
301     unique_ptr< CNcbiMatrix<int> > retval
302         (new CNcbiMatrix<int>(BLASTAA_SIZE, pssm.GetNumColumns(), 0));
303 
304     Convert2Matrix(pssm.GetIntermediateData().GetResFreqsPerPos(),
305                    *retval, pssm.GetByRow(), pssm.GetNumRows(),
306                    pssm.GetNumColumns());
307     return retval.release();
308 }
309 
310 CNcbiMatrix<double>*
GetWeightedResidueFrequencies(const objects::CPssmWithParameters & pssm_asn)311 CScorematPssmConverter::GetWeightedResidueFrequencies
312     (const objects::CPssmWithParameters& pssm_asn)
313 {
314     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
315          !pssm_asn.GetPssm().GetIntermediateData().
316             CanGetWeightedResFreqsPerPos() ||
317          pssm_asn.GetPssm().GetIntermediateData().
318             GetWeightedResFreqsPerPos().empty() ) {
319         return NULL;
320     }
321 
322     const CPssm& pssm = pssm_asn.GetPssm();
323     _ASSERT((size_t)pssm.GetIntermediateData().
324                 GetWeightedResFreqsPerPos().size() ==
325            (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
326 
327     unique_ptr< CNcbiMatrix<double> > retval
328         (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
329 
330     Convert2Matrix(pssm.GetIntermediateData().GetWeightedResFreqsPerPos(),
331                    *retval, pssm.GetByRow(), pssm.GetNumRows(),
332                    pssm.GetNumColumns());
333     return retval.release();
334 }
335 
336 void
GetInformationContent(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)337 CScorematPssmConverter::GetInformationContent
338     (const objects::CPssmWithParameters& pssm_asn,
339      vector<double>& retval)
340 {
341     retval.clear();
342     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
343          !pssm_asn.GetPssm().GetIntermediateData().CanGetInformationContent() ||
344          pssm_asn.GetPssm().
345             GetIntermediateData().GetInformationContent().empty() ) {
346         return;
347     }
348     const CPssm& pssm = pssm_asn.GetPssm();
349     copy(pssm.GetIntermediateData().GetInformationContent().begin(),
350          pssm.GetIntermediateData().GetInformationContent().end(),
351          back_inserter(retval));
352 }
353 
354 void
GetGaplessColumnWeights(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)355 CScorematPssmConverter::GetGaplessColumnWeights
356     (const objects::CPssmWithParameters& pssm_asn,
357      vector<double>& retval)
358 {
359     retval.clear();
360     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
361          !pssm_asn.GetPssm().
362             GetIntermediateData().CanGetGaplessColumnWeights() ||
363          pssm_asn.GetPssm().
364             GetIntermediateData().GetGaplessColumnWeights().empty() ) {
365         return;
366     }
367     const CPssm& pssm = pssm_asn.GetPssm();
368     copy(pssm.GetIntermediateData().GetGaplessColumnWeights().begin(),
369          pssm.GetIntermediateData().GetGaplessColumnWeights().end(),
370          back_inserter(retval));
371 }
372 
373 void
GetSigma(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)374 CScorematPssmConverter::GetSigma(const objects::CPssmWithParameters& pssm_asn,
375                                  vector<double>& retval)
376 {
377     retval.clear();
378     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
379          !pssm_asn.GetPssm().GetIntermediateData().CanGetSigma() ||
380          pssm_asn.GetPssm().GetIntermediateData().GetSigma().empty() ) {
381         return;
382     }
383     const CPssm& pssm = pssm_asn.GetPssm();
384     copy(pssm.GetIntermediateData().GetSigma().begin(),
385          pssm.GetIntermediateData().GetSigma().end(),
386          back_inserter(retval));
387 }
388 
389 void
GetIntervalSizes(const objects::CPssmWithParameters & pssm_asn,vector<int> & retval)390 CScorematPssmConverter::GetIntervalSizes
391     (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
392 {
393     retval.clear();
394     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
395          !pssm_asn.GetPssm().
396             GetIntermediateData().CanGetIntervalSizes() ||
397          pssm_asn.GetPssm().
398             GetIntermediateData().GetIntervalSizes().empty() ) {
399         return;
400     }
401     const CPssm& pssm = pssm_asn.GetPssm();
402     copy(pssm.GetIntermediateData().GetIntervalSizes().begin(),
403          pssm.GetIntermediateData().GetIntervalSizes().end(),
404          back_inserter(retval));
405 }
406 
407 void
GetNumMatchingSeqs(const objects::CPssmWithParameters & pssm_asn,vector<int> & retval)408 CScorematPssmConverter::GetNumMatchingSeqs
409     (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
410 {
411     retval.clear();
412     if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
413          !pssm_asn.GetPssm().
414             GetIntermediateData().CanGetNumMatchingSeqs() ||
415          pssm_asn.GetPssm().
416             GetIntermediateData().GetNumMatchingSeqs().empty() ) {
417         return;
418     }
419     const CPssm& pssm = pssm_asn.GetPssm();
420     copy(pssm.GetIntermediateData().GetNumMatchingSeqs().begin(),
421          pssm.GetIntermediateData().GetNumMatchingSeqs().end(),
422          back_inserter(retval));
423 }
424 
425 void
PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters & pssm,int gap_open,int gap_extend)426 PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters& pssm,
427                          int gap_open,
428                          int gap_extend)
429 {
430     _ASSERT(pssm.GetParams().GetRpsdbparams().IsSetMatrixName());
431     pssm.SetParams().SetRpsdbparams().SetGapOpen(gap_open);
432     pssm.SetParams().SetRpsdbparams().SetGapExtend(gap_extend);
433 }
434 
435 /** After creating the PSSM from frequency ratios, adjust the frequency ratios
436  * matrix to match the dimensions of the score matrix
437  * @param pssm matrix to adjust [in|out]
438  */
439 static void
s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters & pssm)440 s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters&
441                                                 pssm)
442 {
443     _ASSERT(pssm.GetPssm().GetNumRows() < BLASTAA_SIZE);
444     if (pssm.GetPssm().CanGetFinalData()) {
445         _ASSERT(pssm.GetPssm().GetFinalData().GetScores().size() ==
446                 (size_t)BLASTAA_SIZE*pssm.GetPssm().GetNumColumns());
447     }
448 
449     const size_t diff = (size_t)BLASTAA_SIZE - pssm.GetPssm().GetNumRows();
450     CPssmIntermediateData::TFreqRatios& freq_ratios =
451         pssm.SetPssm().SetIntermediateData().SetFreqRatios();
452 
453     if (pssm.GetPssm().GetByRow() == true) {
454         freq_ratios.resize(pssm.GetPssm().GetNumColumns() * BLASTAA_SIZE, 0.0);
455     } else {
456         CPssmIntermediateData::TFreqRatios::iterator itr = freq_ratios.begin();
457         for (int c = 0; c < pssm.GetPssm().GetNumColumns(); c++) {
458             advance(itr, pssm.GetPssm().GetNumRows());
459             freq_ratios.insert(itr, diff, 0.0);
460         }
461     }
462 
463     pssm.SetPssm().SetNumRows() = BLASTAA_SIZE;
464 }
465 
PsiBlastComputePssmScores(CRef<objects::CPssmWithParameters> pssm,const CBlastOptions & opts)466 void PsiBlastComputePssmScores(CRef<objects::CPssmWithParameters> pssm,
467                                const CBlastOptions& opts)
468 {
469     CConstRef<CBioseq> query(&pssm->GetQuery().GetSeq());
470     CRef<IQueryFactory> seq_fetcher(new CObjMgrFree_QueryFactory(query)); /* NCBI_FAKE_WARNING */
471 
472     CRef<ILocalQueryData> query_data(seq_fetcher->MakeLocalQueryData(&opts));
473     BLAST_SequenceBlk* seqblk = query_data->GetSequenceBlk();
474     _ASSERT(query_data->GetSeqLength(0) == (size_t)seqblk->length);
475     _ASSERT(query_data->GetSeqLength(0) ==
476             (size_t)pssm->GetPssm().GetNumColumns());
477     unique_ptr< CNcbiMatrix<double> > freq_ratios
478         (CScorematPssmConverter::GetFreqRatios(*pssm));
479 
480     CPsiBlastInputFreqRatios pssm_engine_input(seqblk->sequence,
481                                                seqblk->length,
482                                                *freq_ratios,
483                                                opts.GetMatrixName());
484     CPssmEngine pssm_engine(&pssm_engine_input);
485     CRef<CPssmWithParameters> pssm_with_scores(pssm_engine.Run());
486 
487     if (pssm->GetPssm().GetNumRows() !=
488         pssm_with_scores->GetPssm().GetNumRows()) {
489         _ASSERT(pssm_with_scores->GetPssm().GetNumRows() == BLASTAA_SIZE);
490         s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(*pssm);
491     }
492     pssm->SetPssm().SetFinalData().SetScores() =
493         pssm_with_scores->GetPssm().GetFinalData().GetScores();
494     pssm->SetPssm().SetFinalData().SetLambda() =
495         pssm_with_scores->GetPssm().GetFinalData().GetLambda();
496     pssm->SetPssm().SetFinalData().SetKappa() =
497         pssm_with_scores->GetPssm().GetFinalData().GetKappa();
498     pssm->SetPssm().SetFinalData().SetH() =
499         pssm_with_scores->GetPssm().GetFinalData().GetH();
500 
501     PsiBlastAddAncillaryPssmData(*pssm,
502                                   opts.GetGapOpeningCost(),
503                                   opts.GetGapExtensionCost());
504 }
505 
506 /// Returns the evalue from this score object
507 /// @param score ASN.1 score object [in]
s_GetEvalue(const CScore & score)508 static double s_GetEvalue(const CScore& score)
509 {
510     string score_type = score.GetId().GetStr();
511     if (score.GetValue().IsReal() &&
512        (score_type == "e_value" || score_type == "sum_e")) {
513         return score.GetValue().GetReal();
514     }
515     return numeric_limits<double>::max();
516 }
517 
518 /// Returns the bit_score from this score object
519 /// @param score ASN.1 score object [in]
s_GetBitScore(const CScore & score)520 static double s_GetBitScore(const CScore& score)
521 {
522     string score_type = score.GetId().GetStr();
523     if (score.GetValue().IsReal() && score_type == "bit_score") {
524         return score.GetValue().GetReal();
525     }
526     return BLAST_EXPECT_VALUE;
527 }
528 
GetLowestEvalue(const objects::CDense_seg::TScores & scores,double * bit_score)529 double GetLowestEvalue(const objects::CDense_seg::TScores& scores,
530                        double* bit_score /* = NULL */)
531 {
532     double retval = BLAST_EXPECT_VALUE;
533     double tmp;
534     if (bit_score) {
535         *bit_score = retval;
536     }
537 
538     ITERATE(CDense_seg::TScores, i, scores) {
539         if ( (tmp = s_GetEvalue(**i)) < retval) {
540             retval = tmp;
541         }
542         if (bit_score && ((tmp = s_GetBitScore(**i)) > *bit_score)) {
543             *bit_score = tmp;
544         }
545     }
546     return retval;
547 }
548 
549 void
operator ()(const objects::CSeq_align_set & alignments,double evalue_inclusion_threshold,THitIdentifiers & output)550 CPsiBlastAlignmentProcessor::operator()
551     (const objects::CSeq_align_set& alignments,
552      double evalue_inclusion_threshold,
553      THitIdentifiers& output)
554 {
555     output.clear();
556 
557     ITERATE(CSeq_align_set::Tdata, hsp, alignments.Get()) {
558         // Look for HSP with score less than inclusion_ethresh
559         double e = GetLowestEvalue((*hsp)->GetScore());
560         if (e < evalue_inclusion_threshold) {
561             CSeq_id_Handle sid =
562                 CSeq_id_Handle::GetHandle((*hsp)->GetSeq_id(1));
563             output.insert(sid);
564         }
565     }
566 }
567 
568 void
Pssm(const objects::CPssmWithParameters & pssm,bool require_scores)569 CPsiBlastValidate::Pssm(const objects::CPssmWithParameters& pssm,
570                         bool require_scores)
571 {
572     if ( !pssm.CanGetPssm() ) {
573         NCBI_THROW(CBlastException, eInvalidArgument,
574                    "Missing PSSM data");
575     }
576 
577     bool missing_scores(false);
578     if ( !pssm.GetPssm().CanGetFinalData() ||
579          !pssm.GetPssm().GetFinalData().CanGetScores() ||
580          pssm.GetPssm().GetFinalData().GetScores().empty() ) {
581         missing_scores = true;
582     }
583 
584     bool missing_freq_ratios(false);
585     if ( !pssm.GetPssm().CanGetIntermediateData() ||
586          !pssm.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
587          pssm.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
588         missing_freq_ratios = true;
589     }
590 
591     if (missing_freq_ratios && missing_scores) {
592         NCBI_THROW(CBlastException, eInvalidArgument,
593                    "PSSM data must contain either scores or frequency ratios");
594     }
595     if (missing_scores && require_scores) {
596         NCBI_THROW(CBlastException, eInvalidArgument,
597                "PSSM data must contain scores (did you run the PSSM engine?)");
598     }
599 
600     // Only unscaled PSSMs are supported
601     if (!missing_scores &&
602         pssm.GetPssm().GetFinalData().CanGetScalingFactor() &&
603         pssm.GetPssm().GetFinalData().GetScalingFactor() != 1) {
604         string msg("PSSM has a scaling factor of ");
605         msg += NStr::IntToString(pssm.GetPssm()
606                                  .GetFinalData()
607                                  .GetScalingFactor());
608         msg += ". PSI-BLAST does not accept scaled PSSMs";
609         NCBI_THROW(CBlastException, eInvalidArgument, msg);
610     }
611 
612     if ( !pssm.HasQuery() ) {
613         NCBI_THROW(CBlastException, eInvalidArgument,
614                    "Missing query sequence in PSSM");
615     }
616     if ( !pssm.GetQuery().IsSeq() ) {
617         NCBI_THROW(CBlastException, eInvalidArgument,
618                    "Query sequence in ASN.1 PSSM is not a single Bioseq");
619     }
620 
621     if ( !pssm.GetPssm().GetIsProtein() ) {
622         NCBI_THROW(CBlastException, eInvalidArgument,
623                    "PSSM does not represent protein scoring matrix");
624     }
625 }
626 
627 void
QueryFactory(CRef<IQueryFactory> query_factory,const CBlastOptionsHandle & opts_handle,EQueryFactoryType qf_type)628 CPsiBlastValidate::QueryFactory(CRef<IQueryFactory> query_factory,
629                                 const CBlastOptionsHandle& opts_handle,
630                                 EQueryFactoryType qf_type)
631 {
632     CRef<ILocalQueryData> query_data =
633         query_factory->MakeLocalQueryData(&opts_handle.GetOptions());
634 
635     // Compose the exception error message
636     string excpt_msg("PSI-BLAST only accepts ");
637     if (qf_type == eQFT_Query) {
638         excpt_msg += "one protein sequence as query";
639     } else if (qf_type == eQFT_Subject) {
640         excpt_msg += "protein sequences as subjects";
641     } else {
642         abort();
643     }
644 
645     if (qf_type == eQFT_Query) {
646         if (query_data->GetNumQueries() != 1) {
647             NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
648         }
649     }
650 
651     BLAST_SequenceBlk* sblk = NULL;
652     try { sblk = query_data->GetSequenceBlk(); }
653     catch (const CBlastException& e) {
654         if (e.GetMsg().find("Incompatible sequence codings") != NPOS) {
655             NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
656         }
657     }
658     _ASSERT(sblk);
659     _ASSERT(sblk->length > 0);
660 
661     CFormatGuess::ESequenceType sequence_type =
662         CFormatGuess::SequenceType((const char*)sblk->sequence_start,
663                                    static_cast<unsigned>(sblk->length));
664     if (sequence_type == CFormatGuess::eNucleotide) {
665         excpt_msg.assign("PSI-BLAST cannot accept nucleotide ");
666         excpt_msg += (qf_type == eQFT_Query ? "queries" : "subjects");
667         NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
668     }
669 }
670 
671 END_SCOPE(blast)
672 END_NCBI_SCOPE
673 
674 /* @} */
675