1
2 /* ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /// @file psiblast_aux_priv.cpp
31 /// Definitions of auxiliary functions/classes for PSI-BLAST
32
33 #include <ncbi_pch.hpp>
34 #include "psiblast_aux_priv.hpp"
35 #include <algo/blast/core/blast_stat.h>
36 #include <algo/blast/core/blast_options.h>
37 #include <algo/blast/core/blast_encoding.h>
38
39 #include <algo/blast/api/pssm_engine.hpp>
40 #include <algo/blast/api/psi_pssm_input.hpp>
41 #include <algo/blast/api/blast_exception.hpp>
42 #include <algo/blast/api/blast_options_handle.hpp>
43 #include <algo/blast/api/objmgrfree_query_data.hpp>
44 #include "blast_aux_priv.hpp"
45 #include "../core/blast_psi_priv.h"
46
47 // Utility headers
48 #include <util/format_guess.hpp>
49 #include <util/math/matrix.hpp>
50
51 // Object includes
52 #include <objects/seqset/Seq_entry.hpp>
53 #include <objects/seqloc/Seq_id.hpp>
54 #include <objects/scoremat/Pssm.hpp>
55 #include <objects/scoremat/PssmFinalData.hpp>
56 #include <objects/scoremat/PssmParameters.hpp>
57 #include <objects/scoremat/FormatRpsDbParameters.hpp>
58 #include <objects/scoremat/PssmIntermediateData.hpp>
59 #include <objects/scoremat/PssmWithParameters.hpp>
60 #include <objects/seqalign/Seq_align.hpp>
61 #include <objects/seqalign/Seq_align_set.hpp>
62 #include <objects/general/Object_id.hpp>
63 #include <objects/seqalign/Score.hpp>
64 #include <objects/seqalign/Dense_seg.hpp>
65 #include <sstream>
66
67 /** @addtogroup AlgoBlast
68 *
69 * @{
70 */
71
72 BEGIN_NCBI_SCOPE
73 USING_SCOPE(objects);
BEGIN_SCOPE(blast)74 BEGIN_SCOPE(blast)
75
76 void PsiBlastSetupScoreBlock(BlastScoreBlk* score_blk,
77 CConstRef<objects::CPssmWithParameters> pssm,
78 TSearchMessages& messages,
79 CConstRef<CBlastOptions> options)
80 {
81 _ASSERT(score_blk);
82 _ASSERT(pssm.NotEmpty());
83
84 if ( !score_blk->protein_alphabet ) {
85 NCBI_THROW(CBlastException, eInvalidArgument,
86 "BlastScoreBlk is not configured for a protein alphabet");
87 }
88
89 // Assign the ungapped Karlin-Altschul block
90 if (pssm->GetPssm().GetLambdaUngapped() != CPssm::kInvalidStat) {
91 score_blk->kbp_psi[0]->Lambda = pssm->GetPssm().GetLambdaUngapped();
92 } else if (score_blk->kbp_std[0]->Lambda > 0.0) {
93 score_blk->kbp_psi[0]->Lambda = score_blk->kbp_std[0]->Lambda;
94 }
95
96 if (pssm->GetPssm().GetKappaUngapped() != CPssm::kInvalidStat) {
97 score_blk->kbp_psi[0]->K = pssm->GetPssm().GetKappaUngapped();
98 } else if (score_blk->kbp_std[0]->K > 0.0) {
99 score_blk->kbp_psi[0]->K = score_blk->kbp_std[0]->K;
100 }
101 score_blk->kbp_psi[0]->logK = log(score_blk->kbp_psi[0]->K);
102
103 if (pssm->GetPssm().GetHUngapped() != CPssm::kInvalidStat) {
104 score_blk->kbp_psi[0]->H = pssm->GetPssm().GetHUngapped();
105 } else if (score_blk->kbp_std[0]->K > 0.0) {
106 score_blk->kbp_psi[0]->H = score_blk->kbp_std[0]->H;
107 }
108
109 // Assign the gapped Karlin-Altschul block
110 if (pssm->GetPssm().GetLambda() != CPssm::kInvalidStat) {
111 score_blk->kbp_gap_psi[0]->Lambda = pssm->GetPssm().GetLambda();
112 } else if (score_blk->kbp_gap_std[0]->Lambda > 0.0) {
113 score_blk->kbp_gap_psi[0]->Lambda = score_blk->kbp_gap_std[0]->Lambda;
114 }
115
116 if (pssm->GetPssm().GetKappa() != CPssm::kInvalidStat) {
117 score_blk->kbp_gap_psi[0]->K = pssm->GetPssm().GetKappa();
118 } else if (score_blk->kbp_gap_std[0]->K > 0.0) {
119 score_blk->kbp_gap_psi[0]->K = score_blk->kbp_gap_std[0]->K;
120 }
121 score_blk->kbp_gap_psi[0]->logK = log(score_blk->kbp_gap_psi[0]->K);
122
123 if (pssm->GetPssm().GetH() != CPssm::kInvalidStat) {
124 score_blk->kbp_gap_psi[0]->H = pssm->GetPssm().GetH();
125 } else if (score_blk->kbp_gap_std[0]->H > 0.0) {
126 score_blk->kbp_gap_psi[0]->H = score_blk->kbp_gap_std[0]->H;
127 }
128
129 // Assign the matrix scores/frequency ratios
130 const size_t kQueryLength = pssm->GetPssm().GetNumColumns();
131 score_blk->psi_matrix = SPsiBlastScoreMatrixNew(kQueryLength);
132
133 // Get the scores
134 bool missing_scores = false;
135 try {
136 unique_ptr< CNcbiMatrix<int> > scores
137 (CScorematPssmConverter::GetScores(*pssm));
138 _ASSERT(score_blk->psi_matrix->pssm->ncols == scores->GetCols());
139 _ASSERT(score_blk->psi_matrix->pssm->nrows == scores->GetRows());
140
141 for (TSeqPos c = 0; c < scores->GetCols(); c++) {
142 for (TSeqPos r = 0; r < scores->GetRows(); r++) {
143 score_blk->psi_matrix->pssm->data[c][r] = (*scores)(r, c);
144 }
145 }
146 } catch (const std::runtime_error&) {
147 missing_scores = true;
148 }
149
150 // Get the frequency ratios
151 bool missing_freq_ratios = false;
152 // are all of the frequency ratios zeros? if so, issue a warning
153 bool freq_ratios_all_zeros = true;
154
155 try {
156 unique_ptr< CNcbiMatrix<double> > freq_ratios
157 (CScorematPssmConverter::GetFreqRatios(*pssm));
158 _ASSERT(score_blk->psi_matrix->pssm->ncols ==
159 freq_ratios->GetCols());
160 _ASSERT(score_blk->psi_matrix->pssm->nrows ==
161 freq_ratios->GetRows());
162
163 for (TSeqPos c = 0; c < freq_ratios->GetCols(); c++) {
164 for (TSeqPos r = 0; r < freq_ratios->GetRows(); r++) {
165 score_blk->psi_matrix->freq_ratios[c][r] =
166 (*freq_ratios)(r, c);
167 if ((*freq_ratios)(r,c) > kEpsilon) {
168 freq_ratios_all_zeros = false;
169 }
170 }
171 }
172 } catch (const std::runtime_error&) {
173 missing_freq_ratios = true;
174 }
175
176 if (missing_scores && missing_freq_ratios) {
177 NCBI_THROW(CBlastException, eInvalidArgument,
178 "Missing scores and frequency ratios in PSSM");
179 }
180
181 _ASSERT(options->GetCompositionBasedStats() < eNumCompoAdjustModes);
182 // the message below is meaningless for deltablast
183 if (options->GetProgram() != eDeltaBlast &&
184 (options->GetCompositionBasedStats() != eNoCompositionBasedStats) &&
185 freq_ratios_all_zeros) {
186 ostringstream os;
187 os << "Frequency ratios for PSSM are all zeros, frequency ratios for ";
188 os << options->GetMatrixName() << " will be used during traceback ";
189 os << "in composition based statistics";
190 CRef<CSearchMessage> sm(new CSearchMessage(eBlastSevWarning, 0,
191 os.str()));
192 _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
193 messages.front().push_back(sm);
194 }
195
196 if (options->GetCompositionBasedStats() > eCompositionBasedStats) {
197 // ugly, but necessary
198 const_cast<CBlastOptions*>(&*options)
199 ->SetCompositionBasedStats(eCompositionBasedStats);
200 ostringstream os;
201 os << "Composition-based score adjustment conditioned on "
202 << "sequence properties and unconditional composition-based score "
203 << "adjustment is not supported with PSSMs, resetting to default "
204 << "value of standard composition-based statistics";
205 CRef<CSearchMessage> sm(new CSearchMessage(eBlastSevWarning, 0,
206 os.str()));
207 _ASSERT(messages.size() == 1); // PSI-BLAST only works with one query
208 messages.front().push_back(sm);
209 }
210 }
211
212 /// Convert a list of values into a CNcbiMatrix
213 /// @param source source of data [in]
214 /// @param dest destination of data [out]
215 /// @param by_row is the matrix data stored by row? [in]
216 /// @param num_rows number of rows [in]
217 /// @param num_cols number of columns [in]
218 template <class T>
Convert2Matrix(const list<T> & source,CNcbiMatrix<T> & dest,bool by_row,SIZE_TYPE num_rows,SIZE_TYPE num_columns)219 void Convert2Matrix(const list<T>& source, CNcbiMatrix<T>& dest,
220 bool by_row, SIZE_TYPE num_rows, SIZE_TYPE num_columns)
221 {
222 typename list<T>::const_iterator itr = source.begin();
223 if (by_row == true) {
224 for (SIZE_TYPE r = 0; r < num_rows; r++) {
225 for (SIZE_TYPE c = 0; c < num_columns; c++) {
226 dest(r, c) = *itr++;
227 }
228 }
229 } else {
230 for (SIZE_TYPE c = 0; c < num_columns; c++) {
231 for (SIZE_TYPE r = 0; r < num_rows; r++) {
232 dest(r, c) = *itr++;
233 }
234 }
235 }
236 _ASSERT(itr == source.end());
237 }
238
239 CNcbiMatrix<int>*
GetScores(const objects::CPssmWithParameters & pssm_asn)240 CScorematPssmConverter::GetScores(const objects::CPssmWithParameters& pssm_asn)
241 {
242 if ( !pssm_asn.GetPssm().CanGetFinalData() ||
243 !pssm_asn.GetPssm().GetFinalData().CanGetScores() ||
244 pssm_asn.GetPssm().GetFinalData().GetScores().empty() ) {
245 throw runtime_error("Cannot obtain scores from ASN.1 PSSM");
246 }
247
248 const CPssm& pssm = pssm_asn.GetPssm();
249 _ASSERT((size_t)pssm.GetFinalData().GetScores().size() ==
250 (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
251
252 unique_ptr< CNcbiMatrix<int> > retval
253 (new CNcbiMatrix<int>(BLASTAA_SIZE,
254 pssm.GetNumColumns(),
255 BLAST_SCORE_MIN));
256
257 Convert2Matrix(pssm.GetFinalData().GetScores(),
258 *retval, pssm.GetByRow(), pssm.GetNumRows(),
259 pssm.GetNumColumns());
260 return retval.release();
261 }
262
263 CNcbiMatrix<double>*
GetFreqRatios(const objects::CPssmWithParameters & pssm_asn)264 CScorematPssmConverter::GetFreqRatios(const objects::CPssmWithParameters&
265 pssm_asn)
266 {
267 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
268 !pssm_asn.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
269 pssm_asn.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
270 throw runtime_error("Cannot obtain frequency ratios from ASN.1 PSSM");
271 }
272
273 const CPssm& pssm = pssm_asn.GetPssm();
274 _ASSERT((size_t)pssm.GetIntermediateData().GetFreqRatios().size() ==
275 (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
276
277 unique_ptr< CNcbiMatrix<double> > retval
278 (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
279
280 Convert2Matrix(pssm.GetIntermediateData().GetFreqRatios(),
281 *retval, pssm.GetByRow(), pssm.GetNumRows(),
282 pssm.GetNumColumns());
283 return retval.release();
284 }
285
286 CNcbiMatrix<int>*
GetResidueFrequencies(const objects::CPssmWithParameters & pssm_asn)287 CScorematPssmConverter::GetResidueFrequencies
288 (const objects::CPssmWithParameters& pssm_asn)
289 {
290 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
291 !pssm_asn.GetPssm().GetIntermediateData().CanGetResFreqsPerPos() ||
292 pssm_asn.GetPssm().GetIntermediateData().GetResFreqsPerPos().empty() )
293 {
294 return NULL;
295 }
296
297 const CPssm& pssm = pssm_asn.GetPssm();
298 _ASSERT((size_t)pssm.GetIntermediateData().GetResFreqsPerPos().size() ==
299 (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
300
301 unique_ptr< CNcbiMatrix<int> > retval
302 (new CNcbiMatrix<int>(BLASTAA_SIZE, pssm.GetNumColumns(), 0));
303
304 Convert2Matrix(pssm.GetIntermediateData().GetResFreqsPerPos(),
305 *retval, pssm.GetByRow(), pssm.GetNumRows(),
306 pssm.GetNumColumns());
307 return retval.release();
308 }
309
310 CNcbiMatrix<double>*
GetWeightedResidueFrequencies(const objects::CPssmWithParameters & pssm_asn)311 CScorematPssmConverter::GetWeightedResidueFrequencies
312 (const objects::CPssmWithParameters& pssm_asn)
313 {
314 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
315 !pssm_asn.GetPssm().GetIntermediateData().
316 CanGetWeightedResFreqsPerPos() ||
317 pssm_asn.GetPssm().GetIntermediateData().
318 GetWeightedResFreqsPerPos().empty() ) {
319 return NULL;
320 }
321
322 const CPssm& pssm = pssm_asn.GetPssm();
323 _ASSERT((size_t)pssm.GetIntermediateData().
324 GetWeightedResFreqsPerPos().size() ==
325 (size_t)pssm.GetNumRows()*pssm_asn.GetPssm().GetNumColumns());
326
327 unique_ptr< CNcbiMatrix<double> > retval
328 (new CNcbiMatrix<double>(BLASTAA_SIZE, pssm.GetNumColumns(), 0.0));
329
330 Convert2Matrix(pssm.GetIntermediateData().GetWeightedResFreqsPerPos(),
331 *retval, pssm.GetByRow(), pssm.GetNumRows(),
332 pssm.GetNumColumns());
333 return retval.release();
334 }
335
336 void
GetInformationContent(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)337 CScorematPssmConverter::GetInformationContent
338 (const objects::CPssmWithParameters& pssm_asn,
339 vector<double>& retval)
340 {
341 retval.clear();
342 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
343 !pssm_asn.GetPssm().GetIntermediateData().CanGetInformationContent() ||
344 pssm_asn.GetPssm().
345 GetIntermediateData().GetInformationContent().empty() ) {
346 return;
347 }
348 const CPssm& pssm = pssm_asn.GetPssm();
349 copy(pssm.GetIntermediateData().GetInformationContent().begin(),
350 pssm.GetIntermediateData().GetInformationContent().end(),
351 back_inserter(retval));
352 }
353
354 void
GetGaplessColumnWeights(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)355 CScorematPssmConverter::GetGaplessColumnWeights
356 (const objects::CPssmWithParameters& pssm_asn,
357 vector<double>& retval)
358 {
359 retval.clear();
360 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
361 !pssm_asn.GetPssm().
362 GetIntermediateData().CanGetGaplessColumnWeights() ||
363 pssm_asn.GetPssm().
364 GetIntermediateData().GetGaplessColumnWeights().empty() ) {
365 return;
366 }
367 const CPssm& pssm = pssm_asn.GetPssm();
368 copy(pssm.GetIntermediateData().GetGaplessColumnWeights().begin(),
369 pssm.GetIntermediateData().GetGaplessColumnWeights().end(),
370 back_inserter(retval));
371 }
372
373 void
GetSigma(const objects::CPssmWithParameters & pssm_asn,vector<double> & retval)374 CScorematPssmConverter::GetSigma(const objects::CPssmWithParameters& pssm_asn,
375 vector<double>& retval)
376 {
377 retval.clear();
378 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
379 !pssm_asn.GetPssm().GetIntermediateData().CanGetSigma() ||
380 pssm_asn.GetPssm().GetIntermediateData().GetSigma().empty() ) {
381 return;
382 }
383 const CPssm& pssm = pssm_asn.GetPssm();
384 copy(pssm.GetIntermediateData().GetSigma().begin(),
385 pssm.GetIntermediateData().GetSigma().end(),
386 back_inserter(retval));
387 }
388
389 void
GetIntervalSizes(const objects::CPssmWithParameters & pssm_asn,vector<int> & retval)390 CScorematPssmConverter::GetIntervalSizes
391 (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
392 {
393 retval.clear();
394 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
395 !pssm_asn.GetPssm().
396 GetIntermediateData().CanGetIntervalSizes() ||
397 pssm_asn.GetPssm().
398 GetIntermediateData().GetIntervalSizes().empty() ) {
399 return;
400 }
401 const CPssm& pssm = pssm_asn.GetPssm();
402 copy(pssm.GetIntermediateData().GetIntervalSizes().begin(),
403 pssm.GetIntermediateData().GetIntervalSizes().end(),
404 back_inserter(retval));
405 }
406
407 void
GetNumMatchingSeqs(const objects::CPssmWithParameters & pssm_asn,vector<int> & retval)408 CScorematPssmConverter::GetNumMatchingSeqs
409 (const objects::CPssmWithParameters& pssm_asn, vector<int>& retval)
410 {
411 retval.clear();
412 if ( !pssm_asn.GetPssm().CanGetIntermediateData() ||
413 !pssm_asn.GetPssm().
414 GetIntermediateData().CanGetNumMatchingSeqs() ||
415 pssm_asn.GetPssm().
416 GetIntermediateData().GetNumMatchingSeqs().empty() ) {
417 return;
418 }
419 const CPssm& pssm = pssm_asn.GetPssm();
420 copy(pssm.GetIntermediateData().GetNumMatchingSeqs().begin(),
421 pssm.GetIntermediateData().GetNumMatchingSeqs().end(),
422 back_inserter(retval));
423 }
424
425 void
PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters & pssm,int gap_open,int gap_extend)426 PsiBlastAddAncillaryPssmData(objects::CPssmWithParameters& pssm,
427 int gap_open,
428 int gap_extend)
429 {
430 _ASSERT(pssm.GetParams().GetRpsdbparams().IsSetMatrixName());
431 pssm.SetParams().SetRpsdbparams().SetGapOpen(gap_open);
432 pssm.SetParams().SetRpsdbparams().SetGapExtend(gap_extend);
433 }
434
435 /** After creating the PSSM from frequency ratios, adjust the frequency ratios
436 * matrix to match the dimensions of the score matrix
437 * @param pssm matrix to adjust [in|out]
438 */
439 static void
s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters & pssm)440 s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(objects::CPssmWithParameters&
441 pssm)
442 {
443 _ASSERT(pssm.GetPssm().GetNumRows() < BLASTAA_SIZE);
444 if (pssm.GetPssm().CanGetFinalData()) {
445 _ASSERT(pssm.GetPssm().GetFinalData().GetScores().size() ==
446 (size_t)BLASTAA_SIZE*pssm.GetPssm().GetNumColumns());
447 }
448
449 const size_t diff = (size_t)BLASTAA_SIZE - pssm.GetPssm().GetNumRows();
450 CPssmIntermediateData::TFreqRatios& freq_ratios =
451 pssm.SetPssm().SetIntermediateData().SetFreqRatios();
452
453 if (pssm.GetPssm().GetByRow() == true) {
454 freq_ratios.resize(pssm.GetPssm().GetNumColumns() * BLASTAA_SIZE, 0.0);
455 } else {
456 CPssmIntermediateData::TFreqRatios::iterator itr = freq_ratios.begin();
457 for (int c = 0; c < pssm.GetPssm().GetNumColumns(); c++) {
458 advance(itr, pssm.GetPssm().GetNumRows());
459 freq_ratios.insert(itr, diff, 0.0);
460 }
461 }
462
463 pssm.SetPssm().SetNumRows() = BLASTAA_SIZE;
464 }
465
PsiBlastComputePssmScores(CRef<objects::CPssmWithParameters> pssm,const CBlastOptions & opts)466 void PsiBlastComputePssmScores(CRef<objects::CPssmWithParameters> pssm,
467 const CBlastOptions& opts)
468 {
469 CConstRef<CBioseq> query(&pssm->GetQuery().GetSeq());
470 CRef<IQueryFactory> seq_fetcher(new CObjMgrFree_QueryFactory(query)); /* NCBI_FAKE_WARNING */
471
472 CRef<ILocalQueryData> query_data(seq_fetcher->MakeLocalQueryData(&opts));
473 BLAST_SequenceBlk* seqblk = query_data->GetSequenceBlk();
474 _ASSERT(query_data->GetSeqLength(0) == (size_t)seqblk->length);
475 _ASSERT(query_data->GetSeqLength(0) ==
476 (size_t)pssm->GetPssm().GetNumColumns());
477 unique_ptr< CNcbiMatrix<double> > freq_ratios
478 (CScorematPssmConverter::GetFreqRatios(*pssm));
479
480 CPsiBlastInputFreqRatios pssm_engine_input(seqblk->sequence,
481 seqblk->length,
482 *freq_ratios,
483 opts.GetMatrixName());
484 CPssmEngine pssm_engine(&pssm_engine_input);
485 CRef<CPssmWithParameters> pssm_with_scores(pssm_engine.Run());
486
487 if (pssm->GetPssm().GetNumRows() !=
488 pssm_with_scores->GetPssm().GetNumRows()) {
489 _ASSERT(pssm_with_scores->GetPssm().GetNumRows() == BLASTAA_SIZE);
490 s_AdjustFrequencyRatiosMatrixToMatchScoreMatrix(*pssm);
491 }
492 pssm->SetPssm().SetFinalData().SetScores() =
493 pssm_with_scores->GetPssm().GetFinalData().GetScores();
494 pssm->SetPssm().SetFinalData().SetLambda() =
495 pssm_with_scores->GetPssm().GetFinalData().GetLambda();
496 pssm->SetPssm().SetFinalData().SetKappa() =
497 pssm_with_scores->GetPssm().GetFinalData().GetKappa();
498 pssm->SetPssm().SetFinalData().SetH() =
499 pssm_with_scores->GetPssm().GetFinalData().GetH();
500
501 PsiBlastAddAncillaryPssmData(*pssm,
502 opts.GetGapOpeningCost(),
503 opts.GetGapExtensionCost());
504 }
505
506 /// Returns the evalue from this score object
507 /// @param score ASN.1 score object [in]
s_GetEvalue(const CScore & score)508 static double s_GetEvalue(const CScore& score)
509 {
510 string score_type = score.GetId().GetStr();
511 if (score.GetValue().IsReal() &&
512 (score_type == "e_value" || score_type == "sum_e")) {
513 return score.GetValue().GetReal();
514 }
515 return numeric_limits<double>::max();
516 }
517
518 /// Returns the bit_score from this score object
519 /// @param score ASN.1 score object [in]
s_GetBitScore(const CScore & score)520 static double s_GetBitScore(const CScore& score)
521 {
522 string score_type = score.GetId().GetStr();
523 if (score.GetValue().IsReal() && score_type == "bit_score") {
524 return score.GetValue().GetReal();
525 }
526 return BLAST_EXPECT_VALUE;
527 }
528
GetLowestEvalue(const objects::CDense_seg::TScores & scores,double * bit_score)529 double GetLowestEvalue(const objects::CDense_seg::TScores& scores,
530 double* bit_score /* = NULL */)
531 {
532 double retval = BLAST_EXPECT_VALUE;
533 double tmp;
534 if (bit_score) {
535 *bit_score = retval;
536 }
537
538 ITERATE(CDense_seg::TScores, i, scores) {
539 if ( (tmp = s_GetEvalue(**i)) < retval) {
540 retval = tmp;
541 }
542 if (bit_score && ((tmp = s_GetBitScore(**i)) > *bit_score)) {
543 *bit_score = tmp;
544 }
545 }
546 return retval;
547 }
548
549 void
operator ()(const objects::CSeq_align_set & alignments,double evalue_inclusion_threshold,THitIdentifiers & output)550 CPsiBlastAlignmentProcessor::operator()
551 (const objects::CSeq_align_set& alignments,
552 double evalue_inclusion_threshold,
553 THitIdentifiers& output)
554 {
555 output.clear();
556
557 ITERATE(CSeq_align_set::Tdata, hsp, alignments.Get()) {
558 // Look for HSP with score less than inclusion_ethresh
559 double e = GetLowestEvalue((*hsp)->GetScore());
560 if (e < evalue_inclusion_threshold) {
561 CSeq_id_Handle sid =
562 CSeq_id_Handle::GetHandle((*hsp)->GetSeq_id(1));
563 output.insert(sid);
564 }
565 }
566 }
567
568 void
Pssm(const objects::CPssmWithParameters & pssm,bool require_scores)569 CPsiBlastValidate::Pssm(const objects::CPssmWithParameters& pssm,
570 bool require_scores)
571 {
572 if ( !pssm.CanGetPssm() ) {
573 NCBI_THROW(CBlastException, eInvalidArgument,
574 "Missing PSSM data");
575 }
576
577 bool missing_scores(false);
578 if ( !pssm.GetPssm().CanGetFinalData() ||
579 !pssm.GetPssm().GetFinalData().CanGetScores() ||
580 pssm.GetPssm().GetFinalData().GetScores().empty() ) {
581 missing_scores = true;
582 }
583
584 bool missing_freq_ratios(false);
585 if ( !pssm.GetPssm().CanGetIntermediateData() ||
586 !pssm.GetPssm().GetIntermediateData().CanGetFreqRatios() ||
587 pssm.GetPssm().GetIntermediateData().GetFreqRatios().empty() ) {
588 missing_freq_ratios = true;
589 }
590
591 if (missing_freq_ratios && missing_scores) {
592 NCBI_THROW(CBlastException, eInvalidArgument,
593 "PSSM data must contain either scores or frequency ratios");
594 }
595 if (missing_scores && require_scores) {
596 NCBI_THROW(CBlastException, eInvalidArgument,
597 "PSSM data must contain scores (did you run the PSSM engine?)");
598 }
599
600 // Only unscaled PSSMs are supported
601 if (!missing_scores &&
602 pssm.GetPssm().GetFinalData().CanGetScalingFactor() &&
603 pssm.GetPssm().GetFinalData().GetScalingFactor() != 1) {
604 string msg("PSSM has a scaling factor of ");
605 msg += NStr::IntToString(pssm.GetPssm()
606 .GetFinalData()
607 .GetScalingFactor());
608 msg += ". PSI-BLAST does not accept scaled PSSMs";
609 NCBI_THROW(CBlastException, eInvalidArgument, msg);
610 }
611
612 if ( !pssm.HasQuery() ) {
613 NCBI_THROW(CBlastException, eInvalidArgument,
614 "Missing query sequence in PSSM");
615 }
616 if ( !pssm.GetQuery().IsSeq() ) {
617 NCBI_THROW(CBlastException, eInvalidArgument,
618 "Query sequence in ASN.1 PSSM is not a single Bioseq");
619 }
620
621 if ( !pssm.GetPssm().GetIsProtein() ) {
622 NCBI_THROW(CBlastException, eInvalidArgument,
623 "PSSM does not represent protein scoring matrix");
624 }
625 }
626
627 void
QueryFactory(CRef<IQueryFactory> query_factory,const CBlastOptionsHandle & opts_handle,EQueryFactoryType qf_type)628 CPsiBlastValidate::QueryFactory(CRef<IQueryFactory> query_factory,
629 const CBlastOptionsHandle& opts_handle,
630 EQueryFactoryType qf_type)
631 {
632 CRef<ILocalQueryData> query_data =
633 query_factory->MakeLocalQueryData(&opts_handle.GetOptions());
634
635 // Compose the exception error message
636 string excpt_msg("PSI-BLAST only accepts ");
637 if (qf_type == eQFT_Query) {
638 excpt_msg += "one protein sequence as query";
639 } else if (qf_type == eQFT_Subject) {
640 excpt_msg += "protein sequences as subjects";
641 } else {
642 abort();
643 }
644
645 if (qf_type == eQFT_Query) {
646 if (query_data->GetNumQueries() != 1) {
647 NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
648 }
649 }
650
651 BLAST_SequenceBlk* sblk = NULL;
652 try { sblk = query_data->GetSequenceBlk(); }
653 catch (const CBlastException& e) {
654 if (e.GetMsg().find("Incompatible sequence codings") != NPOS) {
655 NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
656 }
657 }
658 _ASSERT(sblk);
659 _ASSERT(sblk->length > 0);
660
661 CFormatGuess::ESequenceType sequence_type =
662 CFormatGuess::SequenceType((const char*)sblk->sequence_start,
663 static_cast<unsigned>(sblk->length));
664 if (sequence_type == CFormatGuess::eNucleotide) {
665 excpt_msg.assign("PSI-BLAST cannot accept nucleotide ");
666 excpt_msg += (qf_type == eQFT_Query ? "queries" : "subjects");
667 NCBI_THROW(CBlastException, eInvalidArgument, excpt_msg);
668 }
669 }
670
671 END_SCOPE(blast)
672 END_NCBI_SCOPE
673
674 /* @} */
675