1 /* $Id: cuDmBlastscore.cpp 337868 2011-09-15 14:35:39Z lanczyck $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Chris Lanczycki
27 *
28 * File Description:
29 *
30 * Concrete distance matrix class.
31 * Distance is computed based on a scoring matrix, where the
32 * score is derived from a pairwise BLAST of sequences in the CD.
33 * There is an option to extend an alignment at the N-terminal and C-terminal
34 * end of an existing alignment by a specified amount. One can also specify
35 * to do an unrestricted BLAST of the complete sequences.
36 *
37 *
38 */
39
40 #include <ncbi_pch.hpp>
41 //#include <objects/seqalign/Score.hpp>
42
43 #include <algo/structure/cd_utils/cuDmBlastscore.hpp>
44 #include <algo/structure/cd_utils/cuAlign.hpp>
45 #include <algo/structure/cd_utils/cuBlast2Seq.hpp>
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(cd_utils)
48
49 // DM_BlastScore class
50
51 // The distance between two rows in an alignment is pairwise BLAST
52 // score of the specified region of the sequences:
53 //
54 // d[i][j] = offset - pairwise_BlastScore(i, j)
55 //
56 // where 'offset' is a CD-dependent constant that allows transformation of the
57 // largest Blast scores to the shortest distances. Note that each row will
58 // in general have a different score for a Blast against itself, making d=0 ambiguous.
59
60 const bool DM_BlastScore::USE_FULL_SEQUENCE_DEFAULT = true;
61 const double DM_BlastScore::E_VAL_ON_BLAST_FAILURE = E_VAL_NOT_FOUND;
62 const double DM_BlastScore::SCORE_ON_BLAST_FAILURE = SCORE_NOT_FOUND;
63
~DM_BlastScore()64 DM_BlastScore::~DM_BlastScore() {
65 }
66
DM_BlastScore(EScoreMatrixType type,int ext)67 DM_BlastScore::DM_BlastScore(EScoreMatrixType type, int ext) : DistanceMatrix() {
68 initDMBlastScore(type, ext, ext);
69 }
70
DM_BlastScore(EScoreMatrixType type,int nTermExt,int cTermExt)71 DM_BlastScore::DM_BlastScore(EScoreMatrixType type, int nTermExt, int cTermExt)
72 : DistanceMatrix()
73 {
74 initDMBlastScore(type, nTermExt, cTermExt);
75 }
76 /*
77 DM_BlastScore::DM_BlastScore(const CCd* cd, EScoreMatrixType type, int ext) :
78 DistanceMatrix(cd) {
79 initDMBlastScore(type, ext, ext);
80 }
81
82 DM_BlastScore::DM_BlastScore(const CCd* cd, EScoreMatrixType type, int nTermExt, int cTermExt) :
83 DistanceMatrix(cd){
84 initDMBlastScore(type, nTermExt, cTermExt);
85 }
86 */
initDMBlastScore(EScoreMatrixType type,int nTermExt,int cTermExt)87 void DM_BlastScore::initDMBlastScore(EScoreMatrixType type, int nTermExt, int cTermExt) {
88 m_scoreMatrix = new ScoreMatrix(type);
89 m_useAligned = false;
90 m_nTermExt = nTermExt;
91 m_cTermExt = cTermExt;
92 SetUseFullSequence(USE_FULL_SEQUENCE_DEFAULT);
93 if (m_dMethod == eScoreBlastFoot && m_nTermExt == 0 && m_cTermExt == 0) {
94 m_useAligned = true;
95 }
96 }
97
SetUseFullSequence(bool value)98 void DM_BlastScore::SetUseFullSequence(bool value) {
99 m_useFullSequence = value;
100 m_dMethod = (m_useFullSequence) ? eScoreBlastFull : eScoreBlastFoot;
101 }
102
ComputeMatrix(pProgressFunction pFunc)103 bool DM_BlastScore::ComputeMatrix(pProgressFunction pFunc) {
104 bool result = false;
105 if (m_aligns == NULL) {
106 return result;
107 }
108 result = CalcPairwiseScoresOnTheFly(pFunc);
109 //#if _DEBUG
110 // ofstream ofs;
111 // ofs.open("dm.txt");
112 // DistanceMatrix::writeMat(ofs, *this);
113 // ofs.close();
114 //#endif
115
116 return result;
117 }
118
CalcPairwiseScoresOnTheFly(pProgressFunction pFunc)119 bool DM_BlastScore::CalcPairwiseScoresOnTheFly(pProgressFunction pFunc) {
120 int i, j, k;
121 int nrows = 0;
122 double maxScore, minScore;
123
124 vector<double> AllScores;
125 nrows = m_aligns->GetNumRows();
126 CdBlaster blaster(*m_aligns, GetMatrixName());
127 if (m_useFullSequence)
128 blaster.useWholeSequence(true);
129 else
130 blaster.setFootprintExtension(GetNTermExt(), GetCTermExt());
131 blaster.blast(pFunc);
132 // The seq_aligns are returned in order (10),(20),(21),(30),(31),(32),...
133 i=0;
134 m_Array[0][0] = 0.0;
135 for (j=1; j<nrows; j++) {
136 m_Array[j][j] = 0.0;
137 // for each other row in the alignment
138 for (k=0; k<j; k++) {
139 /*
140 m_Array[j][k] = AllScores[i];*/
141 m_Array[j][k] = blaster.getPairwiseScore(j,k);
142 m_Array[k][j] = m_Array[j][k];
143 ++i;
144 }
145 }
146
147 // FudgeFactor: don't want a zero-distance for maxScore
148 double FudgeFactor = 1.01;
149 GetExtremalEntries(maxScore, minScore, true);
150 LinearTransform(maxScore*FudgeFactor, -1.0, true);
151 return true;
152 }
153 END_SCOPE(cd_utils)
154 END_NCBI_SCOPE
155