1 /*  $Id: cuDmBlastscore.cpp 337868 2011-09-15 14:35:39Z lanczyck $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Chris Lanczycki
27 *
28 * File Description:
29 *
30 *      Concrete distance matrix class.
31 *      Distance is computed based on a scoring matrix, where the
32 *      score is derived from a pairwise BLAST of sequences in the CD.
33 *      There is an option to extend an alignment at the N-terminal and C-terminal
34 *      end of an existing alignment by a specified amount.  One can also specify
35 *      to do an unrestricted BLAST of the complete sequences.
36 *
37 *
38 */
39 
40 #include <ncbi_pch.hpp>
41 //#include <objects/seqalign/Score.hpp>
42 
43 #include <algo/structure/cd_utils/cuDmBlastscore.hpp>
44 #include <algo/structure/cd_utils/cuAlign.hpp>
45 #include <algo/structure/cd_utils/cuBlast2Seq.hpp>
46 BEGIN_NCBI_SCOPE
47 BEGIN_SCOPE(cd_utils)
48 
49 //  DM_BlastScore class
50 
51 //  The distance between two rows in an alignment is pairwise BLAST
52 //  score of the specified region of the sequences:
53 //
54 //  d[i][j] = offset - pairwise_BlastScore(i, j)
55 //
56 //  where 'offset' is a CD-dependent constant that allows transformation of the
57 //  largest Blast scores to the shortest distances.  Note that each row will
58 //  in general have a different score for a Blast against itself, making d=0 ambiguous.
59 
60 const bool   DM_BlastScore::USE_FULL_SEQUENCE_DEFAULT = true;
61 const double DM_BlastScore::E_VAL_ON_BLAST_FAILURE    = E_VAL_NOT_FOUND;
62 const double DM_BlastScore::SCORE_ON_BLAST_FAILURE    = SCORE_NOT_FOUND;
63 
~DM_BlastScore()64 DM_BlastScore::~DM_BlastScore() {
65 }
66 
DM_BlastScore(EScoreMatrixType type,int ext)67 DM_BlastScore::DM_BlastScore(EScoreMatrixType type, int ext) : DistanceMatrix() {
68     initDMBlastScore(type, ext, ext);
69 }
70 
DM_BlastScore(EScoreMatrixType type,int nTermExt,int cTermExt)71 DM_BlastScore::DM_BlastScore(EScoreMatrixType type, int nTermExt, int cTermExt)
72 : DistanceMatrix()
73 {
74     initDMBlastScore(type, nTermExt, cTermExt);
75 }
76 /*
77 DM_BlastScore::DM_BlastScore(const CCd* cd, EScoreMatrixType type, int ext) :
78     DistanceMatrix(cd) {
79     initDMBlastScore(type, ext, ext);
80 }
81 
82 DM_BlastScore::DM_BlastScore(const CCd* cd, EScoreMatrixType type, int nTermExt, int cTermExt) :
83     DistanceMatrix(cd){
84     initDMBlastScore(type, nTermExt, cTermExt);
85 }
86 */
initDMBlastScore(EScoreMatrixType type,int nTermExt,int cTermExt)87 void DM_BlastScore::initDMBlastScore(EScoreMatrixType type, int nTermExt, int cTermExt) {
88     m_scoreMatrix = new ScoreMatrix(type);
89     m_useAligned = false;
90     m_nTermExt = nTermExt;
91     m_cTermExt = cTermExt;
92 	SetUseFullSequence(USE_FULL_SEQUENCE_DEFAULT);
93     if (m_dMethod == eScoreBlastFoot && m_nTermExt == 0 && m_cTermExt == 0) {
94         m_useAligned = true;
95     }
96 }
97 
SetUseFullSequence(bool value)98 void DM_BlastScore::SetUseFullSequence(bool value) {
99 	m_useFullSequence = value;
100 	m_dMethod = (m_useFullSequence) ? eScoreBlastFull : eScoreBlastFoot;
101 }
102 
ComputeMatrix(pProgressFunction pFunc)103 bool DM_BlastScore::ComputeMatrix(pProgressFunction pFunc) {
104 	bool result = false;
105     if (m_aligns == NULL) {
106         return result;
107     }
108 	result = CalcPairwiseScoresOnTheFly(pFunc);
109 	//#if _DEBUG
110 	//    ofstream ofs;
111 	//	ofs.open("dm.txt");
112 	//	DistanceMatrix::writeMat(ofs, *this);
113 	//	ofs.close();
114 	//#endif
115 
116 	return result;
117 }
118 
CalcPairwiseScoresOnTheFly(pProgressFunction pFunc)119 bool DM_BlastScore::CalcPairwiseScoresOnTheFly(pProgressFunction pFunc) {
120     int i, j, k;
121 	int nrows = 0;
122 	double maxScore, minScore;
123 
124     vector<double> AllScores;
125     nrows = m_aligns->GetNumRows();
126 	CdBlaster blaster(*m_aligns, GetMatrixName());
127 	if (m_useFullSequence)
128 		blaster.useWholeSequence(true);
129 	else
130 		blaster.setFootprintExtension(GetNTermExt(), GetCTermExt());
131 	blaster.blast(pFunc);
132 	//  The seq_aligns are returned in order (10),(20),(21),(30),(31),(32),...
133 	i=0;
134 	m_Array[0][0] = 0.0;
135     for (j=1; j<nrows; j++) {
136         m_Array[j][j] = 0.0;
137         // for each other row in the alignment
138         for (k=0; k<j; k++) {
139 			/*
140             m_Array[j][k] = AllScores[i];*/
141 			m_Array[j][k] = blaster.getPairwiseScore(j,k);
142             m_Array[k][j] = m_Array[j][k];
143             ++i;
144         }
145     }
146 
147 	//  FudgeFactor:  don't want a zero-distance for maxScore
148 	double FudgeFactor = 1.01;
149 	GetExtremalEntries(maxScore, minScore, true);
150 	LinearTransform(maxScore*FudgeFactor, -1.0, true);
151 	return true;
152 }
153 END_SCOPE(cd_utils)
154 END_NCBI_SCOPE
155