1 static char const rcsid[] = "$Id: posit2.c,v 6.12 2006/09/18 20:54:27 kans Exp $";
2
3 /* $Id: posit2.c,v 6.12 2006/09/18 20:54:27 kans Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28
29 /*****************************************************************************
30
31 File name: posit2.c
32
33 Author: Alejandro Schaffer
34
35 Contents: utilities for makematrices.
36
37 $Revision: 6.12 $
38
39 *****************************************************************************/
40
41 /*
42 * $Log: posit2.c,v $
43 * Revision 6.12 2006/09/18 20:54:27 kans
44 * removed PROTEIN_ALPHABET - collides with new version in posit.h
45 *
46 * Revision 6.11 2005/07/28 14:57:10 coulouri
47 * remove dead code
48 *
49 * Revision 6.10 2004/06/22 14:16:56 camacho
50 * Changed invocation of posFreqsToMatrix to conform with new signature
51 *
52 * Revision 6.9 2003/05/30 17:25:37 coulouri
53 * add rcsid
54 *
55 * Revision 6.8 2003/04/25 12:55:13 thiessen
56 * return Boolean value to indicate success/failure of impalaScaling()
57 *
58 * Revision 6.7 2001/11/14 13:57:13 madden
59 * Add warning if (maxScore - minScore) >= scoreRange
60 *
61 * Revision 6.6 2001/05/04 19:16:00 shavirin
62 * Corrected error reporting functions.
63 *
64 * Revision 6.5 2000/11/20 14:35:51 madden
65 * Changed FileOpen mode for byte-encoded checkpoint files from "r" to "rb" or from "w" to "wb" to solve a problem on Windows NT.
66 *
67 * Revision 6.4 2000/07/31 16:41:02 shavirin
68 * Reduced POSIT_SCALE_FACTOR from 1000 to 200 to avoid overflow
69 * with BLOSUM80; moved declaration os POSIT_SCALE_FACTOR to posit.h
70 *
71 * Revision 6.3 2000/07/25 18:12:05 shavirin
72 * WARNING: This is no-turning-back changed related to S&W Blast from
73 * Alejandro Schaffer
74 *
75 *
76 */
77
78 #include<ncbi.h>
79 #include <blastpri.h>
80 #include<objcode.h>
81 #include<objseq.h>
82 #include<objsset.h>
83 #include<sequtil.h>
84 #include <posit.h>
85 #include <txalign.h>
86 #include <profiles.h>
87
88 /*Used to check that diagnostics printing routine will work*/
89 #define EFFECTIVE_ALPHABET 20
90
91 #define POSIT_PERCENT 0.05
92
93 #define POSIT_NUM_ITERATIONS 10
94
95
96
fillSfp(BLAST_Score ** matrix,Int4 matrixLength,Nlm_FloatHi * queryProbArray,Nlm_FloatHi * scoreArray,BLAST_ScoreFreqPtr return_sfp)97 static BLAST_ScoreFreqPtr fillSfp(BLAST_Score **matrix, Int4 matrixLength, Nlm_FloatHi *queryProbArray, Nlm_FloatHi *scoreArray, BLAST_ScoreFreqPtr return_sfp)
98 {
99 Int4 minScore, maxScore; /*observed minimum and maximum scores*/
100 Int4 i,j,k; /* indices */
101 Nlm_FloatHi onePosFrac; /*1/matrix length as a double*/
102 Int4 charPositions[20] = {1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
103
104
105 minScore = maxScore = 0;
106
107 for(i = 0; i < matrixLength; i++) {
108 for(j = 0 ; j < EFFECTIVE_ALPHABET; j++) {
109 k = charPositions[j];
110 if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore))
111 minScore = matrix[i][k];
112 if (matrix[i][k] > maxScore)
113 maxScore = matrix[i][k];
114 }
115 }
116 return_sfp->obs_min = minScore;
117 return_sfp->obs_max = maxScore;
118 if ((maxScore - minScore) >= scoreRange) {
119 ErrPostEx(SEV_WARNING, 0, 0, "maxScore is %d, minScore is %d difference is >= allowed score range %d", maxScore, minScore, scoreRange);
120 return NULL;
121 }
122 for (i = 0; i < scoreRange; i++)
123 scoreArray[i] = 0.0;
124 return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
125 onePosFrac = 1.0/ ((Nlm_FloatHi) matrixLength);
126 for(i = 0; i < matrixLength; i++) {
127 for (j = 0; j < EFFECTIVE_ALPHABET; j++) {
128 k = charPositions[j];
129 if(matrix[i][k] >= minScore) {
130 return_sfp->sprob[matrix[i][k]] += (onePosFrac * queryProbArray[k]);
131 }
132 }
133 }
134 return_sfp->score_avg = 0;
135 for(i = minScore; i <= maxScore; i++)
136 return_sfp->score_avg += i * return_sfp->sprob[i];
137 return(return_sfp);
138 }
139
140
141 static Boolean
impalaScaleMatrix(BlastMatrixRescalePtr matrix_rescale,Nlm_FloatHi scalingFactor,Boolean doBinarySearch)142 impalaScaleMatrix(BlastMatrixRescalePtr matrix_rescale, Nlm_FloatHi scalingFactor, Boolean doBinarySearch)
143 {
144 Int4 dim1, dim2; /*number of rows and number of columns*/
145 Int4 a,c; /*loop indices*/
146 Boolean too_high=TRUE, done, first_time; /*control variables for binary search*/
147 Nlm_FloatHi factor, factor_low=1.0, factor_high=1.0; /*multiplicative factors in binary search*/
148 Nlm_FloatHi lambda, new_lambda; /*Karlin-Altschul parameter*/
149 Int4 index; /*loop index for binary search*/
150 Int4Ptr *private_matrix; /*pointer to locally manipulated version of the
151 matrix*/
152 Int4Ptr *matrix;
153 Nlm_FloatHi scalefactor; /*local version of amount of scaling*/
154 Nlm_FloatHi divFactor; /*1/scalefacor*/
155 BLAST_ScoreFreqPtr this_sfp, return_sfp; /*score frequency pointers to
156 compute lambda*/
157 Nlm_FloatHi *scoreArray; /*array of score probabilities*/
158
159 scoreArray = (Nlm_FloatHi *) MemNew(scoreRange * sizeof(Nlm_FloatHi));
160 return_sfp = (BLAST_ScoreFreqPtr) MemNew(1 * sizeof(BLAST_ScoreFreq));
161
162
163 private_matrix = matrix_rescale->private_matrix;
164 matrix = matrix_rescale->matrix;
165
166 /* Bracket the values. */
167 dim1 = matrix_rescale->query_length;
168 dim2 = matrix_rescale->alphabet_size;
169
170 lambda = matrix_rescale->lambda_ideal/scalingFactor;
171 divFactor = ((Nlm_FloatHi) POSIT_SCALE_FACTOR)/scalingFactor;
172 factor = 1.0;
173
174 if (doBinarySearch) {
175 done = FALSE;
176 first_time = TRUE;
177 while (done != TRUE) {
178 for(c = 0; c < dim1; c++) {
179 for(a = 0; a < dim2; a++) {
180 if (private_matrix[c][a] == BLAST_SCORE_MIN) {
181 matrix[c][a] = BLAST_SCORE_MIN;
182 } else {
183 matrix[c][a] = (factor*private_matrix[c][a]) /divFactor;
184 }
185 }
186 }
187
188 this_sfp = fillSfp(matrix, dim1, matrix_rescale->standardProb, scoreArray, return_sfp);
189 if (!this_sfp) {
190 MemFree(scoreArray);
191 MemFree(return_sfp);
192 return FALSE;
193 }
194 new_lambda = impalaKarlinLambdaNR(this_sfp, matrix_rescale->kbp_psi[0]->Lambda/scalingFactor);
195
196 if (new_lambda > lambda) {
197 if (first_time) {
198 factor_high = 1.0 + POSIT_PERCENT;
199 factor = factor_high;
200 factor_low = 1.0;
201 too_high = TRUE;
202 first_time = FALSE;
203 } else {
204 if (too_high == FALSE)
205 break;
206 factor_high += (factor_high-1.0);
207 factor = factor_high;
208 }
209 } else {
210 if (first_time) {
211 factor_high = 1.0;
212 factor_low = 1.0 - POSIT_PERCENT;
213 factor = factor_low;
214 too_high = FALSE;
215 first_time = FALSE;
216 } else {
217 if (too_high == TRUE)
218 break;
219 factor_low += (factor_low-1.0);
220 factor = factor_low;
221 }
222 }
223 }
224
225 /* binary search for ten times. */
226 for (index=0; index<POSIT_NUM_ITERATIONS; index++) {
227 factor = 0.5*(factor_high+factor_low);
228 for(c = 0; c < dim1; c++) {
229 for(a = 0; a < dim2; a++) {
230 if (private_matrix[c][a] == BLAST_SCORE_MIN) {
231 matrix[c][a] = BLAST_SCORE_MIN;
232 } else {
233 matrix[c][a] = (factor*private_matrix[c][a])/divFactor;
234 }
235 }
236 }
237
238 this_sfp = fillSfp(matrix, dim1, matrix_rescale->standardProb, scoreArray, return_sfp);
239 if (!this_sfp) {
240 MemFree(scoreArray);
241 MemFree(return_sfp);
242 return FALSE;
243 }
244 new_lambda = impalaKarlinLambdaNR(this_sfp, matrix_rescale->kbp_psi[0]->Lambda/scalingFactor);
245
246 if (new_lambda > lambda) {
247 factor_low = factor;
248 } else {
249 factor_high = factor;
250 }
251 }
252 }
253
254 for(c = 0; c < dim1; c++)
255 for(a = 0; a < dim2; a++) {
256 if (BLAST_SCORE_MIN != private_matrix[c][a]) {
257 matrix[c][a] = Nlm_Nint((Nlm_FloatHi) private_matrix[c][a] *
258 factor / POSIT_SCALE_FACTOR);
259 }
260 }
261
262 updateLambdaK(matrix_rescale, TRUE);
263
264 scalefactor = ((Nlm_FloatHi) scalingFactor) / POSIT_SCALE_FACTOR;
265 for(c = 0; c < dim1; c++)
266 for(a = 0; a < dim2; a++) {
267 if (BLAST_SCORE_MIN != private_matrix[c][a]) {
268 private_matrix[c][a] = Nlm_Nint((Nlm_FloatHi) private_matrix[c][a] *
269 factor * scalefactor);
270 }
271 }
272
273 for(a = 0; a < matrix_rescale->alphabet_size; a++) {
274 matrix[dim1][a] = BLAST_SCORE_MIN;
275 private_matrix[dim1][a] = BLAST_SCORE_MIN;
276 }
277
278 MemFree(scoreArray);
279 MemFree(return_sfp);
280 return TRUE;
281 }
282
impalaScaling(posSearchItems * posSearch,compactSearchItems * compactSearch,Nlm_FloatHi scalingFactor,Boolean doBinarySearch)283 Boolean LIBCALL impalaScaling(posSearchItems *posSearch, compactSearchItems * compactSearch, Nlm_FloatHi scalingFactor, Boolean doBinarySearch)
284 {
285 BlastMatrixRescalePtr matrix_rescale;
286 Boolean okay; /* return value - whether impalaScaleMatrix succeeded */
287
288 matrix_rescale = BlastMatrixRescaleNew(compactSearch->alphabetSize,
289 compactSearch->qlength,
290 compactSearch->query,
291 compactSearch->standardProb,
292 posSearch->posMatrix,
293 posSearch->posPrivateMatrix,
294 compactSearch->kbp_std,
295 compactSearch->kbp_psi,
296 compactSearch->kbp_gap_std,
297 compactSearch->kbp_gap_psi,
298 compactSearch->lambda_ideal,
299 compactSearch->K_ideal);
300
301 okay = impalaScaleMatrix(matrix_rescale, scalingFactor, doBinarySearch);
302
303 matrix_rescale = BlastMatrixRescaleDestruct(matrix_rescale);
304
305 return okay;
306 }
307
308 /*Some of the following checkpointing code is taken and adapted from
309 code written by K. Shriram for FASTLINK.
310 Reference:
311 A. A. Schaffer, S. K. Gupta, K. Shriram, and R. W. Cottingham, Jr.
312 Avoiding Recomputation in Linkage Analysis,
313 Human Heredity 44(1994), pp. 225-237. */
314
315
316 /* Front-ends to retrieve numbers. */
317
318 #define getCkptNlm_FloatHi(d, ckptFile) (getCkptNumber(&(d),sizeof(Nlm_FloatHi),ckptFile))
319 #define getCkptInt4(i, ckptFile) (getCkptNumber(&(i),sizeof(Int4),ckptFile))
320 #define getCkptChar(c, ckptFile) (getCkptNumber(&(c),sizeof(Char),ckptFile))
321
322 /*Read a checkpoint from the end of a previous PSI-BLAST round, get
323 query length, query, and position-specific target frequencies.
324 Returns TRUE if checkpoint was read sucessfully and FALSE otherwise.
325 scalingFactor records by how much to scale output scores */
326
impalaReadCheckpoint(posSearchItems * posSearch,compactSearchItems * compactSearch,CharPtr fileName,ValNodePtr * error_return,Nlm_FloatHi scalingFactor)327 Boolean LIBCALL impalaReadCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, ValNodePtr * error_return,
328 Nlm_FloatHi scalingFactor)
329 {
330 FILE * checkFile; /*file in which to take the checkpoint*/
331 Int4 length1, length2, c; /*length of query sequence, and an index for it*/
332 Char nextRes; /*next residue in stored copy of the query sequence*/
333 Uint1Ptr oldQuery; /*array to hold the query sequence*/
334 Int4 i,j; /*indices for position and character in alphabet*/
335
336 ErrPostEx(SEV_INFO, 0, 0, "Attempting to recover data from previous checkpoint");
337
338 checkFile = FileOpen(fileName, "rb");
339
340 if (NULL == checkFile) {
341 ErrPostEx(SEV_WARNING, 0, 0, "Could not open checkpoint file");
342 return(FALSE);
343 }
344
345 length1 = compactSearch->qlength;
346 getCkptInt4(length2,checkFile);
347
348 if (length1 != length2) {
349 ErrPostEx(SEV_WARNING, 0, 0, "Invalid usage of checkpoint recovery; old query has length %ld, new query has length %ld", (long) length2, (long) length1);
350 ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
351 FileClose(checkFile);
352 return(FALSE);
353 }
354
355 oldQuery = (Uint1Ptr) MemNew(length1 * sizeof(Uint1));
356
357 if (NULL == oldQuery) {
358 ErrPostEx(SEV_WARNING, 0, 0, "Failed to reconstruct previous query");
359 ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
360 FileClose(checkFile);
361 return(FALSE);
362 }
363
364 for(c = 0; c < length1; c++) {
365 getCkptChar(nextRes, checkFile);
366 oldQuery[c] = ResToInt(nextRes);
367 if ((oldQuery[c] != compactSearch->query[c]) && (oldQuery[c] != Xchar)) {
368 ErrPostEx(SEV_WARNING, 0, 0, "Stored query has a %c at position %ld, while new query has a %c there",getRes(oldQuery[c]), (long) c, getRes(compactSearch->query[c]));
369 ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
370 MemFree(oldQuery);
371 FileClose(checkFile);
372 return(FALSE);
373 }
374 }
375 posSearch->posMatrix = (BLAST_Score **) MemNew((length1 + 1) * sizeof(BLAST_Score *));
376 posSearch->posPrivateMatrix = (BLAST_Score **) MemNew((length1 + 1) * sizeof(BLAST_Score *));
377 posSearch->posFreqs = (Nlm_FloatHi **) MemNew((length1 + 1) * sizeof(Nlm_FloatHi *));
378 if ((NULL == posSearch->posMatrix) || (NULL == posSearch->posPrivateMatrix) || (NULL == posSearch->posFreqs)) {
379
380 ErrPostEx(SEV_WARNING, 0, 0, "Failed to allocate position-specific score matrix");
381 ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
382 MemFree(oldQuery);
383 FileClose(checkFile);
384 return(FALSE);
385 }
386 for(i = 0; i <= length1; i++) {
387 posSearch->posMatrix[i] = (BLAST_Score *) MemNew(compactSearch->alphabetSize * sizeof(BLAST_Score));
388 posSearch->posPrivateMatrix[i] = (BLAST_Score *) MemNew(compactSearch->alphabetSize * sizeof(BLAST_Score));
389 posSearch->posFreqs[i] = (Nlm_FloatHi *) MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
390
391 if ((NULL == posSearch->posMatrix[i]) || (NULL == posSearch->posPrivateMatrix[i]) || (NULL == posSearch->posFreqs[i])) {
392 ErrPostEx(SEV_ERROR, 0, 0, "Failed to allocate position-specific score matrix");
393 ErrPostEx(SEV_ERROR, 0, 0, "Failed to recover data\n");
394 MemFree(oldQuery);
395 FileClose(checkFile);
396 return(FALSE);
397 }
398 for(j = 0; j < compactSearch->alphabetSize; j++) {
399 posSearch->posFreqs[i][j] = 0.0;
400 }
401 }
402 getCkptFreqMatrix(posSearch->posFreqs,length1,compactSearch->alphabetSize,checkFile);
403 posFreqsToMatrix(posSearch,compactSearch);
404 impalaScaling(posSearch, compactSearch, scalingFactor, TRUE);
405
406 ErrPostEx(SEV_INFO, 0, 0, "Data recovered successfully");
407
408 MemFree(oldQuery);
409 FileClose(checkFile);
410 return(TRUE);
411 }
412
413
414
415
416