1 static char const rcsid[] = "$Id: posit2.c,v 6.12 2006/09/18 20:54:27 kans Exp $";
2 
3 /* $Id: posit2.c,v 6.12 2006/09/18 20:54:27 kans Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 
29 /*****************************************************************************
30 
31 File name: posit2.c
32 
33 Author: Alejandro Schaffer
34 
35 Contents: utilities for makematrices.
36 
37 $Revision: 6.12 $
38 
39 *****************************************************************************/
40 
41 /*
42  * $Log: posit2.c,v $
43  * Revision 6.12  2006/09/18 20:54:27  kans
44  * removed PROTEIN_ALPHABET - collides with new version in posit.h
45  *
46  * Revision 6.11  2005/07/28 14:57:10  coulouri
47  * remove dead code
48  *
49  * Revision 6.10  2004/06/22 14:16:56  camacho
50  * Changed invocation of posFreqsToMatrix to conform with new signature
51  *
52  * Revision 6.9  2003/05/30 17:25:37  coulouri
53  * add rcsid
54  *
55  * Revision 6.8  2003/04/25 12:55:13  thiessen
56  * return Boolean value to indicate success/failure of impalaScaling()
57  *
58  * Revision 6.7  2001/11/14 13:57:13  madden
59  * Add warning if (maxScore - minScore) >= scoreRange
60  *
61  * Revision 6.6  2001/05/04 19:16:00  shavirin
62  * Corrected error reporting functions.
63  *
64  * Revision 6.5  2000/11/20 14:35:51  madden
65  * Changed FileOpen mode for byte-encoded checkpoint files from "r" to "rb" or from "w" to "wb" to solve a problem on Windows NT.
66  *
67  * Revision 6.4  2000/07/31 16:41:02  shavirin
68  * Reduced POSIT_SCALE_FACTOR from 1000 to 200 to avoid overflow
69  * with BLOSUM80; moved declaration os POSIT_SCALE_FACTOR to posit.h
70  *
71  * Revision 6.3  2000/07/25 18:12:05  shavirin
72  * WARNING: This is no-turning-back changed related to S&W Blast from
73  * Alejandro Schaffer
74  *
75  *
76  */
77 
78 #include<ncbi.h>
79 #include <blastpri.h>
80 #include<objcode.h>
81 #include<objseq.h>
82 #include<objsset.h>
83 #include<sequtil.h>
84 #include <posit.h>
85 #include <txalign.h>
86 #include <profiles.h>
87 
88 /*Used to check that diagnostics printing routine will work*/
89 #define EFFECTIVE_ALPHABET 20
90 
91 #define POSIT_PERCENT 0.05
92 
93 #define POSIT_NUM_ITERATIONS 10
94 
95 
96 
fillSfp(BLAST_Score ** matrix,Int4 matrixLength,Nlm_FloatHi * queryProbArray,Nlm_FloatHi * scoreArray,BLAST_ScoreFreqPtr return_sfp)97 static BLAST_ScoreFreqPtr fillSfp(BLAST_Score **matrix, Int4 matrixLength, Nlm_FloatHi *queryProbArray, Nlm_FloatHi *scoreArray,  BLAST_ScoreFreqPtr return_sfp)
98 {
99     Int4 minScore, maxScore; /*observed minimum and maximum scores*/
100     Int4 i,j,k; /* indices */
101     Nlm_FloatHi onePosFrac; /*1/matrix length as a double*/
102     Int4 charPositions[20] = {1,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,22};
103 
104 
105     minScore = maxScore = 0;
106 
107     for(i = 0; i < matrixLength; i++) {
108         for(j = 0 ; j < EFFECTIVE_ALPHABET; j++) {
109             k = charPositions[j];
110             if ((matrix[i][k] != BLAST_SCORE_MIN) && (matrix[i][k] < minScore))
111                 minScore = matrix[i][k];
112             if (matrix[i][k] > maxScore)
113                 maxScore = matrix[i][k];
114         }
115     }
116     return_sfp->obs_min = minScore;
117     return_sfp->obs_max = maxScore;
118     if ((maxScore - minScore) >= scoreRange) {
119       ErrPostEx(SEV_WARNING, 0, 0, "maxScore is %d, minScore is %d difference is >= allowed score range %d", maxScore, minScore, scoreRange);
120       return NULL;
121     }
122     for (i = 0; i < scoreRange; i++)
123         scoreArray[i] = 0.0;
124     return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
125     onePosFrac = 1.0/ ((Nlm_FloatHi) matrixLength);
126     for(i = 0; i < matrixLength; i++) {
127         for (j = 0; j < EFFECTIVE_ALPHABET; j++) {
128             k = charPositions[j];
129             if(matrix[i][k] >= minScore) {
130                 return_sfp->sprob[matrix[i][k]] += (onePosFrac * queryProbArray[k]);
131             }
132         }
133     }
134     return_sfp->score_avg = 0;
135     for(i = minScore; i <= maxScore; i++)
136         return_sfp->score_avg += i * return_sfp->sprob[i];
137     return(return_sfp);
138 }
139 
140 
141 static Boolean
impalaScaleMatrix(BlastMatrixRescalePtr matrix_rescale,Nlm_FloatHi scalingFactor,Boolean doBinarySearch)142 impalaScaleMatrix(BlastMatrixRescalePtr matrix_rescale, Nlm_FloatHi scalingFactor, Boolean doBinarySearch)
143 {
144     Int4 dim1, dim2; /*number of rows and number of columns*/
145     Int4 a,c; /*loop indices*/
146     Boolean too_high=TRUE, done, first_time; /*control variables for binary search*/
147     Nlm_FloatHi factor, factor_low=1.0, factor_high=1.0; /*multiplicative factors in binary search*/
148     Nlm_FloatHi lambda, new_lambda; /*Karlin-Altschul parameter*/
149     Int4 index; /*loop index for binary search*/
150     Int4Ptr *private_matrix; /*pointer to locally manipulated version of the
151                                matrix*/
152     Int4Ptr *matrix;
153     Nlm_FloatHi scalefactor; /*local version of amount of scaling*/
154     Nlm_FloatHi divFactor; /*1/scalefacor*/
155     BLAST_ScoreFreqPtr this_sfp, return_sfp; /*score frequency pointers to
156                                                compute lambda*/
157     Nlm_FloatHi *scoreArray; /*array of score probabilities*/
158 
159     scoreArray = (Nlm_FloatHi *) MemNew(scoreRange * sizeof(Nlm_FloatHi));
160     return_sfp = (BLAST_ScoreFreqPtr) MemNew(1 * sizeof(BLAST_ScoreFreq));
161 
162 
163     private_matrix = matrix_rescale->private_matrix;
164     matrix = matrix_rescale->matrix;
165 
166     /* Bracket the values. */
167     dim1 = matrix_rescale->query_length;
168     dim2 = matrix_rescale->alphabet_size;
169 
170     lambda = matrix_rescale->lambda_ideal/scalingFactor;
171     divFactor = ((Nlm_FloatHi) POSIT_SCALE_FACTOR)/scalingFactor;
172     factor = 1.0;
173 
174     if (doBinarySearch) {
175         done = FALSE;
176         first_time = TRUE;
177         while (done != TRUE) {
178             for(c = 0; c < dim1; c++) {
179                 for(a = 0; a < dim2; a++) {
180                     if (private_matrix[c][a] == BLAST_SCORE_MIN) {
181                         matrix[c][a] = BLAST_SCORE_MIN;
182                     } else {
183                         matrix[c][a] = (factor*private_matrix[c][a]) /divFactor;
184                     }
185                 }
186             }
187 
188             this_sfp =  fillSfp(matrix, dim1, matrix_rescale->standardProb, scoreArray, return_sfp);
189             if (!this_sfp) {
190                 MemFree(scoreArray);
191                 MemFree(return_sfp);
192                 return FALSE;
193             }
194             new_lambda = impalaKarlinLambdaNR(this_sfp, matrix_rescale->kbp_psi[0]->Lambda/scalingFactor);
195 
196             if (new_lambda > lambda) {
197                 if (first_time) {
198                     factor_high = 1.0 + POSIT_PERCENT;
199                     factor = factor_high;
200                     factor_low = 1.0;
201                     too_high = TRUE;
202                     first_time = FALSE;
203                 } else {
204                     if (too_high == FALSE)
205                         break;
206                     factor_high += (factor_high-1.0);
207                     factor = factor_high;
208                 }
209             } else  {
210                 if (first_time) {
211                     factor_high = 1.0;
212                     factor_low = 1.0 - POSIT_PERCENT;
213                     factor = factor_low;
214                     too_high = FALSE;
215                     first_time = FALSE;
216                 } else {
217                     if (too_high == TRUE)
218                         break;
219                     factor_low += (factor_low-1.0);
220                     factor = factor_low;
221                 }
222             }
223         }
224 
225         /* binary search for ten times. */
226         for (index=0; index<POSIT_NUM_ITERATIONS; index++) {
227             factor = 0.5*(factor_high+factor_low);
228             for(c = 0; c < dim1; c++) {
229                 for(a = 0; a < dim2; a++) {
230                     if (private_matrix[c][a] == BLAST_SCORE_MIN) {
231                         matrix[c][a] = BLAST_SCORE_MIN;
232 		   } else {
233                        matrix[c][a] = (factor*private_matrix[c][a])/divFactor;
234 		   }
235                 }
236             }
237 
238             this_sfp =  fillSfp(matrix, dim1, matrix_rescale->standardProb, scoreArray, return_sfp);
239             if (!this_sfp) {
240                 MemFree(scoreArray);
241                 MemFree(return_sfp);
242                 return FALSE;
243             }
244             new_lambda = impalaKarlinLambdaNR(this_sfp, matrix_rescale->kbp_psi[0]->Lambda/scalingFactor);
245 
246             if (new_lambda > lambda) {
247                 factor_low = factor;
248             } else {
249                 factor_high = factor;
250             }
251         }
252     }
253 
254     for(c = 0; c < dim1; c++)
255         for(a = 0; a < dim2; a++) {
256             if (BLAST_SCORE_MIN != private_matrix[c][a]) {
257                 matrix[c][a] = Nlm_Nint((Nlm_FloatHi) private_matrix[c][a] *
258                                         factor / POSIT_SCALE_FACTOR);
259             }
260         }
261 
262     updateLambdaK(matrix_rescale, TRUE);
263 
264     scalefactor = ((Nlm_FloatHi) scalingFactor) / POSIT_SCALE_FACTOR;
265     for(c = 0; c < dim1; c++)
266         for(a = 0; a < dim2; a++) {
267             if (BLAST_SCORE_MIN != private_matrix[c][a]) {
268                 private_matrix[c][a] = Nlm_Nint((Nlm_FloatHi) private_matrix[c][a] *
269                                                 factor * scalefactor);
270             }
271         }
272 
273     for(a = 0; a < matrix_rescale->alphabet_size; a++) {
274         matrix[dim1][a] = BLAST_SCORE_MIN;
275         private_matrix[dim1][a] = BLAST_SCORE_MIN;
276     }
277 
278     MemFree(scoreArray);
279     MemFree(return_sfp);
280     return TRUE;
281 }
282 
impalaScaling(posSearchItems * posSearch,compactSearchItems * compactSearch,Nlm_FloatHi scalingFactor,Boolean doBinarySearch)283 Boolean LIBCALL impalaScaling(posSearchItems *posSearch, compactSearchItems * compactSearch, Nlm_FloatHi scalingFactor, Boolean doBinarySearch)
284 {
285     BlastMatrixRescalePtr matrix_rescale;
286     Boolean okay;   /* return value - whether impalaScaleMatrix succeeded */
287 
288     matrix_rescale = BlastMatrixRescaleNew(compactSearch->alphabetSize,
289                                            compactSearch->qlength,
290                                            compactSearch->query,
291                                            compactSearch->standardProb,
292                                            posSearch->posMatrix,
293                                            posSearch->posPrivateMatrix,
294                                            compactSearch->kbp_std,
295                                            compactSearch->kbp_psi,
296                                            compactSearch->kbp_gap_std,
297                                            compactSearch->kbp_gap_psi,
298                                            compactSearch->lambda_ideal,
299                                            compactSearch->K_ideal);
300 
301     okay = impalaScaleMatrix(matrix_rescale, scalingFactor, doBinarySearch);
302 
303     matrix_rescale = BlastMatrixRescaleDestruct(matrix_rescale);
304 
305     return okay;
306 }
307 
308 /*Some of the following checkpointing code is taken and adapted from
309   code written by K. Shriram for FASTLINK.
310   Reference:
311   A. A. Schaffer, S. K. Gupta, K. Shriram, and R. W. Cottingham, Jr.
312   Avoiding Recomputation in Linkage Analysis,
313   Human Heredity 44(1994), pp. 225-237. */
314 
315 
316 /* Front-ends to retrieve numbers. */
317 
318 #define  getCkptNlm_FloatHi(d, ckptFile)  (getCkptNumber(&(d),sizeof(Nlm_FloatHi),ckptFile))
319 #define  getCkptInt4(i, ckptFile)         (getCkptNumber(&(i),sizeof(Int4),ckptFile))
320 #define  getCkptChar(c, ckptFile)         (getCkptNumber(&(c),sizeof(Char),ckptFile))
321 
322 /*Read a checkpoint from the end of a previous PSI-BLAST round, get
323   query length, query, and position-specific target frequencies.
324   Returns TRUE if checkpoint was read sucessfully and FALSE otherwise.
325   scalingFactor records by how much to scale output scores */
326 
impalaReadCheckpoint(posSearchItems * posSearch,compactSearchItems * compactSearch,CharPtr fileName,ValNodePtr * error_return,Nlm_FloatHi scalingFactor)327 Boolean LIBCALL  impalaReadCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, ValNodePtr * error_return,
328                                       Nlm_FloatHi scalingFactor)
329 {
330     FILE * checkFile; /*file in which to take the checkpoint*/
331     Int4 length1, length2, c; /*length of query sequence, and an index for it*/
332     Char  nextRes; /*next residue in stored copy of the query sequence*/
333     Uint1Ptr oldQuery; /*array to hold the query sequence*/
334     Int4 i,j; /*indices for position and character in alphabet*/
335 
336     ErrPostEx(SEV_INFO, 0, 0, "Attempting to recover data from previous checkpoint");
337 
338     checkFile = FileOpen(fileName, "rb");
339 
340     if (NULL == checkFile) {
341         ErrPostEx(SEV_WARNING, 0, 0, "Could not open checkpoint file");
342         return(FALSE);
343     }
344 
345     length1 = compactSearch->qlength;
346     getCkptInt4(length2,checkFile);
347 
348     if (length1 != length2) {
349         ErrPostEx(SEV_WARNING, 0, 0, "Invalid usage of checkpoint recovery; old query has length %ld, new query has length %ld", (long) length2,  (long) length1);
350         ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
351         FileClose(checkFile);
352         return(FALSE);
353     }
354 
355     oldQuery = (Uint1Ptr) MemNew(length1 * sizeof(Uint1));
356 
357     if (NULL == oldQuery) {
358         ErrPostEx(SEV_WARNING, 0, 0, "Failed to reconstruct previous query");
359         ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
360         FileClose(checkFile);
361         return(FALSE);
362     }
363 
364     for(c = 0; c < length1; c++) {
365         getCkptChar(nextRes, checkFile);
366         oldQuery[c] = ResToInt(nextRes);
367         if ((oldQuery[c] != compactSearch->query[c]) && (oldQuery[c] != Xchar)) {
368             ErrPostEx(SEV_WARNING, 0, 0, "Stored query has a %c at position %ld, while new query has a %c there",getRes(oldQuery[c]), (long) c, getRes(compactSearch->query[c]));
369             ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
370             MemFree(oldQuery);
371             FileClose(checkFile);
372             return(FALSE);
373         }
374     }
375     posSearch->posMatrix = (BLAST_Score **) MemNew((length1 + 1) * sizeof(BLAST_Score *));
376     posSearch->posPrivateMatrix = (BLAST_Score **) MemNew((length1 + 1) * sizeof(BLAST_Score *));
377     posSearch->posFreqs = (Nlm_FloatHi **) MemNew((length1 + 1) * sizeof(Nlm_FloatHi *));
378     if ((NULL == posSearch->posMatrix) || (NULL == posSearch->posPrivateMatrix) || (NULL == posSearch->posFreqs)) {
379 
380         ErrPostEx(SEV_WARNING, 0, 0, "Failed to allocate position-specific score matrix");
381         ErrPostEx(SEV_WARNING, 0, 0, "Failed to recover data");
382         MemFree(oldQuery);
383         FileClose(checkFile);
384         return(FALSE);
385     }
386     for(i = 0; i <= length1; i++) {
387         posSearch->posMatrix[i] = (BLAST_Score *) MemNew(compactSearch->alphabetSize * sizeof(BLAST_Score));
388         posSearch->posPrivateMatrix[i] = (BLAST_Score *) MemNew(compactSearch->alphabetSize * sizeof(BLAST_Score));
389         posSearch->posFreqs[i] = (Nlm_FloatHi *) MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
390 
391         if ((NULL == posSearch->posMatrix[i]) || (NULL == posSearch->posPrivateMatrix[i]) || (NULL == posSearch->posFreqs[i])) {
392             ErrPostEx(SEV_ERROR, 0, 0, "Failed to allocate position-specific score matrix");
393             ErrPostEx(SEV_ERROR, 0, 0, "Failed to recover data\n");
394             MemFree(oldQuery);
395             FileClose(checkFile);
396             return(FALSE);
397         }
398         for(j = 0; j < compactSearch->alphabetSize; j++) {
399             posSearch->posFreqs[i][j] = 0.0;
400         }
401     }
402     getCkptFreqMatrix(posSearch->posFreqs,length1,compactSearch->alphabetSize,checkFile);
403     posFreqsToMatrix(posSearch,compactSearch);
404     impalaScaling(posSearch, compactSearch, scalingFactor, TRUE);
405 
406     ErrPostEx(SEV_INFO, 0, 0, "Data recovered successfully");
407 
408     MemFree(oldQuery);
409     FileClose(checkFile);
410     return(TRUE);
411 }
412 
413 
414 
415 
416