1 /* $Id: posit.h,v 6.33 2011/10/25 14:33:16 boratyng Exp $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 */
26 
27 /*****************************************************************************
28 
29 File name: posit.h
30 
31 Author: Alejandro Schaffer
32 
33 Contents: header file for position-based BLAST.
34 
35 $Revision: 6.33 $
36 
37 *****************************************************************************/
38 /*
39 * $Log: posit.h,v $
40 * Revision 6.33  2011/10/25 14:33:16  boratyng
41 * Fix for printing gapless column weights relative to pseudocounts in ascii pssm output JIRA SB-589
42 *
43 * Revision 6.32  2008/03/31 13:36:10  madden
44 * Implemented a new method to compute effective observations.
45 * Implemented a new entropy-based method to compute column-specific pseudocounts.
46 *
47 * Revision 6.31  2007/01/22 19:20:55  camacho
48 * From Alejandro Schaffer:
49 * In posPurgeMatches, when in command-line mode, added a warning for the
50 * situation in which only the query is used to construct the PSSM.
51 *
52 * Revision 6.30  2006/09/18 17:49:06  papadopo
53 * increase size of PROTEIN_ALPHABET
54 *
55 * Revision 6.29  2004/10/12 15:06:57  papadopo
56 * 1. Modify residue frequency IO to comply with new scoremat spec
57 * 2. Remove check that residue frequencies read from scoremat are <= 1.0
58 * 3. Pass gap open and gap extend penalties into BposComputation and
59 * 	CposComputation, so that scoremats can contain them
60 *
61 * Revision 6.28  2004/08/23 17:09:22  papadopo
62 * From Michael Gertz: move static arrays out of header and into the one file that needs them
63 *
64 * Revision 6.27  2004/07/19 17:13:13  papadopo
65 * add capability to perform input and output of residue frequencies in scoremat form; also call PSIMatrixFrequencyRatiosNew before restarting from checkpoint
66 *
67 * Revision 6.26  2004/06/23 14:53:29  camacho
68 * Copy renamed versions of SFreqRatios and its *{New,Free} functions to avoid
69 * dependency ncbitool -> blast
70 *
71 * Revision 6.25  2004/06/22 14:16:46  camacho
72 * Changed signature of posFreqsToMatrix, added use of SFreqRatios structure from
73 * algo/blast/core/ to obtain underlying matrices' frequency ratios.
74 * This change results in using the frequency ratios to provide the scores
75 * for the PSSM in columns where all residue frequencies are 0. Previously the
76 * standard scoring matrix was used.
77 *
78 * Revision 6.24  2004/05/14 12:13:09  camacho
79 * Made posDemographics non-static for testing purposes.
80 *
81 * Revision 6.23  2001/08/29 19:05:03  madden
82 * added parameter posComputationCalled in outputPosComputation
83 *
84 * Revision 6.22  2001/04/03 19:38:24  madden
85 * Changed IDENTITY_PERCENTAGE to 0.94, Added to output of -Q option in outputPosMatrix
86 *
87 * Revision 6.21  2001/01/03 01:49:38  bauer
88 * Changed from static to "LIBCALL":
89 *  posAllocateMemory
90 *  posPurgeMatches
91 *  posCancel
92 *  posComputeExtents
93 *  posComputeSequenceWeights
94 *  posCheckWeights
95 *  posComputePseudoFreqs
96 *  posScaling
97 *
98 * Revision 6.20  2000/12/29 00:39:00  hurwitz
99 * added ints for freeing of posSearchItems
100 *
101 * Revision 6.19  2000/11/13 14:00:39  madden
102 * Added frequency ratios for * in all standard matrices
103 *
104 * Revision 6.18  2000/11/09 14:27:52  madden
105 * psi-blast fixes for star character
106 *
107 * Revision 6.17  2000/10/10 21:46:31  shavirin
108 * Added support for BLOSUM50, BLOSUM90, PAM250 with -t T
109 * Changed frequency ratio for X from 0.707 to 0.750 to ensure that
110 * score is always -1.
111 *
112 * Revision 6.16  2000/07/31 16:41:02  shavirin
113 * Reduced POSIT_SCALE_FACTOR from 1000 to 200 to avoid overflow
114 * with BLOSUM80; moved declaration of POSIT_SCALE_FACTOR to posit.h
115 *
116 * Revision 6.15  2000/07/25 18:12:04  shavirin
117 * WARNING: This is a no-turning-back change related to S&W Blast from
118 * Alejandro Schaffer
119 *
120 * Revision 6.14  2000/07/07 21:20:08  vakatov
121 * Get all "#include" out of the 'extern "C" { }' scope!
122 *
123 * Revision 6.13  2000/04/29 21:49:05  kans
124 * removed bad characters on Mac
125 *
126 * Revision 6.12  1999/11/15 21:47:00  shavirin
127 * Added parameter Boolean use_best_align into compactSearchItems structure
128 *
129 * Revision 6.11  1999/10/21 16:15:04  shavirin
130 * Removed unused array and all references to array threshSequences
131 *
132 * Revision 6.10  1999/09/03 17:23:48  madden
133 * Eliminated use of posMaxThresh field in posSearchItems
134 *
135 * Revision 6.9  1999/08/04 13:27:11  madden
136 * Added -B option
137 *
138 * Revision 6.8  1999/03/21 19:40:30  madden
139 * Added 3rd argument matrixfp to definition of outputPosMatrix
140 *
141 * Revision 6.7  1999/03/17 16:49:11  madden
142 * Removed comment within comment
143 *
144 * Revision 6.6  1999/01/26 18:27:58  madden
145 * Made functions public for AS
146 *
147 * Revision 6.5  1998/09/28 12:31:32  madden
148 * Used BlastConstructErrorMessage
149 *
150  * Revision 6.3  1998/04/24 19:29:50  madden
151  * Added ideal values to compactSearch
152  *
153  * Revision 6.2  1998/03/25 22:36:19  egorov
154  * Change type of posRepeatSequences
155  *
156  * Revision 6.1  1997/12/23 21:07:08  madden
157  * Changes for checkpointing
158  *
159  * Revision 6.0  1997/08/25 18:53:52  madden
160  * Revision changed to 6.0
161  *
162  * Revision 1.11  1997/08/11 15:45:28  madden
163  * eliminated obsolete fields
164  *
165  * Revision 1.10  1997/06/25 14:04:51  madden
166  * prototype change
167  *
168  * Revision 1.9  1997/05/29 20:36:23  madden
169  * Add Boolean *posUseSequences
170  *
171  * Revision 1.8  1997/05/22 21:25:30  madden
172  * fixed memory leaks
173  *
174  * Revision 1.7  1997/05/16 20:10:10  madden
175  * Added BLAST_Score **posPrivateMatrix
176  *
177  * Revision 1.6  1997/05/01 15:53:27  madden
178  * Addition of extra KarlinBlk's for psi-blast
179  *
180  * Revision 1.5  1997/04/22  16:36:49  madden
181  * Changes for use of psi-blast with www.
182  *
183  * Revision 1.4  1997/04/10  19:25:53  madden
184  * COMMAND_LINE replaced by ALL_ROUNDS, Char to Int1.
185  *
186  * Revision 1.3  1997/04/09  20:01:53  madden
187  * Functions CposComputation and WposComputation replace posComputations.
188  *
189  * Revision 1.2  1997/04/04  20:44:55  madden
190  * Changed posComputation to return Int4Ptr *.
191  *
192  * Revision 1.1  1997/02/13  15:22:13  madden
193  * Initial revision
194  *
195 */
196 
197 #ifndef __POSIT__
198 #define __POSIT__
199 
200 #include <ncbi.h>
201 #include <math.h>
202 #include <blast.h>
203 #include <blastdef.h>
204 
205 #ifdef __cplusplus
206 extern "C" {
207 #endif
208 
209 #define charsPerLine 20 /* Number of characters of a sequence to print
210                           per line for score matrix */
211 
212 #define ALIGN_LINE_LENGTH 8192 /* NOTE(review): presumably a buffer size for one alignment line; its use is not visible in this header */
213 #define ALIGN_CHARS_PER_LINE 70 /* NOTE(review): presumably alignment characters printed per output line; confirm in the implementation */
214 
215 #define UNUSED (-1) /* negative sentinel; NOTE(review): where it is stored is not visible in this header */
216 
217 #define Xchar   21    /* code of the character used for low-complexity columns */
218 #define StarChar   25    /* code of the character used for stop codons */
219 
220 #define ALL_ROUNDS 1 /* do all rounds without interruption */
221 
222 /* Front-ends to retrieve numbers. */
223 /* Each macro passes the address of its first argument, the size of the named type, and ckptFile on to getCkptNumber(). */
224 #define  getCkptNlm_FloatHi(d, ckptFile)  (getCkptNumber(&(d),sizeof(Nlm_FloatHi),ckptFile)) /* d must be an lvalue (its address is taken); presumably an Nlm_FloatHi so the size matches */
225 #define  getCkptInt4(i, ckptFile)         (getCkptNumber(&(i),sizeof(Int4),ckptFile)) /* i must be an lvalue; presumably an Int4 so the size matches */
226 #define  getCkptChar(c, ckptFile)         (getCkptNumber(&(c),sizeof(Char),ckptFile)) /* c must be an lvalue; presumably a Char so the size matches */
227 
228 
229 /****************************************************************************/
230 /* PLEASE NOTE: The following structure and the PSIMatrixFrequencyRatios*
231  * functions have been copied and renamed from
232  * algo/blast/core/matrix_freq_ratios.[hc] to eliminate a dependency from the
233  * ncbitool library to the blast library.
234  */
235 
236 /** Stores the frequency ratios along with their bit scale factor; this is the type returned by PSIMatrixFrequencyRatiosNew and accepted by PSIMatrixFrequencyRatiosFree */
237 typedef struct FreqRatios {
238 
239     /** The actual frequency ratios (NOTE(review): the matrix dimensions are not stated in this header; presumably alphabet size by alphabet size -- confirm in the implementation) */
240     double**   data;
241 
242     /** Used to multiply the values in the above matrix to obtain scores in bit
243      * units */
244     int        bit_scale_factor;
245 
246 } FreqRatios;
247 
248 /** Retrieve the frequency ratios of the named scoring matrix.
249  * @param matrix_name Name of the scoring matrix. Available options include:
250  *          BLOSUM62
251  *          BLOSUM62_20
252  *          BLOSUM62_20A
253  *          BLOSUM62_20B
254  *          BLOSUM45
255  *          BLOSUM80
256  *          BLOSUM50
257  *          BLOSUM90
258  *          PAM30
259  *          PAM70
260  *          PAM250
261  * @return the frequency ratios structure, or NULL on error (NOTE(review): presumably to be released with PSIMatrixFrequencyRatiosFree -- confirm)
262  */
263 FreqRatios*
264 PSIMatrixFrequencyRatiosNew(const char* matrix_name);
265 
266 /** Deallocate the frequency ratios structure passed in freq_ratios. NOTE(review): the meaning of the returned pointer is not documented here; presumably NULL -- confirm in the implementation. */
267 FreqRatios*
268 PSIMatrixFrequencyRatiosFree(FreqRatios* freq_ratios);
269 
270 /* END of copied code */
271 /****************************************************************************/
272 
273 typedef struct posDesc { /* descriptor type used for the elements of posDescMatrix and posExtents in posSearchItems */
274   Int1 letter;  /* what is the preferred letter here */
275   Boolean used;  /* is there any letter here */
276   Nlm_FloatHi e_value; /* score of highest hsp including this position (NOTE(review): the field name suggests an e-value rather than a raw score -- confirm) */
277   Int4 leftExtent; /* How far left do same sequences match? */
278   Int4 rightExtent; /* How far right do same sequences match? */
279 } posDesc;
280 
281 typedef struct posSearchItems { /* working data for the position-based computations; passed to most pos* functions declared in this header */
282   Int4 *posCount; /* count of how many sequences match at
283                   each query position, default value is 1 to
284                   include query */
285   Int4 **posC; /* position-specific occurrence counts */
286   Int4 **posDistinctDistrib; /* For position i, how many positions in its block
287                                have j distinct letters */
288   Int4 *posNumParticipating; /* number of sequences at each position */
289   Nlm_FloatHi **posMatchWeights; /* NOTE(review): undocumented; presumably weighted match counts per query position and letter -- confirm */
290   BLAST_Score **posMatrix; /* NOTE(review): presumably the position-specific score matrix -- confirm */
291   BLAST_Score **posPrivateMatrix; /* added in revision 1.7; NOTE(review): relationship to posMatrix is not documented here */
292   Nlm_FloatHi **posFreqs; /* NOTE(review): presumably residue frequencies sized like allocatePosFreqs(length, alphabetSize) -- confirm */
293   Nlm_FloatHi *posGaplessColumnWeights; /* gapless column weights; see the revision 6.33 log entry on printing them in the ascii pssm output */
294   Nlm_FloatHi *pseudoWeights; /* pseudo count constant for each query position */
295   Int4 posNumSequences;
296   Int4 posResultsCounter;
297   Int4 *posResultSequences;
298   Nlm_FloatHi *posA;
299   Nlm_FloatHi *posRowSigma;
300   Int4 posDescMatrixLength;	/* Length of posDescMatrix, for deallocation. */
301   posDesc **posDescMatrix;
302   posDesc *posExtents;
303   Nlm_FloatHi *posSigma;
304   Int4 *posIntervalSizes;  /* interval size used for this column */
305   Int2Ptr posRepeatSequences;
306   Boolean *posUseSequences;
307   Nlm_FloatHi *posInformation;
308   Int4 QuerySize; /* NOTE(review): presumably one of the ints added in revision 6.20 for freeing of posSearchItems -- confirm */
309   Int4 NumSequences; /* NOTE(review): presumably one of the ints added in revision 6.20 for freeing of posSearchItems -- confirm */
310   FreqRatios* stdFreqRatios; /* underlying scoring matrix's frequency ratios */
311 } posSearchItems;
312 
313 typedef struct compactSearchItems { /* search parameters handed to the pos* functions; NOTE(review): presumably filled from a BlastSearchBlk by copySearchItems -- confirm */
314     Uint1Ptr  query; /* NOTE(review): presumably the encoded query residues -- confirm */
315     Int4 qlength; /* NOTE(review): presumably the number of residues in query -- confirm */
316     Boolean gapped_calculation;
317     Int4 alphabetSize;
318     Int4 pseudoCountConst;
319     Nlm_FloatHi ethresh;
320     Nlm_FloatHi lambda;
321     Nlm_FloatHi *standardProb;
322     Int4Ptr  *matrix;
323     Char standardMatrixName[50]; /* fixed-size buffer of 50 Chars */
324     BLAST_KarlinBlkPtr *kbp_std, *kbp_psi, *kbp_gap_std, *kbp_gap_psi; /* see the revision 1.6 log entry: extra KarlinBlk's for psi-blast */
325     Nlm_FloatHi	lambda_ideal, K_ideal; /* ideal values, added in revision 6.3 */
326     Boolean use_best_align; /* added in revision 6.12 */
327     Int4 currentPass;
328     Int4 maximumPass;
329     Nlm_FloatHi standardProbWeight; /* weight of standard probs for
330 				      column-specific pseudocounts */
331     Nlm_FloatHi HmethodNumerator;
332     Nlm_FloatHi HmethodDenominator;
333     Char queryFileName[100]; /* fixed-size buffer of 100 Chars */
334 } compactSearchItems;
335 
336 /* NOTE(review): presumably prints the position matrix to matrixfp (third argument, added in revision 6.8) -- confirm; posComputationCalled was added in revision 6.23. */
337 void LIBCALL outputPosMatrix PROTO((posSearchItems *posSearch, compactSearchItems * compactSearch, FILE *matrixfp, Boolean posComputationCalled));
338 /* gap_open and gap_extend are passed in so that scoremats can contain them (revision 6.29); problems are presumably reported through error_return -- confirm. */
339 Int4Ptr * LIBCALL CposComputation PROTO((posSearchItems *posSearch, BlastSearchBlkPtr search, compactSearchItems * compactSearch, SeqAlignPtr listOfSeqAligns, Char *ckptFileName, Boolean patternSearchStart, Int4 scorematOutput, Bioseq *query_bsp, Int4 gap_open, Int4 gap_extend, ValNodePtr * error_return,
340  Nlm_FloatHi weightExponent));
341 /* Introduced together with CposComputation as a replacement for posComputations (revision 1.3); unlike CposComputation it takes posFreqs as an explicit argument and no posSearchItems. */
342 Int4Ptr * LIBCALL WposComputation PROTO((compactSearchItems *compactSearch, SeqAlignPtr listOfSeqAligns, Nlm_FloatHi **posFreqs));
343 /* gap_open and gap_extend are passed in so that scoremats can contain them (revision 6.29); takes two checkpoint file names and no list of SeqAligns. */
344 Int4Ptr * LIBCALL BposComputation PROTO((posSearchItems *posSearch, BlastSearchBlkPtr search, compactSearchItems * compactSearch, Char *ckptFileName, Char *takeCkptFileName, Int4 scorematOutput, Bioseq *query_bsp, Int4 gap_open, Int4 gap_extend, ValNodePtr * error_return));
345 /* NOTE(review): presumably reports per-position information for pass passNum -- confirm in the implementation. */
346 void LIBCALL posPrintInformation PROTO((posSearchItems *posSearch, BlastSearchBlkPtr search, Int4 passNum));
347 /* NOTE(review): presumably sets up the posInformation data of posSearch -- confirm. */
348 void LIBCALL posInitializeInformation PROTO((posSearchItems *posSearch, BlastSearchBlkPtr search));
349 /* NOTE(review): presumably the counterpart of posInitializeInformation -- confirm. */
350 void LIBCALL posFreeInformation PROTO((posSearchItems *posSearch));
351 /* NOTE(review): the convergence criterion is not documented in this header; see the implementation. */
352 void LIBCALL posConvergenceTest PROTO((posSearchItems *posSearch, BlastSearchBlkPtr search, SeqAlignPtr listOfSeqAligns, Int4 thisPassNum));
353 /* Changed from static to LIBCALL in revision 6.21. */
354 void LIBCALL posCancel(posSearchItems *posSearch, compactSearchItems * compactSearch, Int4 first, Int4 second, Int4 matchStart, Int4 intervalLength);
355 /* Changed from static to LIBCALL in revision 6.21; since revision 6.31 it warns, in command-line mode, when only the query is used to construct the PSSM. */
356 void LIBCALL posPurgeMatches(posSearchItems *posSearch, compactSearchItems * compactSearch, ValNodePtr * error_return);
357 /* Made non-static in revision 6.24 for testing purposes. */
358 void LIBCALL posDemographics(posSearchItems *posSearch,
359                              compactSearchItems * compactSearch,
360                              SeqAlignPtr listOfSeqAligns);
361 
362 /* Cleanup position-specific data structures after one pass */
363 void LIBCALL posCleanup PROTO((posSearchItems *posSearch, compactSearchItems * compactSearch));
364 /* NOTE(review): presumably fills compactSearch from search and matrixName -- confirm in the implementation. */
365 void LIBCALL copySearchItems(compactSearchItems * compactSearch, BlastSearchBlkPtr search, Char * matrixName);
366 /* Takes and returns a compactSearchItems pointer; NOTE(review): ownership of the returned pointer is not documented here. */
367 compactSearchItems * LIBCALL compactSearchNew(compactSearchItems * compactSearch);
368 /* NOTE(review): presumably releases what compactSearchNew set up -- confirm. */
369 void LIBCALL compactSearchDestruct(compactSearchItems * compactSearch);
370 /* NOTE(review): presumably writes a checkpoint to fileName and reports problems through error_return; the meaning of the Boolean result is not documented here. */
371 Boolean LIBCALL posTakeCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, ValNodePtr * error_return);
372 /* gap_open and gap_extend are passed so that scoremats can contain them (revision 6.29); scorematOutput is presumably one of the *_SCOREMAT constants of this header -- confirm. */
373 Boolean LIBCALL posTakeScoremat(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, Int4 scorematOutput, Bioseq *query_bsp, Int4 gap_open, Int4 gap_extend, ValNodePtr * error_return);
374 /* NOTE(review): presumably the reading counterpart of posTakeCheckpoint; ScorematInput is presumably one of the *_SCOREMAT constants -- confirm. */
375 Boolean LIBCALL posReadCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, Int4 ScorematInput, ValNodePtr * error_return);
376 /* Changed from static to LIBCALL in revision 6.21. */
377 void LIBCALL posAllocateMemory(posSearchItems * posSearch, Int4 alphabetSize, Int4 querySize, Int4 numSequences);
378 /* NOTE(review): presumably frees the memory used when working from a checkpoint -- confirm. */
379 void LIBCALL posCheckpointFreeMemory(posSearchItems *posSearch, Int4 querySize);
380 /* Changed from static to LIBCALL in revision 6.21. */
381 void LIBCALL posComputeExtents(posSearchItems *posSearch, compactSearchItems * compactSearch);
382 /* Changed from static to LIBCALL in revision 6.21. */
383 void LIBCALL posComputeSequenceWeights(posSearchItems *posSearch, compactSearchItems * compactSearch, Nlm_FloatHi weightExponent);
384 /* Changed from static to LIBCALL in revision 6.21. */
385 void LIBCALL posCheckWeights(posSearchItems *posSearch, compactSearchItems * compactSearch);
386 /* Signature changed in revision 6.25; per that log entry, columns whose residue frequencies are all 0 are scored from the frequency ratios. */
387 void LIBCALL posFreqsToMatrix(posSearchItems *posSearch, compactSearchItems *compactSearch);
388 /* Maps a Char to a Uint1; NOTE(review): the encoding used is not documented in this header. */
389 Uint1 LIBCALL ResToInt(Char input);
390 /* NOTE(review): presumably reads a length-by-width matrix from ckptFile into theMatrix -- confirm. */
391 void    LIBCALL getCkptFreqMatrix (Nlm_FloatHi ** theMatrix, Int4 length, Int4 width, FILE * ckptFile);
392 /* Back-end of the getCkpt* macros in this header, which pass the address of a variable, the size of its type, and the checkpoint file. */
393 void  LIBCALL getCkptNumber(void * numberPtr, Int4 numberSize, FILE * ckptFile );
394 /* NOTE(review): presumably copies qlength-by-alphabetSize values from posFreqsFrom to posFreqsTo -- confirm. */
395 void LIBCALL copyPosFreqs(Nlm_FloatHi **posFreqsFrom, Nlm_FloatHi **posFreqsTo, Int4 qlength, Int4 alphabetSize);
396 /* NOTE(review): presumably returns a newly allocated length-by-alphabetSize matrix; how it is released is not documented here. */
397 Nlm_FloatHi ** LIBCALL allocatePosFreqs(Int4 length, Int4 alphabetSize);
398 /* Changed from static to LIBCALL in revision 6.21. */
399 Nlm_FloatHi ** LIBCALL posComputePseudoFreqs(posSearchItems *posSearch, compactSearchItems * compactSearch, Boolean Cpos);
400 /* Changed from static to LIBCALL in revision 6.21. */
401 void LIBCALL posScaling(posSearchItems *posSearch, compactSearchItems * compactSearch);
402 
403 
404 #define PROTEIN_ALPHABET 28 /* size of the protein alphabet (increased in revision 6.30) */
405 
406 #define POSIT_SCALE_FACTOR 200 /* reduced from 1000 in revision 6.16 to avoid overflow with BLOSUM80 */
407 
408 #define NO_SCOREMAT_IO 0 /* NOTE(review): the three *_SCOREMAT_IO and *_SCOREMAT values presumably select the mode passed as scorematOutput or ScorematInput -- confirm */
409 #define ASCII_SCOREMAT 1
410 #define BINARY_SCOREMAT 2
411 
412 
413 #ifdef __cplusplus
414 
415 }
416 #endif
417 
418 #endif /* __POSIT__ */
419 
420