1 /* $Id: cddposutil.c,v 1.22 2007/03/13 20:38:05 madden Exp $
2 *===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: cddposutil.c
27 *
28 * Author: Aron Marchler-Bauer
29 *
30 * Initial Version Creation Date: 12/21/1999
31 *
32 * $Revision: 1.22 $
33 *
34 * File Description: CDD utilities involving position-specific scoring
35 * matrices (PSSMs)
36 *
37 * Modifications:
38 * --------------------------------------------------------------------------
39 * $Log: cddposutil.c,v $
40 * Revision 1.22 2007/03/13 20:38:05 madden
41 * - In CddSetUpSearchInternalByLoc, don't cast DROPOFF_NUMBER_OF_BITS
42 * to integer.
43 *
44 * - In CddSetUpSearchWithReadDb, use integer division to compute dblen
45 * to avoid possible truncation of an Int8 value.
46 * [From Mike Gertz]
47 *
48 * Revision 1.21 2006/09/18 20:54:53 kans
49 * removed PROTEIN_ALPHABET - collides with new version in posit.h
50 *
51 * Revision 1.20 2005/10/22 02:04:53 thiessen
52 * plug memory leak
53 *
54 * Revision 1.19 2005/08/31 20:34:47 coulouri
55 * From Mike Gertz:
56 * in CddSetUpSearchWithReadDb:
57 * Call BlastSingleQueryResultSize to get the hitlist size for
58 * preliminary alignments as is done for other modes of blast; this
59 * will increase the hitlist size for either modes with a final
60 * traceback computation or modes that use composition-based
61 * statistics.
62 *
63 * Revision 1.18 2004/03/31 17:58:51 papadopo
64 * Mike Gertz' changes for length adjustment calculations
65 *
66 * Revision 1.17 2004/01/22 15:43:57 bauer
67 * call BlastCalculateEffectiveLengths in CddSetUpSearchInternalByLoc
68 *
69 * Revision 1.16 2003/05/29 13:17:56 thiessen
70 * fix memory leak in CddposFreeMemory()
71 *
72 * Revision 1.15 2001/12/31 13:47:06 bauer
73 * made block_width a local variable in CddSetUpSearchInternalByLoc to deal with changes in blast data structures
74 *
75 * Revision 1.14 2001/02/06 20:54:43 hurwitz
76 * added a couple asserts
77 *
78 * Revision 1.13 2001/01/19 23:34:55 bauer
79 * fixed memory leaks
80 *
81 * Revision 1.12 2001/01/17 20:21:54 bauer
82 * merged in changes for PSSM calculation
83 *
84 * Revision 1.11 2001/01/03 11:11:49 bauer
85 * renamed posCancel to CddposCancel
86 *
87 * Revision 1.10 2000/12/29 00:52:21 hurwitz
88 * cleaning memory leaks
89 *
90 * Revision 1.9 2000/10/26 01:55:54 bauer
91 * renamed BLASTSetUpSearchInternalByLoc to CddSetUpSearchInternalByLoc
92 *
93 * Revision 1.8 2000/09/08 21:43:51 hurwitz
94 * adding PSSM calculation to DDE
95 *
96 * Revision 1.7 2000/08/17 19:00:24 hurwitz
97 * fixed some codewarrior compiler warnings
98 *
99 * Revision 1.6 2000/08/10 22:31:19 bauer
100 * CddposTakeCheckpoint now declared to return Boolean
101 *
102 * Revision 1.5 2000/08/09 21:29:08 hurwitz
103 * adding cddutil.c to VC++ build
104 *
105 * Revision 1.4 2000/08/08 21:18:58 bauer
106 * changed POSIT_SCALE_FACTOR from 1000 to 200
107 *
108 * Revision 1.3 2000/07/28 18:01:59 bauer
109 * fixed typecasts
110 *
111 * Revision 1.2 2000/07/19 19:36:22 bauer
112 * added modification logging
113 *
114 *
115 * ==========================================================================
116 */
117
118
119 #include <ncbi.h>
120 #include <blastpri.h>
121 #include <objcode.h>
122 #include <objseq.h>
123 #include <objsset.h>
124 #include <sequtil.h>
125 #include <posit.h>
126 #include <txalign.h>
127 #include "cddposutil.h"
128 #include <blast.h>
129 #include <objloc.h>
130 #include <accid1.h>
131 #include <cddutil.h>
132 #include <mblast.h>
133 #include <profiles.h>
134 /* #include <blastdef.h> */
135
136 #define GAP_CHAR 0
137 #define GAP_HERE -1
138 #define POS_RESTING 0
139 #define POS_COUNTING 1
140 #define IDENTITY_RATIO 0.98
141 #define posEpsilon 0.0001
142 #define posEpsilon2 0.0000001
143 #define POSIT_SCALE_FACTOR2 200
144 #define EFFECTIVE_ALPHABET 20
145
146 /* function definitions */
147 static Uint1Ptr LIBCALL CddGetSequenceWithDenseDiag(DenseDiagPtr ddp, Boolean query,
148 Int4Ptr start, Int4Ptr stop);
149 static Uint1Ptr LIBCALL CddGetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query,
150 Int4Ptr start, Int4Ptr length);
151 static void LIBCALL CddposCancel(posSearchItems *posSearch, compactSearchItems * compactSearch,
152 Int4 first, Int4 second, Int4 matchStart, Int4 intervalLength);
153 static Nlm_FloatHi ** LIBCALL CddallocatePosFreqs(Int4 length, Int4 alphabetSize);
154 static Nlm_FloatHi LIBCALL countsFunction(Nlm_FloatHi Sigma, Int4 intervalLength);
155 static Nlm_FloatHi LIBCALL posit_rounddown(Nlm_FloatHi value);
156 static Int2 LIBCALL CddSetUpSearchInternalByLoc
157 (BlastSearchBlkPtr search, SeqLocPtr query_slp, BioseqPtr query_bsp,
158 CharPtr prog_name, Int4 qlen, BLAST_OptionsBlkPtr options,
159 int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)));
160 static void LIBCALL putCkptNumber(void * numberPtr, Int4 numberSize, FILE * ckptFile);
161 static void LIBCALL putFreqVector(Nlm_FloatHi * theVector, Int4 length, FILE * ckptFile);
162 static void LIBCALL putCkptFreqMatrix (Nlm_FloatHi **theMatrix, Int4 length, Int4 width, FILE * ckptFile);
163
164 /*---------------------------------------------------------------------------*/
165 /*Allocate memory for data structures inside posSearch used in */
166 /* position-specific caculations */
167 /* posSearch -- to be filled in */
168 /* alphabetSize -- number of distinct characters used in the sequences */
169 /* querySize -- number of characters in the query sequence */
170 /* numSequences -- number of matching sequences potentially in the model */
171 /*---------------------------------------------------------------------------*/
172
CddposAllocateMemory(posSearchItems * posSearch,Int4 alphabetSize,Int4 querySize,Int4 numSequences)173 void LIBCALL CddposAllocateMemory(posSearchItems * posSearch, Int4 alphabetSize, Int4 querySize, Int4 numSequences)
174 {
175 Int4 i, j; /*loop indices*/
176
177 posSearch->QuerySize = querySize;
178 posSearch->NumSequences = numSequences;
179
180 posSearch->posCount = (Int4 *) MemNew(querySize * sizeof(Int4));
181 if (NULL == posSearch->posCount)
182 exit(EXIT_FAILURE);
183 for(i = 0; i < querySize; i++)
184 posSearch->posCount[i] = 0;
185
186 posSearch->posC = (Int4 **) MemNew((querySize + 1) * sizeof(Int4 *));
187 if (NULL == posSearch->posC)
188 exit(EXIT_FAILURE);
189 for(i = 0; i <= querySize; i++) {
190 posSearch->posC[i] = (Int4 *) MemNew(alphabetSize * sizeof(Int4));
191 if (NULL == posSearch->posC[i])
192 exit(EXIT_FAILURE);
193 for(j = 0; j < alphabetSize; j++)
194 posSearch->posC[i][j] = 0;
195 }
196 posSearch->posMatchWeights = (Nlm_FloatHi **) MemNew((querySize+1) * sizeof(Nlm_FloatHi *));
197 if (NULL == posSearch->posMatchWeights)
198 exit(EXIT_FAILURE);
199 for (i = 0; i <= querySize ; i++) {
200 posSearch->posMatchWeights[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
201 if (NULL == posSearch->posMatchWeights[i])
202 exit(EXIT_FAILURE);
203 for(j = 0; j < alphabetSize; j++)
204 posSearch->posMatchWeights[i][j] = 0.0;
205 }
206 posSearch->posMatrix = (BLAST_Score **) MemNew((querySize + 1) * sizeof(BLAST_Score *));
207 posSearch->posPrivateMatrix = (BLAST_Score **) MemNew((querySize + 1) * sizeof(BLAST_Score *));
208 posSearch->posFreqs = (Nlm_FloatHi **) MemNew((querySize + 1) * sizeof(Nlm_FloatHi *));
209 if (NULL == posSearch->posMatrix) exit(EXIT_FAILURE);
210 if (NULL == posSearch->posPrivateMatrix) exit(EXIT_FAILURE);
211 if (NULL == posSearch->posFreqs) exit(EXIT_FAILURE);
212 for(i = 0; i <= querySize; i++) {
213 posSearch->posMatrix[i] = (BLAST_Score *) MemNew(alphabetSize * sizeof(BLAST_Score));
214 posSearch->posPrivateMatrix[i] = (BLAST_Score *) MemNew(alphabetSize * sizeof(BLAST_Score));
215 posSearch->posFreqs[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
216 if (NULL == posSearch->posMatrix[i]) exit(EXIT_FAILURE);
217 if (NULL == posSearch->posPrivateMatrix[i]) exit(EXIT_FAILURE);
218 if (NULL == posSearch->posFreqs[i]) exit(EXIT_FAILURE);
219 for(j = 0; j < alphabetSize; j++) {
220 posSearch->posMatrix[i][j] = 0;
221 posSearch->posFreqs[i][j] = 0;
222 }
223 }
224 posSearch->posSigma = (Nlm_FloatHi *) MemNew((querySize) * sizeof(Nlm_FloatHi));
225 if (NULL == posSearch->posSigma)
226 exit(EXIT_FAILURE);
227 for(i = 0; i < querySize; i++) {
228 posSearch->posSigma[i] = 0.0;
229 }
230 posSearch->posIntervalSizes = (Int4 *) MemNew((querySize) * sizeof(Int4));
231 if (NULL == posSearch->posIntervalSizes)
232 exit(EXIT_FAILURE);
233 for(i=0; i < querySize; i++)
234 posSearch->posIntervalSizes[i] = 0;
235 posSearch->posDescMatrixLength = numSequences;
236 posSearch->posDescMatrix = (posDesc **) MemNew((numSequences + 1) * sizeof(posDesc *));
237 if (NULL == posSearch->posDescMatrix)
238 exit(EXIT_FAILURE);
239 for (i = 0; i <= numSequences; i++) {
240 posSearch->posDescMatrix[i] = (posDesc *) MemNew(querySize * sizeof(posDesc));
241 if (NULL == posSearch->posDescMatrix[i])
242 exit(EXIT_FAILURE);
243 for(j = 0; j < querySize; j++) {
244 posSearch->posDescMatrix[i][j].letter = UNUSED;
245 posSearch->posDescMatrix[i][j].used = FALSE;
246 posSearch->posDescMatrix[i][j].e_value = 1.0;
247 posSearch->posDescMatrix[i][j].leftExtent = -1;
248 posSearch->posDescMatrix[i][j].rightExtent = querySize;
249 }
250 }
251 posSearch->posExtents = (posDesc *) MemNew(querySize * sizeof(posDesc));
252 if (NULL == posSearch->posExtents)
253 exit(EXIT_FAILURE);
254 for(j = 0; j < querySize; j++) {
255 posSearch->posExtents[j].used = FALSE;
256 posSearch->posExtents[j].leftExtent = -1;
257 posSearch->posExtents[j].rightExtent = querySize;
258 }
259 posSearch->posA = (Nlm_FloatHi *) MemNew((numSequences+ 1) * sizeof(Nlm_FloatHi));
260 if (NULL == posSearch->posA)
261 exit(EXIT_FAILURE);
262 posSearch->posRowSigma = (Nlm_FloatHi *) MemNew((numSequences + 1) * sizeof(Nlm_FloatHi));
263 if (NULL == posSearch->posRowSigma)
264 exit(EXIT_FAILURE);
265 }
266
267
CddposFreeMemory2(compactSearchItems * compactSearch)268 void LIBCALL CddposFreeMemory2(compactSearchItems * compactSearch)
269 {
270 MemFree(compactSearch->standardProb);
271 }
272
CddposFreeMemory(posSearchItems * posSearch)273 void LIBCALL CddposFreeMemory(posSearchItems * posSearch) {
274 /*---------------------------------------------------------------------------*/
275 /* free memory that was allocated in posAllocateMemory. */
276 /*---------------------------------------------------------------------------*/
277 Int4 i;
278
279 MemFree(posSearch->posCount);
280 for (i=0; i<=posSearch->QuerySize; i++) {
281 MemFree(posSearch->posMatchWeights[i]);
282 MemFree(posSearch->posFreqs[i]);
283 MemFree(posSearch->posMatrix[i]);
284 MemFree(posSearch->posPrivateMatrix[i]);
285 MemFree(posSearch->posC[i]);
286 }
287 MemFree(posSearch->posC);
288 MemFree(posSearch->posMatchWeights);
289 MemFree(posSearch->posMatrix);
290 MemFree(posSearch->posPrivateMatrix);
291 MemFree(posSearch->posFreqs);
292
293 for (i=0; i<=posSearch->posDescMatrixLength; i++) {
294 MemFree(posSearch->posDescMatrix[i]);
295 }
296 MemFree(posSearch->posDescMatrix);
297
298 MemFree(posSearch->posSigma);
299 MemFree(posSearch->posIntervalSizes);
300 MemFree(posSearch->posExtents);
301 MemFree(posSearch->posA);
302 MemFree(posSearch->posRowSigma);
303 MemFree(posSearch->posInformation);
304 MemFree(posSearch->posUseSequences);
305 MemFree(posSearch->posGaplessColumnWeights);
306
307 PSIMatrixFrequencyRatiosFree(posSearch->stdFreqRatios);
308 }
309
310
311 /*---------------------------------------------------------------------------*/
312 /* Copy a few fields from the lasrge record search into the small record */
313 /* compactSearch, so that a small amount of information */
314 /* is passed into posit.c */
315 /*---------------------------------------------------------------------------*/
CddcopySearchItems(compactSearchItems * compactSearch,BlastSearchBlkPtr search)316 void LIBCALL CddcopySearchItems(compactSearchItems *compactSearch,
317 BlastSearchBlkPtr search)
318 {
319 BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
320 Int4 a; /*index over characters*/
321
322 compactSearch->query = search->context[0].query->sequence;
323 compactSearch->qlength = search->context[0].query->length;
324 compactSearch->gapped_calculation = search->pbp->gapped_calculation;
325 compactSearch->alphabetSize = search->sbp->alphabet_size;
326 compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
327 compactSearch->ethresh = search->pbp->ethresh;
328 compactSearch->matrix = search->sbp->matrix;
329 compactSearch->kbp_psi = search->sbp->kbp_psi;
330 compactSearch->kbp_gap_psi = search->sbp->kbp_gap_psi;
331 compactSearch->kbp_std = search->sbp->kbp_std;
332 compactSearch->kbp_gap_std = search->sbp->kbp_gap_std;
333 compactSearch->use_best_align = search->pbp->use_best_align;
334
335 /*---------------------------------------------------------------------------*/
336 /* these are removed, not initialized in CddCposComputation */
337 /*---------------------------------------------------------------------------*/
338 /* compactSearch->lambda = search->sbp->kbp[0]->Lambda; */
339 /* compactSearch->lambda_ideal = search->sbp->kbp_ideal->Lambda; */
340 /* compactSearch->K_ideal = search->sbp->kbp_ideal->K; */
341
342 stdrfp = BlastResFreqNew(search->sbp);
343 BlastResFreqStdComp(search->sbp,stdrfp);
344 compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
345 if (NULL == compactSearch->standardProb)
346 exit(EXIT_FAILURE);
347 for(a = 0; a < compactSearch->alphabetSize; a++)
348 compactSearch->standardProb[a] = stdrfp->prob[a];
349 stdrfp = BlastResFreqDestruct(stdrfp);
350 }
351
352 /*---------------------------------------------------------------------------*/
353 /* - find all sequences, basically, for Cdd-specific purposes */
354 /*---------------------------------------------------------------------------*/
CddfindThreshSequences(posSearchItems * posSearch,SeqAlignPtr listOfSeqAligns,Int4 numalign,Int4 numseq)355 void LIBCALL CddfindThreshSequences(posSearchItems *posSearch, SeqAlignPtr listOfSeqAligns, Int4 numalign, Int4 numseq)
356 {
357 SeqAlignPtr curSeqAlign; /* pointers into list of seqAligns */
358 DenseSegPtr curSegs; /* Item in list of seqAligns */
359 SeqIdPtr thisId; /* Ids of target sequences in current SeqAlign */
360 Int4 ordinalNumber; /* index of sequence within database */
361
362 /* Allocate boolean array to store values */
363 posSearch->posResultSequences = (Int4 *) MemNew(numalign * sizeof(Int4));
364 posSearch->posResultsCounter = 0;
365
366 curSeqAlign = listOfSeqAligns;
367 ordinalNumber = -1;
368 while (curSeqAlign) {
369 curSegs = (DenseSegPtr) curSeqAlign->segs;
370 thisId = curSegs->ids->next;
371 /* Get ordinal ids of sequences in result */
372 ordinalNumber++;
373 posSearch->posResultSequences[posSearch->posResultsCounter]=ordinalNumber;
374 posSearch->posResultsCounter++;
375 curSeqAlign = curSeqAlign->next;
376 }
377 }
378
379 /*---------------------------------------------------------------------------*/
380 /* - find all sequences, basically, for Cdd-specific purposes */
381 /*---------------------------------------------------------------------------*/
CddfindDenseDiagThreshSequences(posSearchItems * posSearch,SeqAlignPtr listOfSeqAligns,Int4 numalign,Int4 numseq)382 void LIBCALL CddfindDenseDiagThreshSequences(posSearchItems *posSearch, SeqAlignPtr listOfSeqAligns, Int4 numalign, Int4 numseq)
383 {
384 SeqAlignPtr curSeqAlign; /* pointers into list of seqAligns */
385 DenseDiagPtr curSegs; /* Item in list of seqAligns */
386 SeqIdPtr thisId; /* Ids of target sequences in current SeqAlign */
387 Int4 ordinalNumber; /* index of sequence within database */
388
389 /* Allocate boolean array to store values */
390 posSearch->posResultSequences = (Int4 *) MemNew(numalign * sizeof(Int4));
391 posSearch->posResultsCounter = 0;
392
393 curSeqAlign = listOfSeqAligns;
394 ordinalNumber = -1;
395 while (curSeqAlign) {
396 curSegs = (DenseDiagPtr) curSeqAlign->segs;
397 thisId = curSegs->id->next;
398 /* Get ordinal ids of sequences in result */
399 ordinalNumber++;
400 posSearch->posResultSequences[posSearch->posResultsCounter]=ordinalNumber;
401 posSearch->posResultsCounter++;
402 curSeqAlign = curSeqAlign->next;
403 }
404 }
405
406
407 /*---------------------------------------------------------------------------*/
408 /*---------------------------------------------------------------------------*/
CddposInitializeInformation(posSearchItems * posSearch,BLAST_ScoreBlkPtr sbp,compactSearchItems * compactSearch)409 void LIBCALL CddposInitializeInformation(posSearchItems *posSearch, BLAST_ScoreBlkPtr sbp, compactSearchItems *compactSearch)
410 {
411 Uint1Ptr query;
412 Int4 querySize;
413 Int4 c, a, alphabetSize;
414 BLAST_ResFreqPtr stdrfp; /*standard frequencies*/
415 Nlm_FloatHi lambda;
416 Nlm_FloatHi term1, term2, term3, term4;
417 Nlm_FloatHi infoSum;
418
419 querySize = compactSearch->qlength;
420 query = compactSearch->query;
421 posSearch->posInformation = (Nlm_FloatHi *) MemNew(querySize * sizeof(Nlm_FloatHi));
422 if (NULL == posSearch->posInformation) exit(EXIT_FAILURE);
423 for(c = 0; c < querySize; c++)
424 posSearch->posInformation[c] = 0.0;
425 alphabetSize = compactSearch->alphabetSize;
426 /*Compute standard frequencies as in BlastScoreBlkFill in blastkar.c*/
427 stdrfp = BlastResFreqNew(sbp);
428 BlastResFreqStdComp(sbp,stdrfp);
429 lambda = sbp->kbp[0]->Lambda;
430 for(c = 0; c < querySize; c++) {
431 infoSum = 0;
432 for(a = 0; a < alphabetSize; a++)
433 if (stdrfp->prob[a] > posEpsilon) {
434 term1 = sbp->matrix[query[c]][a];
435 term2 = term1 * lambda;
436 term3 = exp(term2);
437 term4 = stdrfp->prob[a] * term3;
438 infoSum += term4 * log(term4/stdrfp->prob[a])/NCBIMATH_LN2;
439 }
440 posSearch->posInformation[c] = infoSum;
441 }
442 }
443
444
445 /*---------------------------------------------------------------------------*/
446 /* have added sequence retrieval via ID1 in case BioseqLockById fails */
447 /*---------------------------------------------------------------------------*/
CddGetSequenceWithDenseDiag(DenseDiagPtr ddp,Boolean query,Int4Ptr start,Int4Ptr stop)448 static Uint1Ptr LIBCALL CddGetSequenceWithDenseDiag(DenseDiagPtr ddp, Boolean query,
449 Int4Ptr start, Int4Ptr stop)
450 {
451 BioseqPtr bsp;
452 Int4 index, offset;
453 Int4 length;
454 SeqIdPtr id;
455 SeqPortPtr spp;
456 Uint1Ptr buffer;
457 DenseDiagPtr ddpThis;
458 SeqEntryPtr sep;
459 Int4 uid;
460 Int2 retcode = 1;
461
462 ddpThis = ddp;
463
464 if (ddpThis == NULL) return NULL;
465 if (query == TRUE) {
466 id = ddpThis->id;
467 } else {
468 offset = 1;
469 id = ddpThis->id->next;
470 }
471
472 *start = ddpThis->starts[offset];
473 *stop = 0;
474 while (ddpThis) {
475 *stop = ddpThis->starts[offset] + ddpThis->len - 1;
476 ddpThis = ddpThis->next;
477 }
478 bsp = BioseqLockById(id);
479 if (bsp) {
480 spp = SeqPortNew(bsp, *start, *stop, Seq_strand_unknown, Seq_code_ncbistdaa);
481 length = *stop - *start + 1;
482 buffer = MemNew((length)*sizeof(Uint1));
483 for (index=0; index<length; index++) buffer[index] = SeqPortGetResidue(spp);
484 spp = SeqPortFree(spp);
485 BioseqUnlock(bsp);
486 return buffer;
487 } else {
488 uid = ID1FindSeqId(id);
489 sep = (SeqEntryPtr) ID1SeqEntryGet(uid,retcode);
490 bsp = BioseqFindInSeqEntry(id,sep);
491 spp = SeqPortNew(bsp, *start, *stop, Seq_strand_unknown, Seq_code_ncbistdaa);
492 length = *stop - *start + 1;
493 buffer = MemNew((length)*sizeof(Uint1));
494 for (index=0; index<length; index++) buffer[index] = SeqPortGetResidue(spp);
495 spp = SeqPortFree(spp);
496 return buffer;
497 }
498 }
499
500 /*---------------------------------------------------------------------------*/
501 /*---------------------------------------------------------------------------*/
CddGetSequenceWithDenseSeg(DenseSegPtr dsp,Boolean query,Int4Ptr start,Int4Ptr length)502 static Uint1Ptr LIBCALL CddGetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query,
503 Int4Ptr start, Int4Ptr length)
504 {
505 BioseqPtr bsp;
506 Int4 index, offset;
507 SeqIdPtr id;
508 SeqPortPtr spp;
509 Uint1Ptr buffer;
510
511 if (dsp == NULL) return NULL;
512
513 if (query == TRUE) {
514 offset = 0;
515 id = dsp->ids;
516 } else {
517 offset = 1;
518 id = dsp->ids->next;
519 }
520 *start = dsp->starts[offset];
521 *length = 0;
522 for (index=0; index<dsp->numseg; index++) {
523 if (dsp->starts[offset+2*index] != -1) *length += dsp->lens[index];
524 }
525 bsp = BioseqLockById(id);
526 spp = SeqPortNew(bsp, *start, (*start)+(*length)-1, Seq_strand_unknown, Seq_code_ncbistdaa);
527 buffer = MemNew((*length)*sizeof(Uint1));
528 for (index=0; index<*length; index++) buffer[index] = SeqPortGetResidue(spp);
529 spp = SeqPortFree(spp);
530 BioseqUnlock(bsp);
531 return buffer;
532 }
533
534 /*---------------------------------------------------------------------------*/
535 /* Eliminate the matches from sequence second starting at position */
536 /* matchStart and extending for intervalLength characters */
537 /*---------------------------------------------------------------------------*/
CddposCancel(posSearchItems * posSearch,compactSearchItems * compactSearch,Int4 first,Int4 second,Int4 matchStart,Int4 intervalLength)538 static void LIBCALL CddposCancel(posSearchItems *posSearch, compactSearchItems * compactSearch,
539 Int4 first, Int4 second, Int4 matchStart, Int4 intervalLength)
540 {
541 Int4 c, i;
542 Boolean stillNeeded;
543
544 for(c = matchStart, i = 0; i < intervalLength; i++, c++) {
545 posSearch->posDescMatrix[second][c].used = FALSE;
546 posSearch->posDescMatrix[second][c].letter = 0;
547 }
548 stillNeeded = FALSE;
549 for (c = 0; c < compactSearch->qlength; c++)
550 if (posSearch->posDescMatrix[second][c].used) {
551 stillNeeded = TRUE;
552 break;
553 }
554 if (!stillNeeded)
555 posSearch->posUseSequences[second] = FALSE;
556 }
557
558
559
560 /*---------------------------------------------------------------------------*/
561 /*Eliminate sequences that are identical to the query and partial alignments
562 that are identical in two matching sequences*/
563 /*---------------------------------------------------------------------------*/
CddposPurgeMatches(posSearchItems * posSearch,compactSearchItems * compactSearch)564 void LIBCALL CddposPurgeMatches(posSearchItems *posSearch, compactSearchItems * compactSearch)
565 {
566 Int4 i, j; /*index over sequences*/
567 Boolean matchesQuery; /*Is a matching sequence identical to the query?*/
568 Int4 c; /*index over demographics of matching sequence*/
569 Int4 state; /*state of checking for a match*/
570 Int4 intervalLength, matchStart; /*Length and start of a matching region*/
571 Int4 matchNumber; /*number of characters matching*/
572
573 posSearch->posUseSequences = (Boolean *) MemNew((posSearch->posNumSequences + 1) * sizeof(Boolean));
574 if (NULL == posSearch->posUseSequences) exit(EXIT_FAILURE);
575 for(i = 0; i <= posSearch->posNumSequences; i++)
576 posSearch->posUseSequences[i] = TRUE;
577 for(i = 1; i <= posSearch->posNumSequences; i++) {
578 matchesQuery = TRUE;
579 for (c = 0; c < compactSearch->qlength; c++) {
580 if ((!posSearch->posDescMatrix[i][c].used) ||
581 (posSearch->posDescMatrix[i][c].letter !=
582 posSearch->posDescMatrix[0][c].letter)) {
583 matchesQuery = FALSE;
584 break;
585 }
586 }
587 if (matchesQuery) posSearch->posUseSequences[i] = FALSE;
588 }
589 for(j = 1; j <= posSearch->posNumSequences; j++) {
590 if (!posSearch->posUseSequences[j]) continue;
591 state = POS_COUNTING;
592 c = 0;
593 matchStart = 0;
594 intervalLength = 0;
595 matchNumber = 0;
596 while (c < compactSearch->qlength) {
597 if (posSearch->posDescMatrix[j][c].used) {
598 if ((posSearch->posDescMatrix[0][c].letter != Xchar) &&
599 (posSearch->posDescMatrix[j][c].letter != Xchar)) {
600 if (state == POS_RESTING) {
601 matchStart = c;
602 intervalLength = 1;
603 state = POS_COUNTING;
604 matchNumber = 0;
605 } else intervalLength++;
606 if (posSearch->posDescMatrix[j][c].used &&
607 (posSearch->posDescMatrix[0][c].letter == posSearch->posDescMatrix[j][c].letter))
608 matchNumber++;
609 }
610 } else {
611 if (state == POS_COUNTING) {
612 if ((intervalLength > 0) && (matchNumber == intervalLength))
613 CddposCancel(posSearch,compactSearch,0,j,matchStart,intervalLength);
614 state = POS_RESTING;
615 }
616 }
617 c++;
618 }
619 if (state == POS_COUNTING) /*at end of sequence i*/
620 if ((intervalLength > 0) && (matchNumber == intervalLength))
621 CddposCancel(posSearch,compactSearch,0,j,matchStart,intervalLength);
622 }
623 for (i = 1; i < posSearch->posNumSequences; i++) {
624 if (!posSearch->posUseSequences[i])
625 continue;
626 for(j = i+1; j <= posSearch->posNumSequences; j++) {
627 if (!posSearch->posUseSequences[j])
628 continue;
629 state = POS_COUNTING;
630 c = 0;
631 matchStart = 0;
632 intervalLength = 0;
633 matchNumber = 0;
634 while (c < compactSearch->qlength) {
635 if (posSearch->posDescMatrix[i][c].used ||
636 posSearch->posDescMatrix[j][c].used) {
637 if ((posSearch->posDescMatrix[i][c].letter != Xchar) &&
638 (posSearch->posDescMatrix[j][c].letter != Xchar)) {
639 if (state == POS_RESTING) {
640 matchStart = c;
641 intervalLength = 1;
642 state = POS_COUNTING;
643 matchNumber = 0;
644 }
645 else
646 intervalLength++;
647 if (posSearch->posDescMatrix[i][c].used &&
648 posSearch->posDescMatrix[j][c].used &&
649 (posSearch->posDescMatrix[i][c].letter == posSearch->posDescMatrix[j][c].letter))
650 matchNumber++;
651 }
652 }
653 else {
654 if (state == POS_COUNTING) {
655 if ((intervalLength > 0) && ((((Nlm_FloatHi) matchNumber)/intervalLength) >= IDENTITY_RATIO))
656 CddposCancel(posSearch,compactSearch,i,j,matchStart,intervalLength);
657 state = POS_RESTING;
658 }
659 }
660 c++;
661 }
662 if (state == POS_COUNTING) /*at end of sequence i*/
663 if ((intervalLength > 0) && ((((Nlm_FloatHi) matchNumber)/intervalLength) >= IDENTITY_RATIO))
664 CddposCancel(posSearch,compactSearch,i,j,matchStart,intervalLength);
665 }
666 }
667 }
668
669 /*---------------------------------------------------------------------------*/
670 /*---------------------------------------------------------------------------*/
CddposComputeExtents(posSearchItems * posSearch,compactSearchItems * compactSearch)671 void LIBCALL CddposComputeExtents(posSearchItems *posSearch, compactSearchItems * compactSearch)
672 {
673 Int4 seqIndex; /*index of sequence*/
674 Int4 length; /*length of query*/
675 Int4 qplace, qplace2; /*place in query*/
676 Int4 numseq; /*number of sequences including query*/
677 Uint1Ptr q; /*pointers into query */
678
679 length = compactSearch->qlength;
680 numseq = posSearch->posNumSequences;
681 q = compactSearch->query;
682 for(seqIndex = 0; seqIndex < numseq; seqIndex++) {
683 if (!posSearch->posUseSequences[seqIndex+1])
684 continue;
685 if ((posSearch->posDescMatrix[seqIndex+1][0].used)
686 && (posSearch->posDescMatrix[seqIndex+1][length-1].letter != GAP_CHAR))
687 posSearch->posDescMatrix[seqIndex+1][0].leftExtent = 0;
688 for(qplace = 1; qplace < length; qplace++)
689 if(posSearch->posDescMatrix[seqIndex+1][qplace].used) {
690 if(posSearch->posDescMatrix[seqIndex+1][qplace-1].used)
691 posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent =
692 posSearch->posDescMatrix[seqIndex+1][qplace -1].leftExtent;
693 else
694 posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent = qplace;
695 }
696 if ((posSearch->posDescMatrix[seqIndex+1][length-1].used)
697 && (posSearch->posDescMatrix[seqIndex+1][length-1].letter != GAP_CHAR))
698 posSearch->posDescMatrix[seqIndex+1][length-1].rightExtent = length -1;
699 for(qplace = length -2; qplace >= 0; qplace--)
700 if(posSearch->posDescMatrix[seqIndex+1][qplace].used) {
701 if(posSearch->posDescMatrix[seqIndex+1][qplace+1].used)
702 posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent =
703 posSearch->posDescMatrix[seqIndex+1][qplace + 1].rightExtent;
704 else
705 posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent = qplace;
706 }
707 for(qplace = 0; qplace < length; qplace++)
708 if (posSearch->posDescMatrix[seqIndex+1][qplace].used) {
709 posSearch->posExtents[qplace].leftExtent = MAX(posSearch->posExtents[qplace].leftExtent,
710 posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent);
711 posSearch->posExtents[qplace].rightExtent = MIN(posSearch->posExtents[qplace].rightExtent,
712 posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent);
713
714 }
715
716 for(qplace = 0; qplace < length; qplace++)
717 /*used to check qplace for GAP_CHAR here*/
718 if (posSearch->posDescMatrix[seqIndex+1][qplace].used) {
719 posSearch->posC[qplace][posSearch->posDescMatrix[seqIndex+1][qplace].letter]++;
720 posSearch->posCount[qplace]++; /*Add to number of matches in this query position*/
721 }
722 }
723 for(qplace = 0; qplace < length; qplace++)
724 posSearch->posIntervalSizes[qplace] = posSearch->posExtents[qplace].rightExtent -
725 posSearch->posExtents[qplace].leftExtent + 1;
726 for(qplace =0; qplace < length; qplace++) {
727 if(Xchar == q[qplace]) {
728 posSearch->posIntervalSizes[qplace] = 0;
729 for(qplace2 = 0; qplace2 <qplace; qplace2++) {
730 if(posSearch->posExtents[qplace2].rightExtent >= qplace)
731 posSearch->posIntervalSizes[qplace2]--;
732 }
733 }
734 }
735 }
736
737 /*---------------------------------------------------------------------------*/
738 /*Compute weight of each sequence and letter in each position*/
739 /*---------------------------------------------------------------------------*/
CddposComputeSequenceWeights(posSearchItems * posSearch,compactSearchItems * compactSearch)740 void LIBCALL CddposComputeSequenceWeights(posSearchItems *posSearch,
741 compactSearchItems * compactSearch)
742 {
743 Int4 length; /*length of query*/
744 Int4 numseq, seqIndex; /*number of matches, index for them*/
745 Int4 i; /*index over a multi-alignment block*/
746 Int4 qplace; /*index into query*/
747 Nlm_FloatHi Sigma; /*Number of different characters occurring in matches
748 within a multi-alignment block, excluding identical columns*/
749 Nlm_FloatHi intervalSigma; /*Same as Sigma but includes identical columns*/
750 Int4 alphabetSize; /*number of characters in alphabet*/
751 Int4 *participatingSequences; /*array of part. seq. at a position*/
752 Int4 *oldParticipatingSequences;/*array of part. seq. at a position*/
753 Int4 posLocalVariety;/*number of different characters at a position*/
754 Int4 *posLocalC; /*counts of how many of each letter in this column*/
755 Int4 c;
756 Int4 thisSeq;
757 Int4 numParticipating; /*number of seq. in this alignment block*/
758 Int4 oldNumParticipating; /*number of seq. in this alignment block*/
759 Boolean newSequenceSet;
760 Int4 p; /*index on sequences*/
761
762 alphabetSize = compactSearch->alphabetSize;
763 length = compactSearch->qlength;
764 numseq = posSearch->posNumSequences;
765 participatingSequences = (Int4 *) MemNew((numseq+1) * sizeof(Int4));
766 if (NULL == participatingSequences)
767 exit(EXIT_FAILURE);
768 oldParticipatingSequences = (Int4 *) MemNew((numseq+1) * sizeof(Int4));
769 if (NULL == oldParticipatingSequences)
770 exit(EXIT_FAILURE);
771 posLocalC = (Int4 *) MemNew(alphabetSize * sizeof(Int4));
772 if (NULL == posLocalC)
773 exit(EXIT_FAILURE);
774 for (qplace = 0; qplace < length; qplace++) {
775 posSearch->posSigma[qplace] = 0.0;
776 }
777 numParticipating = 0;
778 for(qplace = 0; qplace < length; qplace++) {
779 if ((posSearch->posCount[qplace] > 1) && (posSearch->posIntervalSizes[qplace] > 0)) {
780 oldNumParticipating = numParticipating;
781 for(p =0; p < numParticipating; p++)
782 oldParticipatingSequences[p] = participatingSequences[p];
783 numParticipating = 0;
784 for (seqIndex = 0; seqIndex <= numseq; seqIndex++) {
785 if (!posSearch->posUseSequences[seqIndex]) continue;
786 if ((posSearch->posDescMatrix[seqIndex][qplace].used) &&
787 (posSearch->posDescMatrix[seqIndex][qplace].letter != GAP_CHAR)) {
788 participatingSequences[numParticipating] = seqIndex;
789 numParticipating++;
790 }
791 }
792 newSequenceSet = TRUE;
793 if (numParticipating == oldNumParticipating) {
794 for(p = 0; p < numParticipating; p++)
795 if (oldParticipatingSequences[p] != participatingSequences[p]) break;
796 if (p == numParticipating) newSequenceSet = FALSE;
797 }
798 if (newSequenceSet) {
799 Sigma = 0;
800 intervalSigma = 0;
801 for (seqIndex = 0; seqIndex <= numseq; seqIndex++) {
802 if (!posSearch->posUseSequences[seqIndex]) continue;
803 posSearch->posRowSigma[seqIndex] = 0.0;
804 posSearch->posA[seqIndex] = 0.0;
805 }
806 for (i = posSearch->posExtents[qplace].leftExtent;
807 i <= posSearch->posExtents[qplace].rightExtent; i++) {
808 posLocalVariety = 0;
809 for(c = 0; c < alphabetSize; c++) posLocalC[c] = 0;
810 for(seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
811 thisSeq = participatingSequences[seqIndex];
812 /* used to check for GAP here */
813 if (0 == posLocalC[posSearch->posDescMatrix[thisSeq][i].letter])
814 /* letter (not a gap) not seen before in this query pos. */
815 posLocalVariety++;
816 posLocalC[posSearch->posDescMatrix[thisSeq][i].letter]++;
817 }
818 intervalSigma += posLocalVariety;
819 if (posLocalVariety > 1) {
820 Sigma += posLocalVariety;
821 for(seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
822 thisSeq = participatingSequences[seqIndex];
823 /* used to check for gap here */
824 posSearch->posRowSigma[thisSeq] += ( 1.0 / (Nlm_FloatHi) posLocalC[posSearch->posDescMatrix[thisSeq][i].letter]);
825 }
826 }
827 }
828 }
829 if (Sigma > 0) {
830 for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
831 thisSeq = participatingSequences[seqIndex];
832 posSearch->posA[thisSeq] = posSearch->posRowSigma[thisSeq]/Sigma;
833 }
834 } else {
835 for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
836 thisSeq = participatingSequences[seqIndex];
837 posSearch->posA[thisSeq] = ((Nlm_FloatHi) 1 / (Nlm_FloatHi) numParticipating);
838 }
839 }
840 posSearch->posSigma[qplace] = intervalSigma;
841 for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
842 thisSeq = participatingSequences[seqIndex];
843 posSearch->posMatchWeights[qplace][posSearch->posDescMatrix[thisSeq][qplace].letter] += posSearch->posA[thisSeq];
844 }
845 }
846 }
847 MemFree(participatingSequences);
848 MemFree(oldParticipatingSequences);
849 MemFree(posLocalC);
850 }
851
852 /*---------------------------------------------------------------------------*/
853 /*check that weights add to 1 in each column */
854 /*---------------------------------------------------------------------------*/
CddposCheckWeights(posSearchItems * posSearch,compactSearchItems * compactSearch)855 void LIBCALL CddposCheckWeights(posSearchItems *posSearch, compactSearchItems * compactSearch)
856 {
857 Uint1Ptr q; /*pointer to query*/
858 Int4 length, alphabetSize; /*length of query and number of characters in alphabet*/
859 Int4 a, c; /*loop indices*/
860 Nlm_FloatHi runningSum; /*partial total for a column*/
861
862 length = compactSearch->qlength;
863 alphabetSize = compactSearch->alphabetSize;
864
865 q = compactSearch->query;
866 for(c = 0; c < length; c++) {
867 if ((posSearch->posCount[c] > 1) && (q[c] != Xchar)) {
868 runningSum = 0;
869 for(a = 0; a < alphabetSize; a++)
870 runningSum += posSearch->posMatchWeights[c][a];
871 if((runningSum < 0.99) || (runningSum > 1.01))
872 ErrPostEx(SEV_ERROR, 0, 0, "\nERROR IN WEIGHTS, column %d, value %lf\n",c, runningSum);
873 }
874 }
875 }
876
877 /*---------------------------------------------------------------------------*/
878 /*---------------------------------------------------------------------------*/
CddallocatePosFreqs(Int4 length,Int4 alphabetSize)879 static Nlm_FloatHi ** LIBCALL CddallocatePosFreqs(Int4 length, Int4 alphabetSize)
880 {
881 Int4 c, i; /*loop indices*/
882 Nlm_FloatHi ** returnArray;
883
884 returnArray = (Nlm_FloatHi **) MemNew((length + 1) * sizeof(Nlm_FloatHi *));
885 if (NULL == returnArray)
886 exit(EXIT_FAILURE);
887 for(i = 0; i <= length; i++) {
888 returnArray[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
889 if (NULL == returnArray[i])
890 exit(EXIT_FAILURE);
891 for(c = 0; c < alphabetSize; c++)
892 returnArray[i][c] = 0.0;
893 }
894 return(returnArray);
895 }
896
897
898 /*---------------------------------------------------------------------------*/
899 /*---------------------------------------------------------------------------*/
countsFunction(Nlm_FloatHi Sigma,Int4 intervalLength)900 static Nlm_FloatHi LIBCALL countsFunction(Nlm_FloatHi Sigma, Int4 intervalLength)
901 {
902 return(Sigma / intervalLength - 1);
903 }
904
905
906 /*---------------------------------------------------------------------------*/
907 /*---------------------------------------------------------------------------*/
CddposComputePseudoFreqs(posSearchItems * posSearch,compactSearchItems * compactSearch,Boolean Cpos)908 Nlm_FloatHi ** LIBCALL CddposComputePseudoFreqs(posSearchItems *posSearch, compactSearchItems * compactSearch, Boolean Cpos)
909 {
910 Uint1Ptr q; /*pointer to the query*/
911 Int4 length; /*length of the query*/
912 Int4 c; /*loop index*/
913 Int4 a, aSub, alphabetSize; /*loop indices and size of alphabet*/
914 Nlm_FloatHi lambda; /*Karlin-Altschul parameter*/
915 Nlm_FloatHi Sigma; /*number of characters in an interval*/
916 Int4 intervalLength; /*length of a block*/
917 Nlm_FloatHi pseudo, numerator, denominator, qOverPEstimate; /*intermediate terms*/
918 Nlm_FloatHi infoSum; /*sum used for information content*/
919 Nlm_FloatHi **posFreqs; /*store frequencies*/
920
921 q = compactSearch->query;
922 length = compactSearch->qlength;
923
924 alphabetSize = compactSearch->alphabetSize;
925 lambda = compactSearch->lambda_ideal;
926 posFreqs = CddallocatePosFreqs(length, alphabetSize);
927
928 for(c = 0; c < length; c++) {
929 if ((posSearch->posCount[c] > 1) && (Xchar != q[c])) {
930 infoSum = 0;
931 for(a = 0; a < alphabetSize; a++) {
932 if (compactSearch->standardProb[a] > posEpsilon) {
933 pseudo = 0;
934 for (aSub = 0; aSub < alphabetSize; aSub++)
935 if(compactSearch->matrix[a][aSub] != BLAST_SCORE_MIN)
936 pseudo += (posSearch->posMatchWeights[c][aSub] *
937 exp(lambda * compactSearch->matrix[a][aSub]));
938 pseudo *= (compactSearch->pseudoCountConst);
939 Sigma = posSearch->posSigma[c];
940 intervalLength = posSearch->posIntervalSizes[c];
941 numerator = pseudo +
942 (countsFunction(Sigma, intervalLength) * posSearch->posMatchWeights[c][a]/
943 compactSearch->standardProb[a]);
944 denominator = countsFunction(Sigma, intervalLength) + (compactSearch->pseudoCountConst);
945 qOverPEstimate = numerator / denominator;
946 /*Note artificial multiplication by standard probability to
947 normalize*/
948 posFreqs[c][a] = qOverPEstimate * compactSearch->standardProb[a];
949 if (0.0 != qOverPEstimate && (compactSearch->standardProb[a] > posEpsilon))
950 infoSum += qOverPEstimate * compactSearch->standardProb[a] * log(qOverPEstimate)/ NCBIMATH_LN2;
951 }
952 else
953 posFreqs[c][a] = 0.0;
954 }
955 if (Cpos)
956 posSearch->posInformation[c] = infoSum;
957 }
958 else
959 for(a = 0; a < alphabetSize; a++) {
960 posFreqs[c][a] = 0;
961 }
962 }
963 return(posFreqs);
964 }
965
966 /*---------------------------------------------------------------------------*/
967 /*---------------------------------------------------------------------------*/
968 /* from posit.c as of 12/29/2000 */
969 /*---------------------------------------------------------------------------*/
970 /*---------------------------------------------------------------------------*/
CddfreePosFreqs(Nlm_FloatHi ** posFreqs,Int4 length)971 static void CddfreePosFreqs(Nlm_FloatHi ** posFreqs, Int4 length)
972 {
973 Int4 i;
974
975 for (i = 0; i <= length; i++)
976 MemFree(posFreqs[i]);
977 MemFree(posFreqs);
978 }
979
980 /*---------------------------------------------------------------------------*/
981 /*---------------------------------------------------------------------------*/
982 /*---------------------------------------------------------------------------*/
983 /*---------------------------------------------------------------------------*/
posit_rounddown(Nlm_FloatHi value)984 static Nlm_FloatHi LIBCALL posit_rounddown(Nlm_FloatHi value)
985 {
986 return (Nlm_FloatHi) Nlm_Nint(value);
987 }
988
989
990
991 /*---------------------------------------------------------------------------*/
992 /*Convert pseudo-count frequencies to a score matrix */
993 /*---------------------------------------------------------------------------*/
CddposFreqsToMatrix(posSearchItems * posSearch,compactSearchItems * compactSearch)994 void LIBCALL CddposFreqsToMatrix(posSearchItems *posSearch, compactSearchItems * compactSearch)
995 {
996 Uint1Ptr q; /*pointer to the query*/
997 Int4 length; /*length of the query*/
998 Int4 c; /*loop index*/
999 Int4 a, alphabetSize; /*loop index and size of alphabet*/
1000 Nlm_FloatHi lambda; /*Karlin-Altschul parameter*/
1001 Nlm_FloatHi qOverPEstimate, value; /*intermediate terms*/
1002 Boolean allZeros; /*are all frequencies in a column 0?*/
1003
1004 q = compactSearch->query;
1005 length = compactSearch->qlength;
1006
1007 alphabetSize = compactSearch->alphabetSize;
1008 lambda = compactSearch->lambda_ideal;
1009
1010 for(c = 0; c < length; c++) {
1011 allZeros = TRUE;
1012 for(a = 0; a < alphabetSize; a++) {
1013 /*Division compensates for multiplication in posComputePsedoFreqs*/
1014 if (compactSearch->standardProb[a] > posEpsilon)
1015 qOverPEstimate = posSearch->posFreqs[c][a]/compactSearch->standardProb[a];
1016 else
1017 qOverPEstimate = 0.0;
1018 if (qOverPEstimate != 0.0)
1019 allZeros = FALSE;
1020 if (0.0 == qOverPEstimate || (compactSearch->standardProb[a] < posEpsilon))
1021 posSearch->posPrivateMatrix[c][a] = BLAST_SCORE_MIN;
1022 else {
1023 value = log(qOverPEstimate)/lambda;
1024 posSearch->posPrivateMatrix[c][a] = (BLAST_Score) posit_rounddown(POSIT_SCALE_FACTOR2*value);
1025 }
1026 }
1027 if (allZeros) {
1028 for(a = 0; a < alphabetSize; a++) {
1029 posSearch->posMatrix[c][a] = compactSearch->matrix[q[c]][a];
1030 if (compactSearch->matrix[q[c]][a] == BLAST_SCORE_MIN)
1031 posSearch->posPrivateMatrix[c][a] = BLAST_SCORE_MIN;
1032 else
1033 posSearch->posPrivateMatrix[c][a] = POSIT_SCALE_FACTOR2*compactSearch->matrix[q[c]][a];
1034 }
1035 }
1036 }
1037 for(a = 0; a < alphabetSize; a++) {
1038 posSearch->posPrivateMatrix[length][a] = posSearch->posPrivateMatrix[length][a] = BLAST_SCORE_MIN;
1039 }
1040 }
1041
1042 /*---------------------------------------------------------------------------*/
1043 /*---------------------------------------------------------------------------*/
CddposScaling(posSearchItems * posSearch,compactSearchItems * compactSearch)1044 void LIBCALL CddposScaling(posSearchItems *posSearch, compactSearchItems * compactSearch)
1045 {
1046 BlastMatrixRescalePtr matrix_rescale;
1047
1048 matrix_rescale = BlastMatrixRescaleNew(compactSearch->alphabetSize,
1049 compactSearch->qlength,
1050 compactSearch->query,
1051 compactSearch->standardProb,
1052 posSearch->posMatrix,
1053 posSearch->posPrivateMatrix,
1054 compactSearch->kbp_std,
1055 compactSearch->kbp_psi,
1056 compactSearch->kbp_gap_std,
1057 compactSearch->kbp_gap_psi,
1058 compactSearch->lambda_ideal,
1059 compactSearch->K_ideal);
1060
1061 BlastScaleMatrix(matrix_rescale, TRUE);
1062
1063 matrix_rescale = BlastMatrixRescaleDestruct(matrix_rescale);
1064
1065 return;
1066 }
1067
1068
1069 /*---------------------------------------------------------------------------*/
1070 /*---------------------------------------------------------------------------*/
1071 /* Compute general information about the sequences that matched on the */
1072 /* i-th pass such as how many matched at each query position and what letter */
1073 /* matched */
1074 /*---------------------------------------------------------------------------*/
1075 /*---------------------------------------------------------------------------*/
CddposDenseDiagDemographics(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr listOfSeqAligns)1076 void LIBCALL CddposDenseDiagDemographics(posSearchItems *posSearch,
1077 compactSearchItems *compactSearch,
1078 SeqAlignPtr listOfSeqAligns)
1079 {
1080 Uint1Ptr q; /*pointers into query */
1081 Uint1Ptr s; /*pointer into a matching string */
1082 Int4 length, subjectLength; /*length of query and subject*/
1083 Int4 c; /*index into a string*/
1084 Int4 numseq, numSeqAligns; /*number of matching seq. and SeqAligns*/
1085 Int4 seqIndex; /*index for the array of matching sequences*/
1086 Int4 matchLength; /*length of a match*/
1087 Int4 queryOffset, subjectOffset, retrievalOffset; /*offsets needed to make a match align*/
1088 Int4 qplace, splace; /*index into query string and matching string*/
1089 SeqAlignPtr curSeqAlign; /*pointers into listOfSeqAligns*/
1090 DenseDiagPtr curSegs; /*used to extract alignm. from curSeqAlign*/
1091 SeqIdPtr curId; /*Used to compare seq. coming from different SeqAligns*/
1092 Int4 startQ, startS; /*Indices into array of starting positions*/
1093 Int4 segIndex; /*Index for which piece we are at*/
1094 Boolean is_new_id = FALSE;
1095
1096 q = compactSearch->query;
1097 length = compactSearch->qlength;
1098 for(c = 0; c < length; c++) {
1099 posSearch->posDescMatrix[0][c].letter = (Int1) q[c];
1100 posSearch->posDescMatrix[0][c].used = TRUE;
1101 posSearch->posDescMatrix[0][c].leftExtent = 0;
1102 posSearch->posDescMatrix[0][c].rightExtent = length;
1103 posSearch->posDescMatrix[0][c].e_value = compactSearch->ethresh/2;
1104 posSearch->posC[c][q[c]]++;
1105 posSearch->posCount[c]++;
1106 }
1107
1108 numSeqAligns = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
1109
1110 posSearch->posNumSequences = numSeqAligns;
1111 seqIndex = 0;
1112 curSeqAlign = listOfSeqAligns;
1113 for(curSeqAlign = listOfSeqAligns; curSeqAlign != NULL; curSeqAlign = curSeqAlign->next) {
1114 curSegs = (DenseDiagPtr) curSeqAlign->segs;
1115 s = CddGetSequenceWithDenseDiag(curSegs, FALSE, &retrievalOffset, &subjectLength);
1116 startQ = 0; startS = 1;
1117 while (curSegs) {
1118 queryOffset = curSegs->starts[0];
1119 subjectOffset = curSegs->starts[1] - retrievalOffset;
1120 matchLength = curSegs->len;
1121 for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
1122 if (!posSearch->posDescMatrix[seqIndex+1][qplace].used) {
1123 posSearch->posDescMatrix[seqIndex+1][qplace].letter = (Int1) s[splace];
1124 posSearch->posDescMatrix[seqIndex+1][qplace].used = TRUE;
1125 posSearch->posDescMatrix[seqIndex+1][qplace].e_value = 0;
1126 }
1127 }
1128 curSegs = curSegs->next;
1129 }
1130 s = MemFree(s);
1131 seqIndex++;
1132 } /*closes the while loop over seqAligns*/
1133 }
1134
1135
1136
1137
1138 /*---------------------------------------------------------------------------*/
1139 /* Compute general information about the sequences that matched on the */
1140 /* i-th pass such as how many matched at each query position and what letter */
1141 /* matched */
1142 /*---------------------------------------------------------------------------*/
CddposDemographics(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr listOfSeqAligns)1143 void LIBCALL CddposDemographics(posSearchItems *posSearch,
1144 compactSearchItems *compactSearch,
1145 SeqAlignPtr listOfSeqAligns)
1146 {
1147 Uint1Ptr q; /*pointers into query */
1148 Uint1Ptr s; /*pointer into a matching string */
1149 Int4 length, subjectLength; /*length of query and subject*/
1150 Int4 c; /*index into a string*/
1151 Int4 numseq, numSeqAligns; /*number of matching seqs and SeqAligns*/
1152 Int4 seqIndex; /*index for the array of matching sequences*/
1153 Int4 matchLength; /*length of a match*/
1154 Int4 queryOffset, subjectOffset, retrievalOffset; /*offsets needed to make a match align*/
1155 Int4 qplace, splace; /*index into query string and matching string*/
1156 SeqAlignPtr curSeqAlign, prevSeqAlign; /*pointers into listOfSeqAligns*/
1157 DenseSegPtr curSegs, prevSegs; /*used to extract alignmts from curSeqAlign*/
1158 SeqIdPtr curId, prevId;/*Use to comp. seqs coming from difft. SeqAligns*/
1159 Int4 startQ, startS; /*Indices into array of starting positions*/
1160 Int4 numsegs; /*Number of pieces in the gapped alignment*/
1161 Int4 segIndex; /*Index for which piece we are at*/
1162 Boolean is_new_id = FALSE;
1163
1164 q = compactSearch->query;
1165 length = compactSearch->qlength;
1166 for(c = 0; c < length; c++) {
1167 posSearch->posDescMatrix[0][c].letter = (Int1) q[c];
1168 posSearch->posDescMatrix[0][c].used = TRUE;
1169 posSearch->posDescMatrix[0][c].leftExtent = 0;
1170 posSearch->posDescMatrix[0][c].rightExtent = length;
1171 posSearch->posDescMatrix[0][c].e_value = compactSearch->ethresh/2;
1172 posSearch->posC[c][q[c]]++;
1173 posSearch->posCount[c]++;
1174 }
1175
1176 numSeqAligns = CddCountSeqAligns(listOfSeqAligns, &numseq);
1177
1178 posSearch->posNumSequences = numSeqAligns;
1179 seqIndex = 0;
1180 curSeqAlign = listOfSeqAligns;
1181 prevSeqAlign = NULL;
1182 for(curSeqAlign = listOfSeqAligns; curSeqAlign != NULL; curSeqAlign = curSeqAlign->next) {
1183 is_new_id = FALSE;
1184 curSegs = (DenseSegPtr) curSeqAlign->segs;
1185 if (NULL != prevSeqAlign) {
1186 prevSegs = (DenseSegPtr) prevSeqAlign->segs;
1187 if(curSegs->ids == NULL) break;
1188 curId = curSegs->ids->next;
1189 prevId = prevSegs->ids->next;
1190 if (!(SeqIdMatch(curId, prevId))) is_new_id = TRUE;
1191 }
1192 if(is_new_id == TRUE) seqIndex++;
1193 s = CddGetSequenceWithDenseSeg(curSegs, FALSE, &retrievalOffset, &subjectLength);
1194 startQ = 0; startS = 1;
1195 numsegs = curSegs->numseg;
1196 for(segIndex = 0; segIndex < numsegs; segIndex++) {
1197 queryOffset = curSegs->starts[startQ];
1198 if (curSegs->starts[startS] != GAP_HERE)
1199 subjectOffset = curSegs->starts[startS] - retrievalOffset;
1200 else
1201 subjectOffset = GAP_HERE;
1202 matchLength = curSegs->lens[segIndex];
1203 if ((GAP_HERE ) == queryOffset) {
1204 ; /*do nothing, gap in query*/
1205 } else if ((GAP_HERE) == subjectOffset) {
1206 for(c = 0, qplace = queryOffset; c < matchLength; c++, qplace++) {
1207 posSearch->posDescMatrix[seqIndex + 1][qplace].used = TRUE;
1208 posSearch->posDescMatrix[seqIndex + 1][qplace].letter = GAP_CHAR;
1209 posSearch->posDescMatrix[seqIndex + 1][qplace].e_value = 1.0;
1210 }
1211 } else { /*no gap*/
1212 for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
1213 if ((!posSearch->posDescMatrix[seqIndex+1][qplace].used))
1214 if (!posSearch->posDescMatrix[seqIndex+1][qplace].used) {
1215 posSearch->posDescMatrix[seqIndex+1][qplace].letter = (Int1) s[splace];
1216 posSearch->posDescMatrix[seqIndex+1][qplace].used = TRUE;
1217 posSearch->posDescMatrix[seqIndex+1][qplace].e_value = 0;
1218 }
1219 }
1220 }
1221 startQ += 2;
1222 startS += 2;
1223 }
1224 prevSeqAlign = curSeqAlign;
1225 s = MemFree(s);
1226 } /*closes the while loop over seqAligns*/
1227 }
1228
1229
1230 /*---------------------------------------------------------------------------*/
1231 /* copied from BLASTSetUpSearchInternalByLoc */
1232 /*---------------------------------------------------------------------------*/
1233 #define DROPOFF_NUMBER_OF_BITS 10.0
1234 #define INDEX_THR_MIN_SIZE 20000
CddSetUpSearchInternalByLoc(BlastSearchBlkPtr search,SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))1235 static Int2 LIBCALL CddSetUpSearchInternalByLoc
1236 (BlastSearchBlkPtr search, SeqLocPtr query_slp, BioseqPtr query_bsp,
1237 CharPtr prog_name, Int4 qlen, BLAST_OptionsBlkPtr options,
1238 int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
1239 {
1240 BioseqPtr bsp_temp, bsp;
1241 Boolean mask_at_hash=FALSE, private_slp_delete;
1242 Boolean query_is_na, db_is_na;
1243 Char buffer[128];
1244 Int2 retval, status;
1245 Int4 effective_query_length, query_length, full_query_length,
1246 index, length, length_adjustment=0, last_length_adjustment, min_query_length;
1247 Int4 array_size, max_length, block_width;
1248 Int4Ptr open, extend;
1249 Nlm_FloatHiPtr lambda, K, H;
1250 Nlm_FloatHi avglen;
1251 ReadDBFILEPtr rdfp;
1252 SeqIdPtr query_id;
1253 ObjectIdPtr oip;
1254 SeqPortPtr spp=NULL, spp_reverse=NULL;
1255 SeqLocPtr filter_slp=NULL, private_slp=NULL, private_slp_rev=NULL;
1256 GeneticCodePtr gcp;
1257 Uint1 residue, strand;
1258 Uint1Ptr sequence;
1259 Uint1Ptr query_seq, query_seq_start, query_seq_rev, query_seq_start_rev;
1260 ValNodePtr vnp;
1261
1262 if (options == NULL)
1263 {
1264 ErrPostEx(SEV_FATAL, 0, 0, "BLAST_OptionsBlkPtr is NULL\n");
1265 return 1;
1266 }
1267
1268 if (query_slp == NULL && query_bsp == NULL)
1269 {
1270 ErrPostEx(SEV_FATAL, 0, 0, "Query is NULL\n");
1271 return 1;
1272 }
1273
1274 query_seq = NULL; /* Gets rid of warning. */
1275 query_seq_rev = NULL; /* Gets rid of warning. */
1276 query_seq_start = NULL; /* Gets rid of warning. */
1277 query_seq_start_rev = NULL; /* Gets rid of warning. */
1278
1279 if (query_slp)
1280 {
1281 strand = SeqLocStrand(query_slp);
1282 if (strand == Seq_strand_unknown || strand == Seq_strand_plus || strand == Seq_strand_both)
1283 {
1284 private_slp = SeqLocIntNew(SeqLocStart(query_slp), SeqLocStop(query_slp), Seq_strand_plus, SeqLocId(query_slp));
1285 }
1286 if (strand == Seq_strand_minus || strand == Seq_strand_both)
1287 {
1288 private_slp_rev = SeqLocIntNew(SeqLocStart(query_slp), SeqLocStop(query_slp), Seq_strand_minus, SeqLocId(query_slp));
1289 }
1290 private_slp_delete = TRUE;
1291 }
1292 else
1293 {
1294 private_slp = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_plus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1295 private_slp_rev = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_minus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1296 private_slp_delete = FALSE;
1297 }
1298
1299 query_length = 0;
1300 if (private_slp)
1301 query_length = SeqLocLen(private_slp);
1302 else if (private_slp_rev)
1303 query_length = SeqLocLen(private_slp_rev);
1304 if (query_length == 0)
1305 {
1306 sprintf(buffer, "No valid query sequence");
1307 BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1308 return 1;
1309 }
1310
1311 bsp = NULL;
1312 if (private_slp)
1313 bsp = BioseqLockById(SeqLocId(private_slp));
1314 else if (private_slp_rev)
1315 bsp = BioseqLockById(SeqLocId(private_slp_rev));
1316
1317 if (bsp == NULL)
1318 {
1319 ErrPostEx(SEV_WARNING, 0, 0, "No valid query sequence, BioseqLockById returned NULL\n");
1320 return 1;
1321 }
1322
1323 full_query_length = bsp->length;
1324
1325 BlastGetTypes(prog_name, &query_is_na, &db_is_na);
1326 if (query_is_na != ISA_na(bsp->mol))
1327 {
1328 ErrPostEx(SEV_WARNING, 0, 0, "Query molecule is incompatible with %s program", prog_name);
1329 BioseqUnlock(bsp);
1330 return 1;
1331 }
1332
1333 if (bsp->repr == Seq_repr_virtual)
1334 {
1335 BioseqUnlock(bsp);
1336 ErrPostEx(SEV_WARNING, 0, 0, "Virtual sequence detected\n");
1337 return 1;
1338 }
1339 BioseqUnlock(bsp);
1340
1341 if (query_slp)
1342 {
1343 search->query_slp = query_slp;
1344 }
1345 else
1346 {
1347 search->query_slp = private_slp;
1348 search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
1349 }
1350
1351
1352 search->translation_buffer = NULL;
1353 search->translation_buffer_size = 0;
1354
1355 /*
1356 Get genetic codes (should be determined from BLAST_OptionsBlkPtr.
1357 Only needed for blastx, tblast[nx]
1358 */
1359 if (StringCmp(prog_name, "blastp") != 0 && StringCmp(prog_name, "blastn") != 0)
1360 {
1361 if (StringCmp(prog_name, "tblastx") == 0 || StringCmp(prog_name, "tblastn") == 0)
1362 {
1363 gcp = GeneticCodeFind(options->db_genetic_code, NULL);
1364 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
1365 {
1366 if (vnp->choice == 3) /* ncbieaa */
1367 {
1368 search->db_genetic_code = (CharPtr)vnp->data.ptrvalue;
1369 break;
1370 }
1371 }
1372 search->translation_table = (Uint1Ptr) GetPrivatTranslationTable(search->db_genetic_code, FALSE);
1373 search->translation_table_rc = (Uint1Ptr) GetPrivatTranslationTable(search->db_genetic_code, TRUE);
1374 max_length = 0;
1375 rdfp = search->rdfp;
1376 while (rdfp)
1377 {
1378 max_length = MAX(max_length, readdb_get_maxlen(rdfp));
1379 rdfp = rdfp->next;
1380 }
1381 search->translation_buffer = MemNew((3+(max_length/3))*sizeof(Uint1));
1382 search->translation_buffer_size = 1+(max_length/3);
1383 search->allocated += BLAST_SEARCH_ALLOC_TRANS_INFO;
1384 }
1385
1386 if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
1387 {
1388 gcp = GeneticCodeFind(options->genetic_code, NULL);
1389 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
1390 {
1391 if (vnp->choice == 3) /* ncbieaa */
1392 {
1393 search->genetic_code = (CharPtr)vnp->data.ptrvalue;
1394 break;
1395 }
1396 }
1397 }
1398 }
1399
1400 if (options->filter && !options->filter_string)
1401 options->filter_string = (CharPtr) BlastConstructFilterString(options->filter);
1402
1403 /* If the query is translated do this below. */
1404 if (StringCmp(prog_name, "blastx") && StringCmp(prog_name, "tblastx")) {
1405 if (private_slp)
1406 filter_slp = BlastSeqLocFilterEx(private_slp, options->filter_string, &mask_at_hash);
1407 else if (private_slp_rev)
1408 filter_slp = BlastSeqLocFilterEx(private_slp_rev, options->filter_string, &mask_at_hash);
1409 }
1410
1411
1412 /*
1413 Dusting of query sequence. Only needed for blastn, optional
1414 */
1415
1416 if(StringCmp(prog_name, "blastn") == 0) {
1417 if (filter_slp && !mask_at_hash)
1418 ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1419 }
1420
1421 if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "tblastn") == 0)
1422 {
1423 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
1424 SeqPortSet_do_virtual(spp, TRUE);
1425 if (filter_slp && !mask_at_hash)
1426 ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1427 }
1428 else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0 || StringCmp(prog_name, "blastn") == 0)
1429 {
1430 if (private_slp)
1431 {
1432 spp = SeqPortNewByLoc(private_slp, Seq_code_ncbi4na);
1433 SeqPortSet_do_virtual(spp, TRUE);
1434 }
1435 if (private_slp_rev)
1436 {
1437 spp_reverse = SeqPortNewByLoc(private_slp_rev, Seq_code_ncbi4na);
1438 SeqPortSet_do_virtual(spp_reverse, TRUE);
1439 }
1440 }
1441 else
1442 {
1443 ErrPostEx(SEV_FATAL, 0, 0, "Only blastn, blastp, blastx, tblastn tblastx is allowed\n");
1444 return 1;
1445 }
1446
1447 if (spp)
1448 {
1449 query_seq_start = (Uint1Ptr) MemNew(((query_length)+2)*sizeof(Char));
1450 query_seq_start[0] = NULLB;
1451 query_seq = query_seq_start+1;
1452 index=0;
1453 while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
1454 {
1455
1456 if (IS_residue(residue))
1457 {
1458 query_seq[index] = residue;
1459 index++;
1460 }
1461 }
1462 query_seq[index] = NULLB;
1463 spp = SeqPortFree(spp);
1464 if (StringCmp(prog_name, "blastn") == 0)
1465 {
1466 if (filter_slp)
1467 {
1468 if (mask_at_hash)
1469 search->context[0].location =
1470 BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1471 else
1472 BlastMaskTheResidues(query_seq, full_query_length, 15, filter_slp, FALSE, SeqLocStart(private_slp));
1473 }
1474 for (index=0; index<query_length; index++)
1475 query_seq[index] = ncbi4na_to_blastna[query_seq[index]];
1476 }
1477 }
1478
1479 if (spp_reverse)
1480 {
1481 query_seq_start_rev = (Uint1Ptr) MemNew(((query_length)+2)*sizeof(Char));
1482 query_seq_start_rev[0] = NULLB;
1483 query_seq_rev = query_seq_start_rev+1;
1484 index=0;
1485 while ((residue=SeqPortGetResidue(spp_reverse)) != SEQPORT_EOF)
1486 {
1487 if (IS_residue(residue))
1488 {
1489 query_seq_rev[index] = residue;
1490 index++;
1491 }
1492 }
1493 query_seq_rev[index] = NULLB;
1494 spp_reverse = SeqPortFree(spp_reverse);
1495 if (StringCmp(prog_name, "blastn") == 0)
1496 {
1497 if (filter_slp)
1498 {
1499 if (mask_at_hash)
1500 search->context[1].location =
1501 BlastSeqLocFillDoubleInt(filter_slp, query_length, TRUE);
1502 else
1503 BlastMaskTheResidues(query_seq_rev, full_query_length, 15, filter_slp, TRUE, full_query_length - SeqLocStop(private_slp_rev) - 1);
1504 }
1505 for (index=0; index<query_length; index++)
1506 query_seq_rev[index] = ncbi4na_to_blastna[query_seq_rev[index]];
1507 }
1508 }
1509
1510 /*
1511 Set the context_factor, which specifies how many different
1512 ways the query or db is examined (e.g., blastn looks at both
1513 stands of query, context_factor is 2).
1514 */
1515 if (StringCmp(prog_name, "blastp") == 0)
1516 {
1517 search->context_factor = 1;
1518 length = query_length;
1519 }
1520 else if (StringCmp(prog_name, "blastn") == 0)
1521 { /* two strands */
1522 search->context_factor = (search->last_context-search->first_context+1);
1523 length = query_length;
1524 }
1525 else if (StringCmp(prog_name, "blastx") == 0)
1526 { /* query translated in six frames. */
1527 search->context_factor = search->last_context-search->first_context+1;
1528 length = query_length/3;
1529 }
1530 else if (StringCmp(prog_name, "tblastn") == 0)
1531 { /* db translated in six frames. */
1532 search->context_factor = 6;
1533 length = query_length;
1534 }
1535 else if (StringCmp(prog_name, "tblastx") == 0)
1536 { /* db and query each translated in six frames. */
1537 search->context_factor = 6*CODON_LENGTH*(search->last_context-search->first_context+1);
1538 length = query_length/3;
1539 }
1540
1541 if (private_slp)
1542 query_id = SeqIdFindBest(SeqLocId(private_slp), SEQID_GI);
1543 else
1544 query_id = SeqIdFindBest(SeqLocId(private_slp_rev), SEQID_GI);
1545
1546 search->query_id = SeqIdDup(query_id);
1547
1548 /* Store the query sequence, or the translation thereof. */
1549 if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "tblastn") == 0)
1550 { /* One blastp context for now. */
1551 if (filter_slp)
1552 {
1553 if (mask_at_hash)
1554 search->context[0].location =
1555 BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1556 else
1557 BlastMaskTheResidues(query_seq, full_query_length, 21, filter_slp, FALSE, SeqLocStart(private_slp));
1558 }
1559 BlastSequenceAddSequence(search->context[0].query, NULL, query_seq_start, query_length, query_length, 0);
1560 }
1561 else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
1562 {
1563
1564 for (index=search->first_context; index<=search->last_context; index++)
1565 {
1566 if (search->context[index].query->frame > 0)
1567 {
1568 sequence = GetTranslation(query_seq, query_length, search->context[index].query->frame, &length, search->genetic_code);
1569 }
1570 else
1571 {
1572 sequence = GetTranslation(query_seq_rev, query_length, search->context[index].query->frame, &length, search->genetic_code);
1573 }
1574 if (options->filter_string && length > 0)
1575 {
1576 bsp_temp = (BioseqPtr) BlastMakeTempProteinBioseq(sequence+1, length, Seq_code_ncbistdaa);
1577 bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1578 /*
1579 bsp_temp->id = search->query_id;
1580 */
1581 oip = (ObjectIdPtr) UniqueLocalId();
1582 ValNodeAddPointer(&(bsp_temp->id), SEQID_LOCAL, oip);
1583 SeqMgrAddToBioseqIndex(bsp_temp);
1584
1585 filter_slp = BlastBioseqFilterEx(bsp_temp, options->filter_string, &mask_at_hash);
1586 HackSeqLocId(filter_slp, search->query_id);
1587
1588 SeqMgrDeleteFromBioseqIndex(bsp_temp);
1589
1590 bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1591 bsp_temp = BioseqFree(bsp_temp);
1592 if (mask_at_hash)
1593 {
1594 search->context[index].location =
1595 BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1596 }
1597 else
1598 {
1599 BlastMaskTheResidues(sequence+1, length, 21, filter_slp, FALSE, SeqLocStart(private_slp));
1600 BlastConvertProteinSeqLoc(filter_slp, search->context[index].query->frame, query_length);
1601 }
1602 if (filter_slp && !mask_at_hash)
1603 ValNodeAddPointer(&(search->mask), FrameToDefine(search->context[index].query->frame), filter_slp);
1604 }
1605 BlastSequenceAddSequence(search->context[index].query, NULL, sequence, length, query_length, 0);
1606 }
1607 query_seq_start = MemFree(query_seq_start);
1608 query_seq_start_rev = MemFree(query_seq_start_rev);
1609 }
1610 else if (StringCmp(prog_name, "blastn") == 0)
1611 {
1612 if (search->first_context == 0)
1613 BlastSequenceAddSequence(search->context[0].query, NULL, query_seq_start, query_length, query_length, 0);
1614 if (search->last_context == 1)
1615 BlastSequenceAddSequence(search->context[1].query, NULL, query_seq_start_rev, query_length, query_length, 0);
1616 }
1617
1618 if (mask_at_hash)
1619 { /* No longer needed. */
1620 filter_slp = SeqLocSetFree(filter_slp);
1621 }
1622
1623 /* Set the ambiguous residue before the ScoreBlk is filled. */
1624 if (StringCmp(prog_name, "blastn") != 0)
1625 {
1626 search->sbp->read_in_matrix = TRUE;
1627 BlastScoreSetAmbigRes(search->sbp, 'X');
1628 }
1629 else
1630 {
1631 if(options->matrix!=NULL) {
1632 search->sbp->read_in_matrix = TRUE;
1633 } else {
1634 search->sbp->read_in_matrix = FALSE;
1635 }
1636 BlastScoreSetAmbigRes(search->sbp, 'N');
1637 }
1638
1639
1640 search->sbp->penalty = options->penalty;
1641 search->sbp->reward = options->reward;
1642
1643 /* option is to use alignments choosen by user in PSM computation API (used in WWW PSI-Blast); */
1644 search->pbp->use_best_align = options->use_best_align;
1645
1646 /* Should culling be used at all? */
1647 search->pbp->perform_culling = options->perform_culling;
1648 search->pbp->hsp_range_max = options->hsp_range_max;
1649 /* This assures that search->pbp->max_pieces is at least one wide. */
1650 block_width = MIN(query_length, options->block_width);
1651 if (block_width > 0)
1652 search->pbp->max_pieces = query_length/block_width;
1653
1654 search->sbp->query_length = query_length;
1655
1656 search->result_struct = BLASTResultsStructNew(search->result_size, search->pbp->max_pieces, search->pbp->hsp_range_max);
1657 if (options->matrix != NULL)
1658 status = BlastScoreBlkMatFill(search->sbp, options->matrix);
1659 else
1660 status = BlastScoreBlkMatFill(search->sbp, "BLOSUM62");
1661 if (status != 0)
1662 {
1663 ErrPostEx(SEV_WARNING, 0, 0, "BlastScoreBlkMatFill returned non-zero status");
1664 return 1;
1665 }
1666
1667 /* This is used right below. */
1668 search->pbp->gapped_calculation = options->gapped_calculation;
1669 search->pbp->do_not_reevaluate = options->do_not_reevaluate;
1670 search->pbp->do_sum_stats = options->do_sum_stats;
1671 search->pbp->first_db_seq = options->first_db_seq;
1672 search->pbp->final_db_seq = options->final_db_seq;
1673
1674 retval = 0;
1675 for (index=search->first_context; index<=search->last_context; index++)
1676 {
1677 status = BlastScoreBlkFill(search->sbp, (CharPtr) search->context[index].query->sequence,search->context[index].query->length, index);
1678 if (status != 0)
1679 {
1680 sprintf(buffer, "Unable to calculate Karlin-Altschul params, check query sequence");
1681 BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
1682 retval = 1;
1683 }
1684 if (search->pbp->gapped_calculation &&
1685 StringCmp(search->prog_name, "blastn") != 0)
1686 {
1687 search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
1688 status = BlastKarlinBlkGappedCalc(search->sbp->kbp_gap_std[index], options->gap_open, options->gap_extend, search->sbp->name, &(search->error_return));
1689 if (status != 0)
1690 {
1691 retval = 1;
1692 }
1693 search->sbp->kbp_gap_psi[index] = BlastKarlinBlkCreate();
1694 status = BlastKarlinBlkGappedCalc(search->sbp->kbp_gap_psi[index], options->gap_open, options->gap_extend, search->sbp->name, &(search->error_return));
1695 if (status != 0)
1696 {
1697 retval = 1;
1698 }
1699 }
1700 }
1701
1702 search->sbp->kbp_gap = search->sbp->kbp_gap_std;
1703 search->sbp->kbp = search->sbp->kbp_std;
1704 if (StringCmp(prog_name, "blastn") != 0)
1705 {
1706 array_size = BlastKarlinGetMatrixValues(search->sbp->name, &open, &extend, &lambda, &K, &H, NULL);
1707 if (array_size > 0)
1708 {
1709 for (index=0; index<array_size; index++)
1710 {
1711 if (open[index] == INT2_MAX && extend[index] == INT2_MAX)
1712 {
1713 search->sbp->kbp_ideal = BlastKarlinBlkCreate();
1714 search->sbp->kbp_ideal->Lambda = lambda[index];
1715 search->sbp->kbp_ideal->K = K[index];
1716 search->sbp->kbp_ideal->H = H[index];
1717 }
1718 }
1719 MemFree(open);
1720 MemFree(extend);
1721 MemFree(lambda);
1722 MemFree(K);
1723 MemFree(H);
1724 }
1725 if (search->sbp->kbp_ideal == NULL)
1726 search->sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(search->sbp);
1727 }
1728
1729 /* Adjust the Karlin parameters. */
1730 if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
1731 {
1732 BlastKarlinBlkStandardCalc(search->sbp, search->first_context, search->last_context);
1733 }
1734
1735 /* If retval was set non-zero above (by the routines calculating Karlin-Altschul params),
1736 return here before these values are used.
1737 */
1738 if (retval)
1739 return retval;
1740 if( 0 != BlastComputeLengthAdjustment(1/min_query_length,
1741 search->sbp->
1742 kbp[search->first_context]->logK,
1743 1/search->sbp->
1744 kbp[search->first_context]->H,
1745 0.0,
1746 length,
1747 search->dblen, search->dbseq_num,
1748 &length_adjustment) ) {
1749 ErrPostEx(SEV_WARNING, 0, 0,
1750 "BlastComputeLengthAdjustment failed!");
1751 }
1752 search->length_adjustment = length_adjustment;
1753 search->dblen_eff =
1754 search->dblen - search->dbseq_num*search->length_adjustment;
1755
1756 effective_query_length = length - length_adjustment;
1757 for (index=search->first_context; index<=search->last_context; index++)
1758 {
1759 search->context[index].query->effective_length =
1760 effective_query_length;
1761 }
1762
1763 if (search->searchsp_eff == 0)
1764 search->searchsp_eff =
1765 ((Nlm_FloatHi) search->dblen_eff)*
1766 ((Nlm_FloatHi) effective_query_length);
1767
1768 /* The default is that cutoff_s was not set and is zero. */
1769 if (options->cutoff_s == 0)
1770 {
1771 search->pbp->cutoff_e = options->expect_value;
1772 search->pbp->cutoff_e_set = TRUE;
1773 search->pbp->cutoff_s = options->cutoff_s;
1774 search->pbp->cutoff_s_set = FALSE;
1775 }
1776 else
1777 {
1778 search->pbp->cutoff_e = options->expect_value;
1779 search->pbp->cutoff_e_set = FALSE;
1780 search->pbp->cutoff_s = options->cutoff_s;
1781 search->pbp->cutoff_s_set = TRUE;
1782 }
1783 /* For now e2 is set to 0.5 and cutoff_e2_set is FALSE. This is then
1784 changed to the proper values in blast_set_parameters. In the final version
1785 of this program (where more blast programs and command-line options are
1786 available) this needs to be set higher up. */
1787 if (options->cutoff_s2 == 0)
1788 {
1789 search->pbp->cutoff_e2 = options->e2;
1790 search->pbp->cutoff_e2_set = FALSE;
1791 search->pbp->cutoff_s2 = options->cutoff_s2;
1792 search->pbp->cutoff_s2_set = FALSE;
1793 }
1794 else
1795 {
1796 search->pbp->cutoff_e2 = options->e2;
1797 search->pbp->cutoff_e2_set = FALSE;
1798 search->pbp->cutoff_s2 = options->cutoff_s2;
1799 search->pbp->cutoff_s2_set = TRUE;
1800 }
1801
1802 search->pbp->discontinuous = options->discontinuous;
1803
1804
1805 /* For postion based blast. */
1806 search->pbp->ethresh = options->ethresh;
1807 search->pbp->maxNumPasses = options->maxNumPasses;
1808 search->pbp->pseudoCountConst = options->pseudoCountConst;
1809
1810 search->pbp->process_num = options->number_of_cpus;
1811 search->pbp->cpu_limit = options->cpu_limit;
1812 search->pbp->gap_decay_rate = options->gap_decay_rate;
1813 search->pbp->gap_size = options->gap_size;
1814 search->pbp->gap_prob = options->gap_prob;
1815 search->pbp->old_stats = options->old_stats;
1816 search->pbp->use_large_gaps = options->use_large_gaps;
1817 search->pbp->number_of_bits = options->number_of_bits;
1818 search->pbp->two_pass_method = options->two_pass_method;
1819 search->pbp->multiple_hits_only = options->multiple_hits_only;
1820 search->pbp->gap_open = options->gap_open;
1821 search->pbp->gap_extend = options->gap_extend;
1822 search->pbp->decline_align = options->decline_align;
1823
1824 search->pbp->hsp_num_max = options->hsp_num_max;
1825 /* CHANGE HERE??? */
1826 if (search->pbp->gapped_calculation && StringCmp(search->prog_name, "blastn"))
1827 {
1828 search->pbp->cutoff_s2_set = TRUE;
1829 if (StringCmp(search->prog_name, "blastn") != 0)
1830 {
1831 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
1832 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
1833 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1834 }
1835 else
1836 {
1837 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1838 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1839 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1840 }
1841 /* The trigger value sets the s2 cutoff. */
1842 search->pbp->cutoff_s2 = search->pbp->gap_trigger;
1843 }
1844 else
1845 {
1846 search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1847 search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1848 search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1849 /* Set S and S2 equal if not sum stats. */
1850 if (search->pbp->do_sum_stats == FALSE)
1851 search->pbp->cutoff_s2 = search->pbp->cutoff_s;
1852 }
1853 /* Ensures that gap_x_dropoff_final is at least as large as gap_x_dropoff. */
1854 search->pbp->gap_x_dropoff_final = MAX(search->pbp->gap_x_dropoff_final, search->pbp->gap_x_dropoff);
1855
1856 /* "threshold" (first and second) must be set manually for two-pass right now.*/
1857 search->pbp->threshold_set = TRUE;
1858 search->pbp->threshold_first = options->threshold_first;
1859 search->pbp->threshold_second = options->threshold_second;
1860
1861 search->pbp->window_size = options->window_size;
1862 search->pbp->window_size_set = TRUE;
1863
1864 search->whole_query = TRUE;
1865 if (options->required_start != 0 || options->required_end != -1)
1866 {
1867 search->whole_query = FALSE;
1868 search->required_start = options->required_start;
1869 if (options->required_end != -1)
1870 search->required_end = options->required_end;
1871 else
1872 search->required_end = query_length;
1873 }
1874
1875 if (qlen <= 0)
1876 qlen = query_length;
1877
1878 /* Use DROPOFF_NUMBER_OF_BITS as the default if it's set to zero. */
1879 if (options->dropoff_1st_pass == 0)
1880 options->dropoff_1st_pass = DROPOFF_NUMBER_OF_BITS;
1881
1882 if (options->dropoff_2nd_pass == 0)
1883 options->dropoff_2nd_pass = DROPOFF_NUMBER_OF_BITS;
1884
1885 if (StringCmp(search->prog_name, "blastn") != 0)
1886 {
1887 avglen = BLAST_AA_AVGLEN;
1888 }
1889 else
1890 {
1891 avglen = BLAST_NT_AVGLEN;
1892 /* Use only one type of gap for blastn */
1893 search->pbp->ignore_small_gaps = TRUE;
1894 }
1895
1896 if (blast_set_parameters(search, options->dropoff_1st_pass, options->dropoff_2nd_pass, avglen, search->searchsp_eff, options->window_size) != 0)
1897 return 1;
1898
1899 search->thr_info->awake_index = FALSE;
1900
1901 /* Only do this if this is not a pattern search. */
1902 if (options->isPatternSearch == FALSE)
1903 {
1904 for (index=search->first_context; index<=search->last_context; index++)
1905 {
1906 if (options->threshold_first > 0)
1907 {
1908 search->wfp = search->wfp_first;
1909 if (search->whole_query == TRUE)
1910 if (!(search->positionBased)) /*AAS*/
1911 status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_first, (Uint1) index);
1912 else
1913 status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_first, (Uint1) index);
1914 else
1915 if (!(search->positionBased)) /*AAS*/
1916 status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_first, (Uint1) index);
1917 else
1918 status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_first, (Uint1) index);
1919 if (status != 0) {
1920 search->thr_info->awake_index = FALSE;
1921 ErrPostEx(SEV_WARNING, 0, 0, "BlastFindWords returned non-zero status");
1922 return 1;
1923 }
1924 }
1925 search->wfp = search->wfp_second;
1926 if (StringCmp(prog_name, "blastn") != 0)
1927 {
1928 if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
1929 {
1930 if (search->whole_query == TRUE)
1931 if (!(search->positionBased))
1932 status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
1933 else
1934 status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
1935 else
1936 if (!(search->positionBased))
1937 status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_second, (Uint1) index);
1938 else
1939 status = BlastNewFindWords(search, search->required_start, search->required_end, options->threshold_second, (Uint1) index);
1940 }
1941 }
1942 else
1943 {
1944 status = BlastNtFindWords(search, 0, search->context[index].query->length,
1945 (Uint1) index);
1946 }
1947
1948 if (status > 0)
1949 {
1950 search->thr_info->awake_index = FALSE;
1951 sprintf(buffer, "No valid letters to be indexed");
1952 BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1953 return 1;
1954 }
1955 else if (status < 0)
1956 {
1957 search->thr_info->awake_index = FALSE;
1958 sprintf(buffer, "Error finding words");
1959 BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1960 return 1;
1961 }
1962 }
1963 lookup_position_aux_destruct(search->wfp->lookup);
1964 }
1965 /*
1966 Turn off the index thread by setting this flag. Don't wait for a join, as the
1967 search will take much longer than the one second for this to die.
1968 */
1969 search->thr_info->awake_index = FALSE;
1970
1971
1972 if (private_slp && private_slp_delete)
1973 private_slp = SeqLocFree(private_slp);
1974 if (private_slp_rev)
1975 private_slp_rev = SeqLocFree(private_slp_rev);
1976
1977 return 0;
1978 }
1979
1980 /*---------------------------------------------------------------------------*/
1981 /*---------------------------------------------------------------------------*/
CddSetUpSearchWithReadDb(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)1982 BlastSearchBlkPtr LIBCALL CddSetUpSearchWithReadDb(SeqLocPtr query_slp,
1983 BioseqPtr query_bsp,
1984 CharPtr prog_name,
1985 Int4 qlen,
1986 CharPtr dbname,
1987 BLAST_OptionsBlkPtr options,
1988 int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)),
1989 SeqIdPtr seqid_list,
1990 BlastDoubleInt4Ptr gi_list,
1991 Int4 gi_list_total,
1992 ReadDBFILEPtr rdfp)
1993
1994 {
1995 BlastSearchBlkPtr search;
1996 Boolean multiple_hits, options_alloc=FALSE;
1997 Int2 status, first_context, last_context;
1998 Int8 dblen;
1999 Int4 query_length;
2000 ValNodePtr vnp;
2001 Int4 i;
2002 Nlm_FloatHi searchsp_eff=0;
2003 Boolean use_private_gilist = FALSE;
2004 OIDListPtr alias_oidlist;
2005 Int4 mask_index, virtual_mask_index;
2006 Uint4 oid_bit, virtual_oid_bit;
2007 ReadDBFILEPtr tmprdfp;
2008 Int4 hitlist_size;
2009
2010 if (options == NULL) {
2011 options = BLASTOptionNew(prog_name, FALSE); options_alloc = TRUE;
2012 }
2013
2014 if (options->window_size != 0) multiple_hits = TRUE;
2015 else multiple_hits = FALSE;
2016
2017 BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
2018
2019 if (query_slp) query_length = SeqLocLen(query_slp);
2020 else query_length = query_bsp->length;
2021
2022 hitlist_size = BlastSingleQueryResultSize(options);
2023 /* On the first call query length is used for the subject length. */
2024 search =
2025 BlastSearchBlkNewExtra(options->wordsize, query_length, dbname,
2026 multiple_hits, options->threshold_first,
2027 options->threshold_second, hitlist_size,
2028 prog_name, NULL, first_context,
2029 last_context, rdfp, options->window_size);
2030
2031 if (search) {
2032 readdb_get_totals(search->rdfp, &(dblen), &(search->dbseq_num));
2033 if (seqid_list) BlastAdjustDbNumbers(search->rdfp, &(dblen), &(search->dbseq_num), seqid_list, NULL, NULL, NULL, 0);
2034
2035 if (!gi_list) {
2036 /* Ok, we do not have a gi-list specified, but maybe
2037 we have an a mask database in the list of databases,
2038 we need to create one mask for all such databases */
2039 OIDListPtr virtual_oidlist = NULL;
2040 Int4 final_virtual_db_seq=0, final_db_seq=0;
2041 Int4 mask, oid, virtual_oid, maskindex, virtual_mask_index, total_virtual_mask, base;
2042 Uint4 virtual_oid_bit;
2043
2044 tmprdfp = search->rdfp;
2045 while (tmprdfp) {
2046 final_virtual_db_seq = tmprdfp->stop;
2047 if (!tmprdfp->oidlist) final_db_seq = tmprdfp->stop;
2048 tmprdfp = tmprdfp->next;
2049 }
2050 tmprdfp = search->rdfp;
2051 while (tmprdfp) {
2052 if (tmprdfp->oidlist) {
2053 if (!virtual_oidlist) {
2054 /* create new oidlist for virtual database */
2055 virtual_oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
2056 virtual_oidlist->total = final_virtual_db_seq + 1;
2057 total_virtual_mask = final_virtual_db_seq/MASK_WORD_SIZE + 2;
2058 virtual_oidlist->list = (Uint4Ptr) MemNew (total_virtual_mask*sizeof(Int4));
2059 }
2060 /* Now populate the virtual_oidlist */
2061 maskindex = 0;
2062 base = 0;
2063 while (maskindex < (tmprdfp->oidlist->total/MASK_WORD_SIZE +1)) {
2064 /* for each long-word mask */
2065 mask = Nlm_SwapUint4(tmprdfp->oidlist->list[maskindex]);
2066 i = 0;
2067 while (mask) {
2068 if (mask & (((Uint4)0x1)<<(MASK_WORD_SIZE-1))) {
2069 oid = base + i;
2070 virtual_oid = oid + tmprdfp->start;
2071 virtual_mask_index = virtual_oid/MASK_WORD_SIZE;
2072 virtual_oid_bit = 0x1 << (MASK_WORD_SIZE - 1 - virtual_oid % MASK_WORD_SIZE);
2073 virtual_oidlist->list[virtual_mask_index] |= virtual_oid_bit;
2074 }
2075 mask <<= 1;
2076 i++;
2077 }
2078 maskindex++;
2079 base += MASK_WORD_SIZE;
2080 }
2081 /* free old mask */
2082 tmprdfp->oidlist = OIDListFree(tmprdfp->oidlist);
2083 }
2084 tmprdfp = tmprdfp->next;
2085 }
2086 if (virtual_oidlist) {
2087 for (i=0; i<total_virtual_mask; i++) {
2088 virtual_oidlist->list[i] = Nlm_SwapUint4(virtual_oidlist->list[i]);
2089 }
2090 }
2091 search->rdfp->oidlist = virtual_oidlist;
2092 readdb_get_totals_ex(search->rdfp, &(dblen), &(search->dbseq_num), TRUE);
2093 }
2094 #if 0
2095 /* Intended for use when a list of gi's is sent in, but the real size is needed. */
2096 /* It's probably still necessary to call BlastAdjustDbNumbers, but it would be nice
2097 if this were not required. */
2098 if (options->use_real_db_size)
2099 readdb_get_totals(search->rdfp, &(dblen), &(search->dbseq_num));
2100 /* use length and num of seqs of the database from alias file */
2101 if (search->rdfp->aliaslen && !gi_list) dblen = search->rdfp->aliaslen;
2102 if (search->rdfp->aliasnseq && !gi_list) search->dbseq_num = search->rdfp->aliasnseq;
2103 /* command-line/options trump alias file. */
2104 if (options->db_length > 0) dblen = options->db_length;
2105 if (options->dbseq_num > 0) search->dbseq_num = options->dbseq_num;
2106 #endif
2107 if (options->searchsp_eff > 0) searchsp_eff = options->searchsp_eff;
2108 if (StringCmp(prog_name, "tblastn") == 0 || StringCmp(prog_name, "tblastx") == 0) {
2109 dblen /= 3;
2110 searchsp_eff /= 3.0;
2111 }
2112 search->dblen = dblen;
2113 search->searchsp_eff = searchsp_eff;
2114 status = CddSetUpSearchInternalByLoc (search, query_slp, query_bsp, prog_name, qlen, options, callback);
2115 if (status != 0) {
2116 ErrPostEx(SEV_WARNING, 0, 0, "SetUpBlastSearch failed.");
2117 search->query_invalid = TRUE;
2118 }
2119 }
2120 if (options_alloc) options = BLASTOptionDelete(options);
2121 return search;
2122 }
2123
2124 #define putCkptNlm_FloatHi(d, ckptFile) (putCkptNumber(&(d),sizeof(Nlm_FloatHi),ckptFile))
2125 #define putCkptInt4(i, ckptFile) (putCkptNumber(&(i),sizeof(Int4),ckptFile))
2126 #define putCkptChar(c, ckptFile) (putCkptNumber(&(c),sizeof(Char),ckptFile))
2127
2128 /*---------------------------------------------------------------------------*/
2129 /* General routine for putting the internal representation of a number. */
2130 /*---------------------------------------------------------------------------*/
putCkptNumber(void * numberPtr,Int4 numberSize,FILE * ckptFile)2131 static void LIBCALL putCkptNumber(void * numberPtr, Int4 numberSize, FILE * ckptFile)
2132 {
2133 FileWrite(numberPtr,numberSize,1,ckptFile) ;
2134 }
2135
2136 /*---------------------------------------------------------------------------*/
2137 /*Code to put a vector of frequencies; put only the interesting entries*/
2138 /*---------------------------------------------------------------------------*/
putFreqVector(Nlm_FloatHi * theVector,Int4 length,FILE * ckptFile)2139 static void LIBCALL putFreqVector(Nlm_FloatHi * theVector, Int4 length, FILE * ckptFile)
2140 {
2141 int vectorRef;
2142 Int4 charOrder[EFFECTIVE_ALPHABET]; /*standard order of letters according to S. Altschul*/
2143
2144
2145 charOrder[0] = 1; /*A*/
2146 charOrder[1] = 16; /*R*/
2147 charOrder[2] = 13; /*N*/
2148 charOrder[3] = 4; /*D*/
2149 charOrder[4] = 3; /*C*/
2150 charOrder[5] = 15; /*Q*/
2151 charOrder[6] = 5; /*E*/
2152 charOrder[7] = 7; /*G*/
2153 charOrder[8] = 8; /*H*/
2154 charOrder[9] = 9; /*I*/
2155 charOrder[10] = 11; /*L*/
2156 charOrder[11] = 10; /*K*/
2157 charOrder[12] = 12; /*M*/
2158 charOrder[13] = 6; /*F*/
2159 charOrder[14] = 14; /*P*/
2160 charOrder[15] = 17; /*S*/
2161 charOrder[16] = 18; /*T*/
2162 charOrder[17] = 20; /*W*/
2163 charOrder[18] = 22; /*Y*/
2164 charOrder[19] = 19; /*V*/
2165
2166 for(vectorRef = 0; vectorRef < EFFECTIVE_ALPHABET; vectorRef++)
2167 putCkptNlm_FloatHi(theVector[charOrder[vectorRef]],ckptFile);
2168 }
2169
2170 /*---------------------------------------------------------------------------*/
2171 /* Code to put a matrix, vector-by-vector. */
2172 /*---------------------------------------------------------------------------*/
putCkptFreqMatrix(Nlm_FloatHi ** theMatrix,Int4 length,Int4 width,FILE * ckptFile)2173 static void LIBCALL putCkptFreqMatrix (Nlm_FloatHi **theMatrix, Int4 length, Int4 width, FILE * ckptFile)
2174 {
2175 int matrixRef; /*loop index*/
2176
2177 for (matrixRef = 0; matrixRef < length ; matrixRef++ )
2178 putFreqVector(theMatrix[matrixRef], width, ckptFile);
2179 }
2180
2181
2182 /*---------------------------------------------------------------------------*/
2183 /*Take a checkpoint at the end of the current PSI-BLAST round, stores
2184 query length, query, and position-specific target frequencies.
2185 Returns TRUE if checkpoint was sucessful and FALSE otherwise. */
2186 /*---------------------------------------------------------------------------*/
CddposTakeCheckpoint(posSearchItems * posSearch,compactSearchItems * compactSearch,CharPtr fileName,ValNodePtr * error_return)2187 Boolean LIBCALL CddposTakeCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, ValNodePtr *error_return)
2188 {
2189 FILE * checkFile; /*file in which to take the checkpoint*/
2190 Int4 length; /*length of query sequence, and an index for it*/
2191 Int4 i; /*indices to position and alphabet */
2192 Char localChar; /*temporary character*/
2193
2194 checkFile = FileOpen(fileName, "w");
2195 if (NULL == checkFile) {
2196 BlastConstructErrorMessage("CddposTakeCheckpoint", "Could not open checkpoint file", 1, error_return);
2197 return(FALSE);
2198 }
2199 length = compactSearch->qlength;
2200 putCkptInt4(length, checkFile);
2201 for(i = 0; i < length; i++) {
2202 localChar = getRes(compactSearch->query[i]);
2203 putCkptChar(localChar, checkFile);
2204 }
2205 putCkptFreqMatrix(posSearch->posFreqs,length,compactSearch->alphabetSize, checkFile);
2206 FileClose(checkFile);
2207 return(TRUE);
2208 }
2209
2210 /*---------------------------------------------------------------------------*/
2211 /*---------------------------------------------------------------------------*/
2212 /* modelled after posReadAlignment (posit.c, as of 12/28/2000) */
2213 /* instead of reading in a text alignment from a file, the posDesc array is */
2214 /* populated from the DenseDiag alignment */
2215 /*---------------------------------------------------------------------------*/
2216 /*---------------------------------------------------------------------------*/
CddposReadAlignment(compactSearchItems * compactSearch,SeqAlignPtr salp,Int4 numSeqs,Int4 alignLength,BioseqPtr bsp)2217 static posDesc** CddposReadAlignment(compactSearchItems *compactSearch,
2218 SeqAlignPtr salp, Int4 numSeqs,
2219 Int4 alignLength, BioseqPtr bsp)
2220 {
2221 posDesc **returnArray; /*array of sequences to return*/
2222 SeqAlignPtr salpThis;
2223 DenseDiagPtr ddp;
2224 SeqPortPtr spp;
2225 Int4 i,j,c; /*loop indices*/
2226 Int4 alignPos, queryOffset, subjectOffset, matchLength;
2227 Int4 index, splace, qplace, retrievalOffset, subjectLength;
2228 Uint1Ptr s;
2229
2230 returnArray = (posDesc**) MemNew(numSeqs * sizeof(posDesc *));
2231 if (NULL == returnArray) exit(EXIT_FAILURE);
2232 for (i = 0; i < numSeqs; i++) {
2233 returnArray[i] = (posDesc *) MemNew(alignLength * sizeof(posDesc));
2234 if (NULL == returnArray[i]) exit(EXIT_FAILURE);
2235 for(j = 0; j < alignLength; j++) {
2236 returnArray[i][j].letter = GAP_CHAR;
2237 returnArray[i][j].used = TRUE;
2238 }
2239 }
2240
2241 alignPos = 0; i = 0;
2242 ASSERT(NULL != bsp);
2243 spp = SeqPortNew(bsp,0,bsp->length-1,Seq_strand_unknown,Seq_code_ncbistdaa);
2244 for (index=0; index<alignLength; index++) {
2245 returnArray[i][index].letter = SeqPortGetResidue(spp);
2246 returnArray[i][index].used = TRUE;
2247 }
2248 spp = SeqPortFree(spp);
2249 salpThis = salp;
2250
2251 while (salpThis) {
2252 i++;
2253 ddp = (DenseDiagPtr) salpThis->segs;
2254 s = CddGetSequenceWithDenseDiag(ddp,FALSE, &retrievalOffset, &subjectLength);
2255 while (ddp) {
2256 queryOffset = ddp->starts[0];
2257 subjectOffset = ddp->starts[1] - retrievalOffset;
2258 matchLength = ddp->len;
2259 for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
2260 ASSERT(qplace < alignLength);
2261 ASSERT(i < numSeqs);
2262 returnArray[i][qplace].letter = (Int1) s[splace];
2263 returnArray[i][qplace].used = TRUE;
2264 }
2265 ddp = ddp->next;
2266 }
2267 s = MemFree(s);
2268 salpThis = salpThis->next;
2269 }
2270 return(returnArray);
2271 }
2272
2273
2274
2275 /*---------------------------------------------------------------------------*/
2276 /*---------------------------------------------------------------------------*/
2277 /* modelled after posProcessAlignment (posit.c, as of 12/28/2000) */
2278 /*---------------------------------------------------------------------------*/
2279 /*---------------------------------------------------------------------------*/
CddposProcessAlignment(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr salp,Int4 numSeqs,Int4 alignLength,BioseqPtr bsp)2280 void LIBCALL CddposProcessAlignment(posSearchItems *posSearch,
2281 compactSearchItems *compactSearch,
2282 SeqAlignPtr salp, Int4 numSeqs,
2283 Int4 alignLength, BioseqPtr bsp)
2284 {
2285 Int4 queryPos, alignPos, linePos; /*placeholder for pos. in query & alignmt*/
2286 Int4 *queryDesc; /*position correspondence between alignment and query*/
2287 Int4 seqIndex; /*counter for sequences*/
2288 posDesc ** alignArray;
2289 Int4 queryLength; /*length of query sequence*/
2290
2291 alignArray = CddposReadAlignment(compactSearch,salp,numSeqs,alignLength,bsp);
2292 queryDesc = (Int4 *) MemNew(alignLength * sizeof(Int4));
2293 if (NULL == queryDesc) exit(EXIT_FAILURE);
2294 for(alignPos = 0; alignPos < alignLength; alignPos++) queryDesc[alignPos] = GAP_HERE;
2295 alignPos = 0;
2296 queryPos = 0;
2297 for(linePos = 0; linePos < alignLength; linePos++) {
2298 queryDesc[alignPos] = queryPos;
2299 posSearch->posDescMatrix[0][queryPos].letter = alignArray[0][linePos].letter;
2300 posSearch->posDescMatrix[0][queryPos].used = TRUE;
2301 posSearch->posDescMatrix[0][queryPos].e_value = compactSearch->ethresh/2;
2302 posSearch->posDescMatrix[0][queryPos].leftExtent = 0;
2303 posSearch->posDescMatrix[0][queryPos].rightExtent = compactSearch->qlength - 1;
2304 posSearch->posC[queryPos][alignArray[0][linePos].letter]++;
2305 posSearch->posCount[queryPos]++;
2306 queryPos++;
2307 alignPos++;
2308 }
2309
2310
2311 queryLength = queryPos;
2312 for(seqIndex = 1; seqIndex < numSeqs; seqIndex++) {
2313 for(linePos = 0; linePos < alignLength; linePos++) {
2314 if (!(posSearch->posDescMatrix[0][queryDesc[linePos]].used)) {
2315 /*mark column as not participating*/
2316 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].used = FALSE;
2317 }
2318 else {
2319 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].letter = alignArray[seqIndex][linePos].letter;
2320 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].used = TRUE;
2321 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].e_value = compactSearch->ethresh/2;
2322 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].leftExtent = 0;
2323 posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].rightExtent = compactSearch->qlength;
2324 }
2325 }
2326 }
2327
2328 /*make terminal gaps unused*/
2329 for(seqIndex = 1; seqIndex < numSeqs; seqIndex++) {
2330 linePos = 0;
2331 while((linePos < queryLength) && (posSearch->posDescMatrix[seqIndex][linePos].letter == GAP_CHAR)) {
2332 posSearch->posDescMatrix[seqIndex][linePos].used = FALSE;
2333 linePos++;
2334 }
2335 linePos = queryLength - 1;
2336 while((linePos >= 0) && (posSearch->posDescMatrix[seqIndex][linePos].letter == GAP_CHAR)) {
2337 posSearch->posDescMatrix[seqIndex][linePos].used = FALSE;
2338 linePos--;
2339 }
2340 }
2341
2342 /* clean up */
2343 for (seqIndex=0;seqIndex<numSeqs;seqIndex++) {
2344 MemFree(alignArray[seqIndex]);
2345 }
2346 MemFree(alignArray);
2347 MemFree(queryDesc);
2348 }
2349
2350