1 /* $Id: cddposutil.c,v 1.22 2007/03/13 20:38:05 madden Exp $
2 *===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  cddposutil.c
27 *
28 * Author:  Aron Marchler-Bauer
29 *
30 * Initial Version Creation Date: 12/21/1999
31 *
32 * $Revision: 1.22 $
33 *
34 * File Description: CDD utilities involving position-specific scoring
35 *                   matrices (PSSMs)
36 *
37 * Modifications:
38 * --------------------------------------------------------------------------
39 * $Log: cddposutil.c,v $
40 * Revision 1.22  2007/03/13 20:38:05  madden
41 *   - In CddSetUpSearchInternalByLoc, don't cast DROPOFF_NUMBER_OF_BITS
42 *     to integer.
43 *
44 *   - In CddSetUpSearchWithReadDb, use integer division to compute dblen
45 *     to avoid possible truncation of an Int8 value.
46 *   [From Mike Gertz]
47 *
48 * Revision 1.21  2006/09/18 20:54:53  kans
49 * removed PROTEIN_ALPHABET - collides with new version in posit.h
50 *
51 * Revision 1.20  2005/10/22 02:04:53  thiessen
52 * plug memory leak
53 *
54 * Revision 1.19  2005/08/31 20:34:47  coulouri
55 * From Mike Gertz:
56 * in CddSetUpSearchWithReadDb:
57 *   Call BlastSingleQueryResultSize to get the hitlist size for
58 *   preliminary alignments as is done for other modes of blast; this
59 *   will increase the hitlist size for either modes with a final
60 *   traceback computation or modes that use composition-based
61 *   statistics.
62 *
63 * Revision 1.18  2004/03/31 17:58:51  papadopo
64 * Mike Gertz' changes for length adjustment calculations
65 *
66 * Revision 1.17  2004/01/22 15:43:57  bauer
67 * call BlastCalculateEffectiveLengths in CddSetUpSearchInternalByLoc
68 *
69 * Revision 1.16  2003/05/29 13:17:56  thiessen
70 * fix memory leak in CddposFreeMemory()
71 *
72 * Revision 1.15  2001/12/31 13:47:06  bauer
73 * made block_width a local variable in CddSetUpSearchInternalByLoc to deal with changes in blast data structures
74 *
75 * Revision 1.14  2001/02/06 20:54:43  hurwitz
76 * added a couple asserts
77 *
78 * Revision 1.13  2001/01/19 23:34:55  bauer
79 * fixed memory leaks
80 *
81 * Revision 1.12  2001/01/17 20:21:54  bauer
82 * merged in changes for PSSM calculation
83 *
84 * Revision 1.11  2001/01/03 11:11:49  bauer
85 * renamed posCancel to CddposCancel
86 *
87 * Revision 1.10  2000/12/29 00:52:21  hurwitz
88 * cleaning memory leaks
89 *
90 * Revision 1.9  2000/10/26 01:55:54  bauer
91 * renamed BLASTSetUpSearchInternalByLoc to CddSetUpSearchInternalByLoc
92 *
93 * Revision 1.8  2000/09/08 21:43:51  hurwitz
94 * adding PSSM calculation to DDE
95 *
96 * Revision 1.7  2000/08/17 19:00:24  hurwitz
97 * fixed some codewarrior compiler warnings
98 *
99 * Revision 1.6  2000/08/10 22:31:19  bauer
100 * CddposTakeCheckpoint now declared to return Boolean
101 *
102 * Revision 1.5  2000/08/09 21:29:08  hurwitz
103 * adding cddutil.c to VC++ build
104 *
105 * Revision 1.4  2000/08/08 21:18:58  bauer
106 * changed POSIT_SCALE_FACTOR from 1000 to 200
107 *
108 * Revision 1.3  2000/07/28 18:01:59  bauer
109 * fixed typecasts
110 *
111 * Revision 1.2  2000/07/19 19:36:22  bauer
112 * added modification logging
113 *
114 *
115 * ==========================================================================
116 */
117 
118 
119 #include <ncbi.h>
120 #include <blastpri.h>
121 #include <objcode.h>
122 #include <objseq.h>
123 #include <objsset.h>
124 #include <sequtil.h>
125 #include <posit.h>
126 #include <txalign.h>
127 #include "cddposutil.h"
128 #include <blast.h>
129 #include <objloc.h>
130 #include <accid1.h>
131 #include <cddutil.h>
132 #include <mblast.h>
133 #include <profiles.h>
134 /* #include <blastdef.h> */
135 
136 #define GAP_CHAR 0
137 #define GAP_HERE -1
138 #define POS_RESTING 0
139 #define POS_COUNTING 1
140 #define IDENTITY_RATIO 0.98
141 #define posEpsilon 0.0001
142 #define posEpsilon2 0.0000001
143 #define POSIT_SCALE_FACTOR2 200
144 #define EFFECTIVE_ALPHABET 20
145 
146 /* function definitions */
147 static Uint1Ptr LIBCALL CddGetSequenceWithDenseDiag(DenseDiagPtr ddp, Boolean query,
148                                                     Int4Ptr start, Int4Ptr stop);
149 static Uint1Ptr LIBCALL CddGetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query,
150                                                    Int4Ptr start, Int4Ptr length);
151 static void LIBCALL CddposCancel(posSearchItems *posSearch, compactSearchItems * compactSearch,
152                               Int4 first, Int4 second, Int4 matchStart, Int4 intervalLength);
153 static Nlm_FloatHi ** LIBCALL CddallocatePosFreqs(Int4 length, Int4 alphabetSize);
154 static Nlm_FloatHi LIBCALL countsFunction(Nlm_FloatHi Sigma, Int4 intervalLength);
155 static Nlm_FloatHi LIBCALL posit_rounddown(Nlm_FloatHi value);
156 static Int2 LIBCALL CddSetUpSearchInternalByLoc
157                        (BlastSearchBlkPtr search, SeqLocPtr query_slp, BioseqPtr query_bsp,
158                         CharPtr prog_name, Int4 qlen, BLAST_OptionsBlkPtr options,
159                         int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)));
160 static void LIBCALL putCkptNumber(void * numberPtr, Int4 numberSize, FILE * ckptFile);
161 static void LIBCALL putFreqVector(Nlm_FloatHi * theVector, Int4 length, FILE * ckptFile);
162 static void LIBCALL putCkptFreqMatrix (Nlm_FloatHi **theMatrix, Int4 length, Int4 width, FILE * ckptFile);
163 
164 /*---------------------------------------------------------------------------*/
165 /*Allocate memory for  data structures inside posSearch used in              */
166 /* position-specific caculations                                             */
167 /* posSearch -- to be filled in                                              */
168 /* alphabetSize -- number of distinct characters used in the sequences       */
169 /* querySize -- number of characters in the query sequence                   */
170 /* numSequences -- number of matching sequences potentially in the model     */
171 /*---------------------------------------------------------------------------*/
172 
CddposAllocateMemory(posSearchItems * posSearch,Int4 alphabetSize,Int4 querySize,Int4 numSequences)173 void LIBCALL CddposAllocateMemory(posSearchItems * posSearch, Int4 alphabetSize, Int4 querySize, Int4 numSequences)
174 {
175   Int4 i, j;                                                   /*loop indices*/
176 
177   posSearch->QuerySize = querySize;
178   posSearch->NumSequences = numSequences;
179 
180   posSearch->posCount = (Int4 *) MemNew(querySize * sizeof(Int4));
181   if (NULL == posSearch->posCount)
182     exit(EXIT_FAILURE);
183   for(i = 0; i < querySize; i++)
184     posSearch->posCount[i] = 0;
185 
186   posSearch->posC = (Int4 **) MemNew((querySize + 1) * sizeof(Int4 *));
187   if (NULL == posSearch->posC)
188     exit(EXIT_FAILURE);
189   for(i = 0; i <= querySize; i++) {
190     posSearch->posC[i] = (Int4 *) MemNew(alphabetSize * sizeof(Int4));
191     if (NULL == posSearch->posC[i])
192       exit(EXIT_FAILURE);
193     for(j = 0; j < alphabetSize; j++)
194       posSearch->posC[i][j] = 0;
195   }
196   posSearch->posMatchWeights = (Nlm_FloatHi **) MemNew((querySize+1) * sizeof(Nlm_FloatHi *));
197   if (NULL == posSearch->posMatchWeights)
198     exit(EXIT_FAILURE);
199   for (i = 0; i <= querySize ; i++) {
200     posSearch->posMatchWeights[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
201     if (NULL == posSearch->posMatchWeights[i])
202       exit(EXIT_FAILURE);
203     for(j = 0; j < alphabetSize; j++)
204       posSearch->posMatchWeights[i][j] = 0.0;
205   }
206   posSearch->posMatrix = (BLAST_Score **) MemNew((querySize + 1) * sizeof(BLAST_Score *));
207   posSearch->posPrivateMatrix = (BLAST_Score **) MemNew((querySize + 1) * sizeof(BLAST_Score *));
208   posSearch->posFreqs = (Nlm_FloatHi **) MemNew((querySize + 1) * sizeof(Nlm_FloatHi *));
209   if (NULL == posSearch->posMatrix) exit(EXIT_FAILURE);
210   if (NULL == posSearch->posPrivateMatrix) exit(EXIT_FAILURE);
211   if (NULL == posSearch->posFreqs) exit(EXIT_FAILURE);
212   for(i = 0; i <= querySize; i++) {
213     posSearch->posMatrix[i] = (BLAST_Score *) MemNew(alphabetSize * sizeof(BLAST_Score));
214     posSearch->posPrivateMatrix[i] = (BLAST_Score *) MemNew(alphabetSize * sizeof(BLAST_Score));
215     posSearch->posFreqs[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
216     if (NULL == posSearch->posMatrix[i]) exit(EXIT_FAILURE);
217     if (NULL == posSearch->posPrivateMatrix[i]) exit(EXIT_FAILURE);
218     if (NULL == posSearch->posFreqs[i]) exit(EXIT_FAILURE);
219     for(j = 0; j < alphabetSize; j++) {
220       posSearch->posMatrix[i][j] = 0;
221       posSearch->posFreqs[i][j] = 0;
222     }
223   }
224   posSearch->posSigma = (Nlm_FloatHi *) MemNew((querySize) * sizeof(Nlm_FloatHi));
225   if (NULL == posSearch->posSigma)
226     exit(EXIT_FAILURE);
227   for(i = 0; i < querySize; i++) {
228     posSearch->posSigma[i] = 0.0;
229   }
230   posSearch->posIntervalSizes = (Int4 *) MemNew((querySize) * sizeof(Int4));
231   if (NULL == posSearch->posIntervalSizes)
232     exit(EXIT_FAILURE);
233   for(i=0; i < querySize; i++)
234     posSearch->posIntervalSizes[i] = 0;
235   posSearch->posDescMatrixLength = numSequences;
236   posSearch->posDescMatrix = (posDesc **) MemNew((numSequences + 1) * sizeof(posDesc *));
237   if (NULL == posSearch->posDescMatrix)
238     exit(EXIT_FAILURE);
239   for (i = 0; i <= numSequences; i++) {
240     posSearch->posDescMatrix[i] = (posDesc *) MemNew(querySize * sizeof(posDesc));
241     if (NULL == posSearch->posDescMatrix[i])
242       exit(EXIT_FAILURE);
243     for(j = 0; j < querySize; j++) {
244       posSearch->posDescMatrix[i][j].letter = UNUSED;
245       posSearch->posDescMatrix[i][j].used = FALSE;
246       posSearch->posDescMatrix[i][j].e_value = 1.0;
247       posSearch->posDescMatrix[i][j].leftExtent = -1;
248       posSearch->posDescMatrix[i][j].rightExtent = querySize;
249     }
250   }
251   posSearch->posExtents = (posDesc *) MemNew(querySize * sizeof(posDesc));
252   if (NULL == posSearch->posExtents)
253     exit(EXIT_FAILURE);
254   for(j = 0; j < querySize; j++) {
255     posSearch->posExtents[j].used = FALSE;
256     posSearch->posExtents[j].leftExtent = -1;
257     posSearch->posExtents[j].rightExtent = querySize;
258   }
259    posSearch->posA = (Nlm_FloatHi *) MemNew((numSequences+ 1) * sizeof(Nlm_FloatHi));
260    if (NULL == posSearch->posA)
261      exit(EXIT_FAILURE);
262    posSearch->posRowSigma = (Nlm_FloatHi *) MemNew((numSequences + 1) * sizeof(Nlm_FloatHi));
263    if (NULL == posSearch->posRowSigma)
264      exit(EXIT_FAILURE);
265 }
266 
267 
CddposFreeMemory2(compactSearchItems * compactSearch)268 void LIBCALL CddposFreeMemory2(compactSearchItems * compactSearch)
269 {
270   MemFree(compactSearch->standardProb);
271 }
272 
CddposFreeMemory(posSearchItems * posSearch)273 void LIBCALL CddposFreeMemory(posSearchItems * posSearch) {
274 /*---------------------------------------------------------------------------*/
275 /*  free memory that was allocated in posAllocateMemory.                     */
276 /*---------------------------------------------------------------------------*/
277   Int4  i;
278 
279   MemFree(posSearch->posCount);
280   for (i=0; i<=posSearch->QuerySize; i++) {
281     MemFree(posSearch->posMatchWeights[i]);
282     MemFree(posSearch->posFreqs[i]);
283     MemFree(posSearch->posMatrix[i]);
284     MemFree(posSearch->posPrivateMatrix[i]);
285     MemFree(posSearch->posC[i]);
286   }
287   MemFree(posSearch->posC);
288   MemFree(posSearch->posMatchWeights);
289   MemFree(posSearch->posMatrix);
290   MemFree(posSearch->posPrivateMatrix);
291   MemFree(posSearch->posFreqs);
292 
293   for (i=0; i<=posSearch->posDescMatrixLength; i++) {
294     MemFree(posSearch->posDescMatrix[i]);
295   }
296   MemFree(posSearch->posDescMatrix);
297 
298   MemFree(posSearch->posSigma);
299   MemFree(posSearch->posIntervalSizes);
300   MemFree(posSearch->posExtents);
301   MemFree(posSearch->posA);
302   MemFree(posSearch->posRowSigma);
303   MemFree(posSearch->posInformation);
304   MemFree(posSearch->posUseSequences);
305   MemFree(posSearch->posGaplessColumnWeights);
306 
307   PSIMatrixFrequencyRatiosFree(posSearch->stdFreqRatios);
308 }
309 
310 
311 /*---------------------------------------------------------------------------*/
312 /* Copy a few fields from the lasrge record search into the small record     */
313 /* compactSearch, so that a small amount of information                      */
314 /* is passed into posit.c                                                    */
315 /*---------------------------------------------------------------------------*/
CddcopySearchItems(compactSearchItems * compactSearch,BlastSearchBlkPtr search)316 void LIBCALL CddcopySearchItems(compactSearchItems *compactSearch,
317                                 BlastSearchBlkPtr   search)
318 {
319    BLAST_ResFreqPtr stdrfp; /* gets standard frequencies in prob field */
320    Int4 a; /*index over characters*/
321 
322    compactSearch->query = search->context[0].query->sequence;
323    compactSearch->qlength = search->context[0].query->length;
324    compactSearch->gapped_calculation = search->pbp->gapped_calculation;
325    compactSearch->alphabetSize = search->sbp->alphabet_size;
326    compactSearch->pseudoCountConst = search->pbp->pseudoCountConst;
327    compactSearch->ethresh = search->pbp->ethresh;
328    compactSearch->matrix = search->sbp->matrix;
329    compactSearch->kbp_psi = search->sbp->kbp_psi;
330    compactSearch->kbp_gap_psi = search->sbp->kbp_gap_psi;
331    compactSearch->kbp_std = search->sbp->kbp_std;
332    compactSearch->kbp_gap_std = search->sbp->kbp_gap_std;
333    compactSearch->use_best_align = search->pbp->use_best_align;
334 
335 /*---------------------------------------------------------------------------*/
336 /* these are removed, not initialized in CddCposComputation                  */
337 /*---------------------------------------------------------------------------*/
338 /*   compactSearch->lambda =  search->sbp->kbp[0]->Lambda; */
339 /*   compactSearch->lambda_ideal = search->sbp->kbp_ideal->Lambda; */
340 /*   compactSearch->K_ideal = search->sbp->kbp_ideal->K; */
341 
342    stdrfp = BlastResFreqNew(search->sbp);
343    BlastResFreqStdComp(search->sbp,stdrfp);
344    compactSearch->standardProb = MemNew(compactSearch->alphabetSize * sizeof(Nlm_FloatHi));
345    if (NULL == compactSearch->standardProb)
346      exit(EXIT_FAILURE);
347    for(a = 0; a < compactSearch->alphabetSize; a++)
348      compactSearch->standardProb[a] = stdrfp->prob[a];
349    stdrfp = BlastResFreqDestruct(stdrfp);
350 }
351 
352 /*---------------------------------------------------------------------------*/
353 /* - find all sequences, basically, for Cdd-specific purposes                */
354 /*---------------------------------------------------------------------------*/
CddfindThreshSequences(posSearchItems * posSearch,SeqAlignPtr listOfSeqAligns,Int4 numalign,Int4 numseq)355 void LIBCALL CddfindThreshSequences(posSearchItems *posSearch, SeqAlignPtr listOfSeqAligns, Int4 numalign, Int4 numseq)
356 {
357   SeqAlignPtr curSeqAlign;                /* pointers into list of seqAligns */
358   DenseSegPtr curSegs;                          /* Item in list of seqAligns */
359   SeqIdPtr    thisId;         /* Ids of target sequences in current SeqAlign */
360   Int4        ordinalNumber;            /* index of sequence within database */
361 
362 /* Allocate boolean array to store values */
363   posSearch->posResultSequences = (Int4 *) MemNew(numalign * sizeof(Int4));
364   posSearch->posResultsCounter = 0;
365 
366   curSeqAlign = listOfSeqAligns;
367   ordinalNumber = -1;
368   while (curSeqAlign) {
369     curSegs = (DenseSegPtr) curSeqAlign->segs;
370     thisId = curSegs->ids->next;
371 /* Get ordinal ids of sequences in result */
372     ordinalNumber++;
373     posSearch->posResultSequences[posSearch->posResultsCounter]=ordinalNumber;
374     posSearch->posResultsCounter++;
375     curSeqAlign = curSeqAlign->next;
376   }
377 }
378 
379 /*---------------------------------------------------------------------------*/
380 /* - find all sequences, basically, for Cdd-specific purposes                */
381 /*---------------------------------------------------------------------------*/
CddfindDenseDiagThreshSequences(posSearchItems * posSearch,SeqAlignPtr listOfSeqAligns,Int4 numalign,Int4 numseq)382 void LIBCALL CddfindDenseDiagThreshSequences(posSearchItems *posSearch, SeqAlignPtr listOfSeqAligns, Int4 numalign, Int4 numseq)
383 {
384   SeqAlignPtr  curSeqAlign;               /* pointers into list of seqAligns */
385   DenseDiagPtr curSegs;                         /* Item in list of seqAligns */
386   SeqIdPtr     thisId;        /* Ids of target sequences in current SeqAlign */
387   Int4         ordinalNumber;           /* index of sequence within database */
388 
389 /* Allocate boolean array to store values */
390   posSearch->posResultSequences = (Int4 *) MemNew(numalign * sizeof(Int4));
391   posSearch->posResultsCounter = 0;
392 
393   curSeqAlign = listOfSeqAligns;
394   ordinalNumber = -1;
395   while (curSeqAlign) {
396     curSegs = (DenseDiagPtr) curSeqAlign->segs;
397     thisId = curSegs->id->next;
398 /* Get ordinal ids of sequences in result */
399     ordinalNumber++;
400     posSearch->posResultSequences[posSearch->posResultsCounter]=ordinalNumber;
401     posSearch->posResultsCounter++;
402     curSeqAlign = curSeqAlign->next;
403   }
404 }
405 
406 
407 /*---------------------------------------------------------------------------*/
408 /*---------------------------------------------------------------------------*/
CddposInitializeInformation(posSearchItems * posSearch,BLAST_ScoreBlkPtr sbp,compactSearchItems * compactSearch)409 void LIBCALL CddposInitializeInformation(posSearchItems *posSearch, BLAST_ScoreBlkPtr sbp, compactSearchItems *compactSearch)
410 {
411   Uint1Ptr query;
412   Int4 querySize;
413   Int4 c, a, alphabetSize;
414   BLAST_ResFreqPtr stdrfp; /*standard frequencies*/
415   Nlm_FloatHi lambda;
416   Nlm_FloatHi term1, term2, term3, term4;
417   Nlm_FloatHi infoSum;
418 
419   querySize = compactSearch->qlength;
420   query = compactSearch->query;
421   posSearch->posInformation = (Nlm_FloatHi *) MemNew(querySize * sizeof(Nlm_FloatHi));
422   if (NULL == posSearch->posInformation) exit(EXIT_FAILURE);
423   for(c = 0; c < querySize; c++)
424     posSearch->posInformation[c] = 0.0;
425   alphabetSize = compactSearch->alphabetSize;
426   /*Compute standard frequencies as in BlastScoreBlkFill in blastkar.c*/
427   stdrfp = BlastResFreqNew(sbp);
428   BlastResFreqStdComp(sbp,stdrfp);
429   lambda = sbp->kbp[0]->Lambda;
430   for(c = 0; c < querySize; c++) {
431     infoSum = 0;
432     for(a = 0; a < alphabetSize; a++)
433       if (stdrfp->prob[a] > posEpsilon) {
434         term1 = sbp->matrix[query[c]][a];
435 	      term2 = term1 * lambda;
436 	      term3 = exp(term2);
437 	      term4 = stdrfp->prob[a] * term3;
438 	      infoSum += term4 * log(term4/stdrfp->prob[a])/NCBIMATH_LN2;
439       }
440     posSearch->posInformation[c] = infoSum;
441   }
442 }
443 
444 
445 /*---------------------------------------------------------------------------*/
446 /* have added sequence retrieval via ID1 in case BioseqLockById fails        */
447 /*---------------------------------------------------------------------------*/
CddGetSequenceWithDenseDiag(DenseDiagPtr ddp,Boolean query,Int4Ptr start,Int4Ptr stop)448 static Uint1Ptr LIBCALL CddGetSequenceWithDenseDiag(DenseDiagPtr ddp, Boolean query,
449                                                     Int4Ptr start, Int4Ptr stop)
450 {
451   BioseqPtr    bsp;
452   Int4         index, offset;
453   Int4         length;
454   SeqIdPtr     id;
455   SeqPortPtr   spp;
456   Uint1Ptr     buffer;
457   DenseDiagPtr ddpThis;
458   SeqEntryPtr  sep;
459   Int4         uid;
460   Int2         retcode = 1;
461 
462   ddpThis = ddp;
463 
464   if (ddpThis == NULL) return NULL;
465   if (query == TRUE) {
466     id = ddpThis->id;
467   } else {
468     offset = 1;
469     id = ddpThis->id->next;
470   }
471 
472   *start = ddpThis->starts[offset];
473   *stop = 0;
474   while (ddpThis) {
475     *stop = ddpThis->starts[offset] + ddpThis->len - 1;
476     ddpThis = ddpThis->next;
477   }
478 	bsp = BioseqLockById(id);
479   if (bsp) {
480   	spp = SeqPortNew(bsp, *start, *stop, Seq_strand_unknown, Seq_code_ncbistdaa);
481     length = *stop - *start + 1;
482 	  buffer = MemNew((length)*sizeof(Uint1));
483 	  for (index=0; index<length; index++) buffer[index] = SeqPortGetResidue(spp);
484 	  spp = SeqPortFree(spp);
485 	  BioseqUnlock(bsp);
486 	  return buffer;
487   } else {
488     uid = ID1FindSeqId(id);
489     sep = (SeqEntryPtr) ID1SeqEntryGet(uid,retcode);
490     bsp = BioseqFindInSeqEntry(id,sep);
491   	spp = SeqPortNew(bsp, *start, *stop, Seq_strand_unknown, Seq_code_ncbistdaa);
492     length = *stop - *start + 1;
493 	  buffer = MemNew((length)*sizeof(Uint1));
494 	  for (index=0; index<length; index++) buffer[index] = SeqPortGetResidue(spp);
495 	  spp = SeqPortFree(spp);
496 	  return buffer;
497   }
498 }
499 
500 /*---------------------------------------------------------------------------*/
501 /*---------------------------------------------------------------------------*/
CddGetSequenceWithDenseSeg(DenseSegPtr dsp,Boolean query,Int4Ptr start,Int4Ptr length)502 static Uint1Ptr LIBCALL CddGetSequenceWithDenseSeg(DenseSegPtr dsp, Boolean query,
503                                                    Int4Ptr start, Int4Ptr length)
504 {
505   BioseqPtr  bsp;
506   Int4       index, offset;
507   SeqIdPtr   id;
508   SeqPortPtr spp;
509   Uint1Ptr   buffer;
510 
511   if (dsp == NULL) return NULL;
512 
513   if (query == TRUE) {
514     offset = 0;
515     id = dsp->ids;
516   } else {
517     offset = 1;
518     id = dsp->ids->next;
519   }
520   *start = dsp->starts[offset];
521   *length = 0;
522   for (index=0; index<dsp->numseg; index++) {
523     if (dsp->starts[offset+2*index] != -1) *length += dsp->lens[index];
524   }
525   bsp = BioseqLockById(id);
526   spp = SeqPortNew(bsp, *start, (*start)+(*length)-1, Seq_strand_unknown, Seq_code_ncbistdaa);
527   buffer = MemNew((*length)*sizeof(Uint1));
528   for (index=0; index<*length; index++) buffer[index] = SeqPortGetResidue(spp);
529   spp = SeqPortFree(spp);
530   BioseqUnlock(bsp);
531   return buffer;
532 }
533 
534 /*---------------------------------------------------------------------------*/
535 /* Eliminate the matches from sequence second starting at position           */
536 /* matchStart and extending for intervalLength characters                    */
537 /*---------------------------------------------------------------------------*/
CddposCancel(posSearchItems * posSearch,compactSearchItems * compactSearch,Int4 first,Int4 second,Int4 matchStart,Int4 intervalLength)538 static void LIBCALL CddposCancel(posSearchItems *posSearch, compactSearchItems * compactSearch,
539                               Int4 first, Int4 second, Int4 matchStart, Int4 intervalLength)
540 {
541   Int4 c, i;
542   Boolean stillNeeded;
543 
544   for(c = matchStart, i = 0; i < intervalLength; i++, c++) {
545     posSearch->posDescMatrix[second][c].used = FALSE;
546     posSearch->posDescMatrix[second][c].letter = 0;
547   }
548   stillNeeded = FALSE;
549   for (c = 0; c < compactSearch->qlength; c++)
550     if (posSearch->posDescMatrix[second][c].used) {
551       stillNeeded = TRUE;
552       break;
553     }
554    if (!stillNeeded)
555       posSearch->posUseSequences[second] = FALSE;
556 }
557 
558 
559 
560 /*---------------------------------------------------------------------------*/
561 /*Eliminate sequences that are identical to the query and partial alignments
562   that are identical in two matching sequences*/
563 /*---------------------------------------------------------------------------*/
CddposPurgeMatches(posSearchItems * posSearch,compactSearchItems * compactSearch)564 void LIBCALL CddposPurgeMatches(posSearchItems *posSearch, compactSearchItems * compactSearch)
565 {
566   Int4    i, j;                                        /*index over sequences*/
567   Boolean matchesQuery;      /*Is a matching sequence identical to the query?*/
568   Int4    c;                   /*index over demographics of matching sequence*/
569   Int4    state;                              /*state of checking for a match*/
570   Int4    intervalLength, matchStart; /*Length and start of a matching region*/
571   Int4    matchNumber;                        /*number of characters matching*/
572 
573   posSearch->posUseSequences =  (Boolean *) MemNew((posSearch->posNumSequences + 1) * sizeof(Boolean));
574    if (NULL == posSearch->posUseSequences)  exit(EXIT_FAILURE);
575   for(i = 0; i <= posSearch->posNumSequences; i++)
576     posSearch->posUseSequences[i] = TRUE;
577   for(i = 1; i <= posSearch->posNumSequences; i++) {
578     matchesQuery = TRUE;
579     for (c = 0; c < compactSearch->qlength; c++) {
580       if ((!posSearch->posDescMatrix[i][c].used) ||
581           (posSearch->posDescMatrix[i][c].letter !=
582            posSearch->posDescMatrix[0][c].letter)) {
583         matchesQuery = FALSE;
584         break;
585       }
586     }
587     if (matchesQuery) posSearch->posUseSequences[i] = FALSE;
588   }
589   for(j = 1; j <= posSearch->posNumSequences; j++) {
590     if (!posSearch->posUseSequences[j]) continue;
591     state = POS_COUNTING;
592     c = 0;
593     matchStart = 0;
594     intervalLength = 0;
595     matchNumber = 0;
596     while (c < compactSearch->qlength) {
597       if (posSearch->posDescMatrix[j][c].used) {
598         if ((posSearch->posDescMatrix[0][c].letter != Xchar) &&
599 	          (posSearch->posDescMatrix[j][c].letter != Xchar)) {
600 	        if (state == POS_RESTING) {
601 	          matchStart = c;
602 	          intervalLength = 1;
603 	          state = POS_COUNTING;
604 	          matchNumber = 0;
605 	        } else  intervalLength++;
606 	        if (posSearch->posDescMatrix[j][c].used &&
607 	            (posSearch->posDescMatrix[0][c].letter == posSearch->posDescMatrix[j][c].letter))
608 	          matchNumber++;
609 	      }
610       } else {
611 	      if (state == POS_COUNTING) {
612 	        if ((intervalLength > 0) && (matchNumber == intervalLength))
613 	          CddposCancel(posSearch,compactSearch,0,j,matchStart,intervalLength);
614 	        state = POS_RESTING;
615 	      }
616       }
617       c++;
618     }
619     if (state == POS_COUNTING) /*at end of sequence i*/
620       if ((intervalLength > 0) && (matchNumber == intervalLength))
621 	CddposCancel(posSearch,compactSearch,0,j,matchStart,intervalLength);
622   }
623   for (i = 1; i < posSearch->posNumSequences; i++) {
624     if (!posSearch->posUseSequences[i])
625       continue;
626     for(j = i+1; j <= posSearch->posNumSequences; j++) {
627       if (!posSearch->posUseSequences[j])
628 	continue;
629       state = POS_COUNTING;
630       c = 0;
631       matchStart = 0;
632       intervalLength = 0;
633       matchNumber = 0;
634       while (c < compactSearch->qlength) {
635 	if (posSearch->posDescMatrix[i][c].used ||
636 	    posSearch->posDescMatrix[j][c].used) {
637 	  if ((posSearch->posDescMatrix[i][c].letter != Xchar) &&
638 	      (posSearch->posDescMatrix[j][c].letter != Xchar)) {
639 	    if (state == POS_RESTING) {
640 	      matchStart = c;
641 	      intervalLength = 1;
642 	      state = POS_COUNTING;
643 	      matchNumber = 0;
644 	    }
645 	    else
646 	      intervalLength++;
647 	    if (posSearch->posDescMatrix[i][c].used &&
648 		posSearch->posDescMatrix[j][c].used &&
649 		(posSearch->posDescMatrix[i][c].letter == posSearch->posDescMatrix[j][c].letter))
650 	      matchNumber++;
651 	    }
652 	}
653 	else {
654 	  if (state == POS_COUNTING) {
655 	    if ((intervalLength > 0) && ((((Nlm_FloatHi) matchNumber)/intervalLength) >= IDENTITY_RATIO))
656 	      CddposCancel(posSearch,compactSearch,i,j,matchStart,intervalLength);
657 	    state = POS_RESTING;
658 	  }
659 	}
660 	c++;
661       }
662       if (state == POS_COUNTING) /*at end of sequence i*/
663 	if ((intervalLength > 0) && ((((Nlm_FloatHi) matchNumber)/intervalLength) >= IDENTITY_RATIO))
664 	  CddposCancel(posSearch,compactSearch,i,j,matchStart,intervalLength);
665     }
666   }
667 }
668 
669 /*---------------------------------------------------------------------------*/
670 /*---------------------------------------------------------------------------*/
CddposComputeExtents(posSearchItems * posSearch,compactSearchItems * compactSearch)671 void LIBCALL CddposComputeExtents(posSearchItems *posSearch, compactSearchItems * compactSearch)
672 {
673    Int4     seqIndex;                                     /*index of sequence*/
674    Int4     length;                                         /*length of query*/
675    Int4     qplace, qplace2;                                 /*place in query*/
676    Int4     numseq;                     /*number of sequences including query*/
677    Uint1Ptr q;                                         /*pointers into query */
678 
679    length = compactSearch->qlength;
680    numseq = posSearch->posNumSequences;
681    q = compactSearch->query;
682    for(seqIndex = 0; seqIndex < numseq; seqIndex++) {
683      if (!posSearch->posUseSequences[seqIndex+1])
684        continue;
685      if ((posSearch->posDescMatrix[seqIndex+1][0].used)
686 	 && (posSearch->posDescMatrix[seqIndex+1][length-1].letter != GAP_CHAR))
687        posSearch->posDescMatrix[seqIndex+1][0].leftExtent = 0;
688      for(qplace = 1; qplace < length; qplace++)
689        if(posSearch->posDescMatrix[seqIndex+1][qplace].used) {
690 	 if(posSearch->posDescMatrix[seqIndex+1][qplace-1].used)
691 	   posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent =
692 	     posSearch->posDescMatrix[seqIndex+1][qplace -1].leftExtent;
693 	 else
694 	   posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent = qplace;
695        }
696      if ((posSearch->posDescMatrix[seqIndex+1][length-1].used)
697 	 && (posSearch->posDescMatrix[seqIndex+1][length-1].letter != GAP_CHAR))
698        posSearch->posDescMatrix[seqIndex+1][length-1].rightExtent = length -1;
699      for(qplace = length -2; qplace >= 0; qplace--)
700        if(posSearch->posDescMatrix[seqIndex+1][qplace].used) {
701 	 if(posSearch->posDescMatrix[seqIndex+1][qplace+1].used)
702 	   posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent =
703 	     posSearch->posDescMatrix[seqIndex+1][qplace + 1].rightExtent;
704 	 else
705 	   posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent = qplace;
706        }
707      for(qplace = 0; qplace < length; qplace++)
708        if (posSearch->posDescMatrix[seqIndex+1][qplace].used) {
709 	 posSearch->posExtents[qplace].leftExtent = MAX(posSearch->posExtents[qplace].leftExtent,
710 							posSearch->posDescMatrix[seqIndex+1][qplace].leftExtent);
711 	 posSearch->posExtents[qplace].rightExtent = MIN(posSearch->posExtents[qplace].rightExtent,
712 							 posSearch->posDescMatrix[seqIndex+1][qplace].rightExtent);
713 
714        }
715 
716      for(qplace = 0; qplace < length; qplace++)
717      /*used to check qplace for GAP_CHAR here*/
718        if (posSearch->posDescMatrix[seqIndex+1][qplace].used) {
719 	 posSearch->posC[qplace][posSearch->posDescMatrix[seqIndex+1][qplace].letter]++;
720 	 posSearch->posCount[qplace]++; /*Add to number of matches in this query position*/
721        }
722    }
723    for(qplace = 0; qplace < length; qplace++)
724      posSearch->posIntervalSizes[qplace] = posSearch->posExtents[qplace].rightExtent -
725        posSearch->posExtents[qplace].leftExtent + 1;
726    for(qplace =0; qplace < length; qplace++) {
727      if(Xchar == q[qplace]) {
728        posSearch->posIntervalSizes[qplace] = 0;
729        for(qplace2 = 0; qplace2 <qplace; qplace2++) {
730 	 if(posSearch->posExtents[qplace2].rightExtent >= qplace)
731 	   posSearch->posIntervalSizes[qplace2]--;
732        }
733      }
734    }
735  }
736 
737 /*---------------------------------------------------------------------------*/
738 /*Compute weight of each sequence and letter in each position*/
739 /*---------------------------------------------------------------------------*/
CddposComputeSequenceWeights(posSearchItems * posSearch,compactSearchItems * compactSearch)740 void LIBCALL CddposComputeSequenceWeights(posSearchItems *posSearch,
741                                           compactSearchItems * compactSearch)
742 {
743    Int4        length;                                      /*length of query*/
744    Int4        numseq, seqIndex;          /*number of matches, index for them*/
745    Int4        i;                        /*index over a multi-alignment block*/
746    Int4        qplace;                                     /*index into query*/
747    Nlm_FloatHi Sigma;   /*Number of different characters occurring in matches
748                   within a multi-alignment block, excluding identical columns*/
749    Nlm_FloatHi intervalSigma;  /*Same as Sigma but includes identical columns*/
750    Int4        alphabetSize;               /*number of characters in alphabet*/
751    Int4        *participatingSequences;   /*array of part. seq. at a position*/
752    Int4        *oldParticipatingSequences;/*array of part. seq. at a position*/
753    Int4        posLocalVariety;/*number of different characters at a position*/
754    Int4        *posLocalC; /*counts of how many of each letter in this column*/
755    Int4        c;
756    Int4        thisSeq;
757    Int4        numParticipating;     /*number of seq. in this alignment block*/
758    Int4        oldNumParticipating;  /*number of seq. in this alignment block*/
759    Boolean     newSequenceSet;
760    Int4        p;                                        /*index on sequences*/
761 
762    alphabetSize = compactSearch->alphabetSize;
763    length = compactSearch->qlength;
764    numseq = posSearch->posNumSequences;
765    participatingSequences = (Int4 *) MemNew((numseq+1) * sizeof(Int4));
766    if (NULL == participatingSequences)
767      exit(EXIT_FAILURE);
768    oldParticipatingSequences = (Int4 *) MemNew((numseq+1) * sizeof(Int4));
769    if (NULL == oldParticipatingSequences)
770      exit(EXIT_FAILURE);
771    posLocalC = (Int4 *) MemNew(alphabetSize * sizeof(Int4));
772    if (NULL == posLocalC)
773      exit(EXIT_FAILURE);
774    for (qplace = 0; qplace < length; qplace++) {
775      posSearch->posSigma[qplace] = 0.0;
776    }
777    numParticipating = 0;
778    for(qplace = 0; qplace < length; qplace++) {
779      if ((posSearch->posCount[qplace] > 1) && (posSearch->posIntervalSizes[qplace] > 0)) {
780        oldNumParticipating = numParticipating;
781        for(p =0; p < numParticipating; p++)
782          oldParticipatingSequences[p] = participatingSequences[p];
783        numParticipating = 0;
784        for (seqIndex = 0; seqIndex <= numseq; seqIndex++) {
785          if (!posSearch->posUseSequences[seqIndex]) continue;
786 	       if ((posSearch->posDescMatrix[seqIndex][qplace].used) &&
787 	           (posSearch->posDescMatrix[seqIndex][qplace].letter != GAP_CHAR)) {
788 	         participatingSequences[numParticipating] = seqIndex;
789 	         numParticipating++;
790 	       }
791        }
792        newSequenceSet = TRUE;
793        if (numParticipating == oldNumParticipating) {
794          for(p = 0; p < numParticipating; p++)
795            if (oldParticipatingSequences[p] != participatingSequences[p]) break;
796          if (p == numParticipating) newSequenceSet = FALSE;
797        }
798        if (newSequenceSet) {
799 	     Sigma = 0;
800 	     intervalSigma = 0;
801 	     for (seqIndex = 0; seqIndex <= numseq; seqIndex++) {
802 	       if (!posSearch->posUseSequences[seqIndex]) continue;
803 	       posSearch->posRowSigma[seqIndex] = 0.0;
804 	       posSearch->posA[seqIndex] = 0.0;
805 	     }
806 	     for (i = posSearch->posExtents[qplace].leftExtent;
807 	          i <= posSearch->posExtents[qplace].rightExtent; i++) {
808 	       posLocalVariety = 0;
809 	       for(c = 0; c < alphabetSize; c++) posLocalC[c] = 0;
810 	       for(seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
811 	         thisSeq = participatingSequences[seqIndex];
812 /* used to check for GAP here */
813 	         if (0 == posLocalC[posSearch->posDescMatrix[thisSeq][i].letter])
814 /* letter (not a gap) not seen before in this query pos. */
815 	           posLocalVariety++;
816 	         posLocalC[posSearch->posDescMatrix[thisSeq][i].letter]++;
817 	       }
818 	       intervalSigma += posLocalVariety;
819 	       if (posLocalVariety > 1) {
820             Sigma += posLocalVariety;
821 	          for(seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
822 	            thisSeq = participatingSequences[seqIndex];
823 /* used to check for gap here */
824 	            posSearch->posRowSigma[thisSeq] += ( 1.0 / (Nlm_FloatHi) posLocalC[posSearch->posDescMatrix[thisSeq][i].letter]);
825 	          }
826 	        }
827 	      }
828       }
829       if (Sigma > 0) {
830         for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
831 	        thisSeq = participatingSequences[seqIndex];
832 	        posSearch->posA[thisSeq] = posSearch->posRowSigma[thisSeq]/Sigma;
833 	      }
834       } else {
835         for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
836           thisSeq = participatingSequences[seqIndex];
837 	        posSearch->posA[thisSeq] = ((Nlm_FloatHi) 1 / (Nlm_FloatHi) numParticipating);
838         }
839       }
840       posSearch->posSigma[qplace] = intervalSigma;
841       for (seqIndex = 0; seqIndex < numParticipating; seqIndex++) {
842 	      thisSeq = participatingSequences[seqIndex];
843         posSearch->posMatchWeights[qplace][posSearch->posDescMatrix[thisSeq][qplace].letter] += posSearch->posA[thisSeq];
844       }
845     }
846   }
847   MemFree(participatingSequences);
848   MemFree(oldParticipatingSequences);
849   MemFree(posLocalC);
850 }
851 
852 /*---------------------------------------------------------------------------*/
853 /*check that weights add to 1 in each column */
854 /*---------------------------------------------------------------------------*/
CddposCheckWeights(posSearchItems * posSearch,compactSearchItems * compactSearch)855 void LIBCALL CddposCheckWeights(posSearchItems *posSearch, compactSearchItems * compactSearch)
856 {
857    Uint1Ptr q;  /*pointer to query*/
858    Int4 length, alphabetSize; /*length of query and number of characters in alphabet*/
859    Int4  a, c; /*loop indices*/
860    Nlm_FloatHi runningSum; /*partial total for a column*/
861 
862    length = compactSearch->qlength;
863    alphabetSize = compactSearch->alphabetSize;
864 
865    q = compactSearch->query;
866    for(c = 0; c < length; c++) {
867      if ((posSearch->posCount[c] > 1) && (q[c] != Xchar)) {
868        runningSum = 0;
869        for(a = 0; a < alphabetSize; a++)
870            runningSum += posSearch->posMatchWeights[c][a];
871        if((runningSum < 0.99) || (runningSum > 1.01))
872          ErrPostEx(SEV_ERROR, 0, 0, "\nERROR IN WEIGHTS, column %d, value %lf\n",c, runningSum);
873      }
874    }
875 }
876 
877 /*---------------------------------------------------------------------------*/
878 /*---------------------------------------------------------------------------*/
CddallocatePosFreqs(Int4 length,Int4 alphabetSize)879 static Nlm_FloatHi ** LIBCALL CddallocatePosFreqs(Int4 length, Int4 alphabetSize)
880 {
881   Int4 c, i; /*loop indices*/
882   Nlm_FloatHi ** returnArray;
883 
884   returnArray = (Nlm_FloatHi **) MemNew((length + 1) * sizeof(Nlm_FloatHi *));
885   if (NULL == returnArray)
886     exit(EXIT_FAILURE);
887   for(i = 0; i <= length; i++) {
888     returnArray[i] = (Nlm_FloatHi *) MemNew(alphabetSize * sizeof(Nlm_FloatHi));
889     if (NULL == returnArray[i])
890       exit(EXIT_FAILURE);
891     for(c = 0; c < alphabetSize; c++)
892       returnArray[i][c] = 0.0;
893   }
894   return(returnArray);
895 }
896 
897 
898 /*---------------------------------------------------------------------------*/
899 /*---------------------------------------------------------------------------*/
countsFunction(Nlm_FloatHi Sigma,Int4 intervalLength)900 static Nlm_FloatHi LIBCALL countsFunction(Nlm_FloatHi Sigma, Int4 intervalLength)
901 {
902   return(Sigma / intervalLength - 1);
903 }
904 
905 
906 /*---------------------------------------------------------------------------*/
907 /*---------------------------------------------------------------------------*/
CddposComputePseudoFreqs(posSearchItems * posSearch,compactSearchItems * compactSearch,Boolean Cpos)908 Nlm_FloatHi ** LIBCALL CddposComputePseudoFreqs(posSearchItems *posSearch, compactSearchItems * compactSearch, Boolean Cpos)
909 {
910    Uint1Ptr q;  /*pointer to the query*/
911    Int4 length;  /*length of the query*/
912    Int4 c; /*loop index*/
913    Int4 a, aSub, alphabetSize; /*loop indices and size of alphabet*/
914    Nlm_FloatHi lambda; /*Karlin-Altschul parameter*/
915    Nlm_FloatHi Sigma;  /*number of characters in an interval*/
916    Int4 intervalLength;  /*length of a block*/
917    Nlm_FloatHi pseudo, numerator, denominator, qOverPEstimate; /*intermediate terms*/
918    Nlm_FloatHi infoSum; /*sum used for information content*/
919    Nlm_FloatHi **posFreqs; /*store frequencies*/
920 
921    q = compactSearch->query;
922    length = compactSearch->qlength;
923 
924    alphabetSize = compactSearch->alphabetSize;
925    lambda = compactSearch->lambda_ideal;
926    posFreqs = CddallocatePosFreqs(length, alphabetSize);
927 
928    for(c = 0; c < length; c++) {
929      if ((posSearch->posCount[c] > 1) && (Xchar != q[c])) {
930        infoSum = 0;
931        for(a = 0; a < alphabetSize; a++) {
932          if (compactSearch->standardProb[a] > posEpsilon) {
933 	   pseudo = 0;
934 	   for (aSub = 0; aSub < alphabetSize; aSub++)
935 	     if(compactSearch->matrix[a][aSub] != BLAST_SCORE_MIN)
936 	       pseudo += (posSearch->posMatchWeights[c][aSub] *
937 			exp(lambda * compactSearch->matrix[a][aSub]));
938 	   pseudo *= (compactSearch->pseudoCountConst);
939            Sigma = posSearch->posSigma[c];
940            intervalLength = posSearch->posIntervalSizes[c];
941 	   numerator = pseudo +
942              (countsFunction(Sigma, intervalLength) * posSearch->posMatchWeights[c][a]/
943                 compactSearch->standardProb[a]);
944 	   denominator = countsFunction(Sigma, intervalLength) + (compactSearch->pseudoCountConst);
945 	   qOverPEstimate = numerator / denominator;
946 	   /*Note artificial multiplication by standard probability to
947              normalize*/
948            posFreqs[c][a] = qOverPEstimate * compactSearch->standardProb[a];
949 	 if (0.0 != qOverPEstimate && (compactSearch->standardProb[a] > posEpsilon))
950 	   infoSum += qOverPEstimate * compactSearch->standardProb[a] * log(qOverPEstimate)/ NCBIMATH_LN2;
951 	 }
952         else
953           posFreqs[c][a] = 0.0;
954        }
955        if (Cpos)
956 	 posSearch->posInformation[c] = infoSum;
957      }
958      else
959        for(a = 0; a < alphabetSize; a++) {
960          posFreqs[c][a] = 0;
961        }
962    }
963   return(posFreqs);
964 }
965 
966 /*---------------------------------------------------------------------------*/
967 /*---------------------------------------------------------------------------*/
968 /* from posit.c as of 12/29/2000                                             */
969 /*---------------------------------------------------------------------------*/
970 /*---------------------------------------------------------------------------*/
CddfreePosFreqs(Nlm_FloatHi ** posFreqs,Int4 length)971 static void CddfreePosFreqs(Nlm_FloatHi ** posFreqs, Int4 length)
972 {
973   Int4 i;
974 
975   for (i = 0; i <= length; i++)
976     MemFree(posFreqs[i]);
977   MemFree(posFreqs);
978 }
979 
980 /*---------------------------------------------------------------------------*/
981 /*---------------------------------------------------------------------------*/
982 /*---------------------------------------------------------------------------*/
983 /*---------------------------------------------------------------------------*/
posit_rounddown(Nlm_FloatHi value)984 static Nlm_FloatHi LIBCALL posit_rounddown(Nlm_FloatHi value)
985 {
986   return (Nlm_FloatHi) Nlm_Nint(value);
987 }
988 
989 
990 
991 /*---------------------------------------------------------------------------*/
992 /*Convert pseudo-count frequencies to a score matrix                         */
993 /*---------------------------------------------------------------------------*/
CddposFreqsToMatrix(posSearchItems * posSearch,compactSearchItems * compactSearch)994 void LIBCALL CddposFreqsToMatrix(posSearchItems *posSearch, compactSearchItems * compactSearch)
995 {
996    Uint1Ptr q;                                         /*pointer to the query*/
997    Int4 length;                                         /*length of the query*/
998    Int4 c;                                                       /*loop index*/
999    Int4 a, alphabetSize;                    /*loop index and size of alphabet*/
1000    Nlm_FloatHi lambda;                            /*Karlin-Altschul parameter*/
1001    Nlm_FloatHi  qOverPEstimate, value;                   /*intermediate terms*/
1002    Boolean allZeros;                     /*are all frequencies in a column 0?*/
1003 
1004    q = compactSearch->query;
1005    length = compactSearch->qlength;
1006 
1007    alphabetSize = compactSearch->alphabetSize;
1008    lambda = compactSearch->lambda_ideal;
1009 
1010    for(c = 0; c < length; c++) {
1011      allZeros = TRUE;
1012      for(a = 0; a < alphabetSize; a++) {
1013        /*Division compensates for multiplication in posComputePsedoFreqs*/
1014        if (compactSearch->standardProb[a] > posEpsilon)
1015 	 qOverPEstimate = posSearch->posFreqs[c][a]/compactSearch->standardProb[a];
1016        else
1017         qOverPEstimate = 0.0;
1018        if (qOverPEstimate != 0.0)
1019          allZeros = FALSE;
1020        if (0.0 == qOverPEstimate || (compactSearch->standardProb[a] < posEpsilon))
1021 	 posSearch->posPrivateMatrix[c][a] = BLAST_SCORE_MIN;
1022        else {
1023 	 value = log(qOverPEstimate)/lambda;
1024 	 posSearch->posPrivateMatrix[c][a] = (BLAST_Score) posit_rounddown(POSIT_SCALE_FACTOR2*value);
1025        }
1026      }
1027      if (allZeros) {
1028        for(a = 0; a < alphabetSize; a++) {
1029          posSearch->posMatrix[c][a] = compactSearch->matrix[q[c]][a];
1030 	 if (compactSearch->matrix[q[c]][a] == BLAST_SCORE_MIN)
1031 		posSearch->posPrivateMatrix[c][a] = BLAST_SCORE_MIN;
1032 	 else
1033          	posSearch->posPrivateMatrix[c][a] = POSIT_SCALE_FACTOR2*compactSearch->matrix[q[c]][a];
1034        }
1035      }
1036    }
1037    for(a = 0; a < alphabetSize; a++) {
1038      posSearch->posPrivateMatrix[length][a] = posSearch->posPrivateMatrix[length][a] = BLAST_SCORE_MIN;
1039    }
1040 }
1041 
1042 /*---------------------------------------------------------------------------*/
1043 /*---------------------------------------------------------------------------*/
CddposScaling(posSearchItems * posSearch,compactSearchItems * compactSearch)1044 void LIBCALL CddposScaling(posSearchItems *posSearch, compactSearchItems * compactSearch)
1045 {
1046 	BlastMatrixRescalePtr matrix_rescale;
1047 
1048 	matrix_rescale = BlastMatrixRescaleNew(compactSearch->alphabetSize,
1049 						compactSearch->qlength,
1050 						compactSearch->query,
1051 						compactSearch->standardProb,
1052 						posSearch->posMatrix,
1053 						posSearch->posPrivateMatrix,
1054 						compactSearch->kbp_std,
1055 						compactSearch->kbp_psi,
1056 						compactSearch->kbp_gap_std,
1057 						compactSearch->kbp_gap_psi,
1058 						compactSearch->lambda_ideal,
1059 						compactSearch->K_ideal);
1060 
1061 	BlastScaleMatrix(matrix_rescale, TRUE);
1062 
1063 	matrix_rescale = BlastMatrixRescaleDestruct(matrix_rescale);
1064 
1065 	return;
1066 }
1067 
1068 
1069 /*---------------------------------------------------------------------------*/
1070 /*---------------------------------------------------------------------------*/
1071 /* Compute general information about the sequences that matched on the       */
1072 /* i-th pass such as how many matched at each query position and what letter */
1073 /* matched                                                                   */
1074 /*---------------------------------------------------------------------------*/
1075 /*---------------------------------------------------------------------------*/
CddposDenseDiagDemographics(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr listOfSeqAligns)1076 void LIBCALL CddposDenseDiagDemographics(posSearchItems     *posSearch,
1077                                          compactSearchItems *compactSearch,
1078                                          SeqAlignPtr        listOfSeqAligns)
1079 {
1080    Uint1Ptr     q;                                     /*pointers into query */
1081    Uint1Ptr     s;                          /*pointer into a matching string */
1082    Int4         length, subjectLength;          /*length of query and subject*/
1083    Int4         c;                                      /*index into a string*/
1084    Int4         numseq, numSeqAligns; /*number of matching seq. and SeqAligns*/
1085    Int4         seqIndex;         /*index for the array of matching sequences*/
1086    Int4         matchLength;                              /*length of a match*/
1087    Int4         queryOffset, subjectOffset, retrievalOffset;  /*offsets needed to make a match align*/
1088    Int4         qplace, splace; /*index into query string and matching string*/
1089    SeqAlignPtr  curSeqAlign;                  /*pointers into listOfSeqAligns*/
1090    DenseDiagPtr curSegs;           /*used to extract alignm. from curSeqAlign*/
1091    SeqIdPtr     curId; /*Used to compare seq. coming from different SeqAligns*/
1092    Int4         startQ, startS;    /*Indices into array of starting positions*/
1093    Int4         segIndex;                   /*Index for which piece we are at*/
1094    Boolean      is_new_id = FALSE;
1095 
1096    q = compactSearch->query;
1097    length = compactSearch->qlength;
1098    for(c = 0; c < length; c++) {
1099      posSearch->posDescMatrix[0][c].letter = (Int1) q[c];
1100      posSearch->posDescMatrix[0][c].used = TRUE;
1101      posSearch->posDescMatrix[0][c].leftExtent = 0;
1102      posSearch->posDescMatrix[0][c].rightExtent = length;
1103      posSearch->posDescMatrix[0][c].e_value = compactSearch->ethresh/2;
1104      posSearch->posC[c][q[c]]++;
1105      posSearch->posCount[c]++;
1106    }
1107 
1108    numSeqAligns = CddCountDenDiagSeqAligns(listOfSeqAligns, &numseq);
1109 
1110    posSearch->posNumSequences = numSeqAligns;
1111    seqIndex = 0;
1112    curSeqAlign = listOfSeqAligns;
1113    for(curSeqAlign = listOfSeqAligns; curSeqAlign != NULL; curSeqAlign = curSeqAlign->next) {
1114      curSegs = (DenseDiagPtr) curSeqAlign->segs;
1115      s = CddGetSequenceWithDenseDiag(curSegs, FALSE, &retrievalOffset, &subjectLength);
1116      startQ = 0; startS = 1;
1117      while (curSegs) {
1118        queryOffset = curSegs->starts[0];
1119        subjectOffset = curSegs->starts[1] - retrievalOffset;
1120        matchLength = curSegs->len;
1121        for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
1122          if (!posSearch->posDescMatrix[seqIndex+1][qplace].used) {
1123            posSearch->posDescMatrix[seqIndex+1][qplace].letter = (Int1) s[splace];
1124            posSearch->posDescMatrix[seqIndex+1][qplace].used = TRUE;
1125            posSearch->posDescMatrix[seqIndex+1][qplace].e_value = 0;
1126          }
1127        }
1128        curSegs = curSegs->next;
1129      }
1130      s = MemFree(s);
1131      seqIndex++;
1132    } /*closes the while loop over seqAligns*/
1133 }
1134 
1135 
1136 
1137 
1138 /*---------------------------------------------------------------------------*/
1139 /* Compute general information about the sequences that matched on the       */
1140 /* i-th pass such as how many matched at each query position and what letter */
1141 /* matched                                                                   */
1142 /*---------------------------------------------------------------------------*/
CddposDemographics(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr listOfSeqAligns)1143 void LIBCALL CddposDemographics(posSearchItems *posSearch,
1144                                 compactSearchItems *compactSearch,
1145 				SeqAlignPtr listOfSeqAligns)
1146 {
1147    Uint1Ptr    q;                                      /*pointers into query */
1148    Uint1Ptr    s;                           /*pointer into a matching string */
1149    Int4        length, subjectLength;           /*length of query and subject*/
1150    Int4        c;                                       /*index into a string*/
1151    Int4        numseq, numSeqAligns;  /*number of matching seqs and SeqAligns*/
1152    Int4        seqIndex;          /*index for the array of matching sequences*/
1153    Int4        matchLength;                               /*length of a match*/
1154    Int4        queryOffset, subjectOffset, retrievalOffset;  /*offsets needed to make a match align*/
1155    Int4        qplace, splace;  /*index into query string and matching string*/
1156    SeqAlignPtr curSeqAlign, prevSeqAlign;     /*pointers into listOfSeqAligns*/
1157    DenseSegPtr curSegs, prevSegs; /*used to extract alignmts from curSeqAlign*/
1158    SeqIdPtr    curId, prevId;/*Use to comp. seqs coming from difft. SeqAligns*/
1159    Int4        startQ, startS;     /*Indices into array of starting positions*/
1160    Int4        numsegs;            /*Number of pieces in the gapped alignment*/
1161    Int4        segIndex;                    /*Index for which piece we are at*/
1162    Boolean     is_new_id = FALSE;
1163 
1164    q = compactSearch->query;
1165    length = compactSearch->qlength;
1166    for(c = 0; c < length; c++) {
1167      posSearch->posDescMatrix[0][c].letter = (Int1) q[c];
1168      posSearch->posDescMatrix[0][c].used = TRUE;
1169      posSearch->posDescMatrix[0][c].leftExtent = 0;
1170      posSearch->posDescMatrix[0][c].rightExtent = length;
1171      posSearch->posDescMatrix[0][c].e_value = compactSearch->ethresh/2;
1172      posSearch->posC[c][q[c]]++;
1173      posSearch->posCount[c]++;
1174    }
1175 
1176    numSeqAligns = CddCountSeqAligns(listOfSeqAligns, &numseq);
1177 
1178    posSearch->posNumSequences = numSeqAligns;
1179    seqIndex = 0;
1180    curSeqAlign = listOfSeqAligns;
1181    prevSeqAlign = NULL;
1182    for(curSeqAlign = listOfSeqAligns; curSeqAlign != NULL; curSeqAlign = curSeqAlign->next) {
1183      is_new_id = FALSE;
1184      curSegs = (DenseSegPtr) curSeqAlign->segs;
1185      if (NULL != prevSeqAlign) {
1186        prevSegs = (DenseSegPtr) prevSeqAlign->segs;
1187        if(curSegs->ids == NULL) break;
1188        curId = curSegs->ids->next;
1189        prevId = prevSegs->ids->next;
1190        if (!(SeqIdMatch(curId, prevId))) is_new_id = TRUE;
1191      }
1192      if(is_new_id == TRUE) seqIndex++;
1193      s = CddGetSequenceWithDenseSeg(curSegs, FALSE, &retrievalOffset, &subjectLength);
1194      startQ = 0; startS = 1;
1195      numsegs = curSegs->numseg;
1196      for(segIndex = 0; segIndex < numsegs; segIndex++) {
1197        queryOffset = curSegs->starts[startQ];
1198        if (curSegs->starts[startS] != GAP_HERE)
1199          subjectOffset = curSegs->starts[startS] - retrievalOffset;
1200        else
1201          subjectOffset = GAP_HERE;
1202        matchLength = curSegs->lens[segIndex];
1203        if ((GAP_HERE ) == queryOffset) {
1204          ; /*do nothing, gap in query*/
1205        } else if ((GAP_HERE) == subjectOffset) {
1206          for(c = 0, qplace = queryOffset; c < matchLength; c++, qplace++) {
1207            posSearch->posDescMatrix[seqIndex + 1][qplace].used = TRUE;
1208            posSearch->posDescMatrix[seqIndex + 1][qplace].letter = GAP_CHAR;
1209            posSearch->posDescMatrix[seqIndex + 1][qplace].e_value = 1.0;
1210          }
1211        } else {  /*no gap*/
1212          for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
1213            if ((!posSearch->posDescMatrix[seqIndex+1][qplace].used))
1214              if (!posSearch->posDescMatrix[seqIndex+1][qplace].used) {
1215                posSearch->posDescMatrix[seqIndex+1][qplace].letter = (Int1) s[splace];
1216                posSearch->posDescMatrix[seqIndex+1][qplace].used = TRUE;
1217                posSearch->posDescMatrix[seqIndex+1][qplace].e_value = 0;
1218              }
1219          }
1220        }
1221        startQ += 2;
1222        startS += 2;
1223      }
1224      prevSeqAlign = curSeqAlign;
1225      s = MemFree(s);
1226    } /*closes the while loop over seqAligns*/
1227 }
1228 
1229 
1230 /*---------------------------------------------------------------------------*/
1231 /* copied from BLASTSetUpSearchInternalByLoc                                 */
1232 /*---------------------------------------------------------------------------*/
1233 #define DROPOFF_NUMBER_OF_BITS 10.0
1234 #define INDEX_THR_MIN_SIZE 20000
CddSetUpSearchInternalByLoc(BlastSearchBlkPtr search,SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)))1235 static Int2 LIBCALL CddSetUpSearchInternalByLoc
1236                        (BlastSearchBlkPtr search, SeqLocPtr query_slp, BioseqPtr query_bsp,
1237                         CharPtr prog_name, Int4 qlen, BLAST_OptionsBlkPtr options,
1238                         int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)))
1239 {
1240 	BioseqPtr bsp_temp, bsp;
1241 	Boolean mask_at_hash=FALSE, private_slp_delete;
1242 	Boolean query_is_na, db_is_na;
1243 	Char buffer[128];
1244 	Int2 retval, status;
1245 	Int4 effective_query_length, query_length, full_query_length,
1246 		index, length, length_adjustment=0, last_length_adjustment, min_query_length;
1247 	Int4 array_size, max_length, block_width;
1248 	Int4Ptr open, extend;
1249 	Nlm_FloatHiPtr lambda, K, H;
1250 	Nlm_FloatHi avglen;
1251 	ReadDBFILEPtr rdfp;
1252 	SeqIdPtr query_id;
1253 	ObjectIdPtr oip;
1254 	SeqPortPtr spp=NULL, spp_reverse=NULL;
1255 	SeqLocPtr filter_slp=NULL, private_slp=NULL, private_slp_rev=NULL;
1256 	GeneticCodePtr gcp;
1257 	Uint1 residue, strand;
1258 	Uint1Ptr sequence;
1259 	Uint1Ptr query_seq, query_seq_start, query_seq_rev, query_seq_start_rev;
1260 	ValNodePtr vnp;
1261 
1262 	if (options == NULL)
1263 	{
1264 	  	ErrPostEx(SEV_FATAL, 0, 0, "BLAST_OptionsBlkPtr is NULL\n");
1265 		return 1;
1266 	}
1267 
1268 	if (query_slp == NULL && query_bsp == NULL)
1269 	{
1270 	  	ErrPostEx(SEV_FATAL, 0, 0, "Query is NULL\n");
1271 		return 1;
1272 	}
1273 
1274 	query_seq = NULL;	/* Gets rid of warning. */
1275 	query_seq_rev = NULL;	/* Gets rid of warning. */
1276 	query_seq_start = NULL;	/* Gets rid of warning. */
1277 	query_seq_start_rev = NULL;	/* Gets rid of warning. */
1278 
1279 	if (query_slp)
1280 	{
1281 		strand = SeqLocStrand(query_slp);
1282 		if (strand == Seq_strand_unknown || strand == Seq_strand_plus || strand == Seq_strand_both)
1283 		{
1284 			private_slp = SeqLocIntNew(SeqLocStart(query_slp), SeqLocStop(query_slp), Seq_strand_plus, SeqLocId(query_slp));
1285 		}
1286 		if (strand == Seq_strand_minus || strand == Seq_strand_both)
1287 		{
1288 			private_slp_rev = SeqLocIntNew(SeqLocStart(query_slp), SeqLocStop(query_slp), Seq_strand_minus, SeqLocId(query_slp));
1289 		}
1290 		private_slp_delete = TRUE;
1291 	}
1292 	else
1293 	{
1294 		private_slp = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_plus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1295 		private_slp_rev = SeqLocIntNew(0, query_bsp->length-1 , Seq_strand_minus, SeqIdFindBest(query_bsp->id, SEQID_GI));
1296 		private_slp_delete = FALSE;
1297 	}
1298 
1299 	query_length = 0;
1300 	if (private_slp)
1301 		query_length = SeqLocLen(private_slp);
1302 	else if (private_slp_rev)
1303 		query_length = SeqLocLen(private_slp_rev);
1304 	if (query_length == 0)
1305 	{
1306 		sprintf(buffer, "No valid query sequence");
1307 		BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1308 		return 1;
1309 	}
1310 
1311 	bsp = NULL;
1312 	if (private_slp)
1313 		bsp = BioseqLockById(SeqLocId(private_slp));
1314 	else if (private_slp_rev)
1315 		bsp = BioseqLockById(SeqLocId(private_slp_rev));
1316 
1317 	if (bsp == NULL)
1318 	{
1319 	  	ErrPostEx(SEV_WARNING, 0, 0, "No valid query sequence, BioseqLockById returned NULL\n");
1320 		return 1;
1321 	}
1322 
1323 	full_query_length = bsp->length;
1324 
1325 	BlastGetTypes(prog_name, &query_is_na, &db_is_na);
1326 	if (query_is_na != ISA_na(bsp->mol))
1327 	{
1328 	  	ErrPostEx(SEV_WARNING, 0, 0, "Query molecule is incompatible with %s program", prog_name);
1329 		BioseqUnlock(bsp);
1330 		return 1;
1331 	}
1332 
1333 	if (bsp->repr == Seq_repr_virtual)
1334 	{
1335 		BioseqUnlock(bsp);
1336 	  	ErrPostEx(SEV_WARNING, 0, 0, "Virtual sequence detected\n");
1337 		return 1;
1338 	}
1339 	BioseqUnlock(bsp);
1340 
1341 	if (query_slp)
1342 	{
1343 		search->query_slp = query_slp;
1344 	}
1345 	else
1346 	{
1347 		search->query_slp = private_slp;
1348 		search->allocated += BLAST_SEARCH_ALLOC_QUERY_SLP;
1349 	}
1350 
1351 
1352 	search->translation_buffer = NULL;
1353 	search->translation_buffer_size = 0;
1354 
1355 	/*
1356 	Get genetic codes (should be determined from BLAST_OptionsBlkPtr.
1357 	Only needed for blastx, tblast[nx]
1358 	*/
1359 	if (StringCmp(prog_name, "blastp") != 0 && StringCmp(prog_name, "blastn") != 0)
1360 	{
1361 		if (StringCmp(prog_name, "tblastx") == 0 || StringCmp(prog_name, "tblastn") == 0)
1362 		{
1363 			gcp = GeneticCodeFind(options->db_genetic_code, NULL);
1364 			for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
1365 			{
1366 				if (vnp->choice == 3)	/* ncbieaa */
1367 				{
1368 					search->db_genetic_code = (CharPtr)vnp->data.ptrvalue;
1369 					break;
1370 				}
1371 			}
1372 			search->translation_table = (Uint1Ptr) GetPrivatTranslationTable(search->db_genetic_code, FALSE);
1373 			search->translation_table_rc = (Uint1Ptr) GetPrivatTranslationTable(search->db_genetic_code, TRUE);
1374 			max_length = 0;
1375 			rdfp = search->rdfp;
1376 			while (rdfp)
1377 			{
1378 				max_length = MAX(max_length, readdb_get_maxlen(rdfp));
1379 				rdfp = rdfp->next;
1380 			}
1381 			search->translation_buffer = MemNew((3+(max_length/3))*sizeof(Uint1));
1382 			search->translation_buffer_size = 1+(max_length/3);
1383 			search->allocated += BLAST_SEARCH_ALLOC_TRANS_INFO;
1384 		}
1385 
1386 		if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0)
1387 		{
1388 			gcp = GeneticCodeFind(options->genetic_code, NULL);
1389 			for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
1390 			{
1391 				if (vnp->choice == 3)	/* ncbieaa */
1392 				{
1393 					search->genetic_code = (CharPtr)vnp->data.ptrvalue;
1394 					break;
1395 				}
1396 			}
1397 		}
1398 	}
1399 
1400 	if (options->filter && !options->filter_string)
1401 		options->filter_string = (CharPtr) BlastConstructFilterString(options->filter);
1402 
1403 	/* If the query is translated do this below. */
1404 	if (StringCmp(prog_name, "blastx") && StringCmp(prog_name, "tblastx")) {
1405 		if (private_slp)
1406 			filter_slp = BlastSeqLocFilterEx(private_slp, options->filter_string, &mask_at_hash);
1407 		else if (private_slp_rev)
1408 			filter_slp = BlastSeqLocFilterEx(private_slp_rev, options->filter_string, &mask_at_hash);
1409 	}
1410 
1411 
1412 	/*
1413            Dusting of query sequence. Only needed for blastn, optional
1414         */
1415 
1416         if(StringCmp(prog_name, "blastn") == 0) {
1417 		if (filter_slp && !mask_at_hash)
1418 			ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1419         }
1420 
1421 	if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "tblastn") == 0)
1422 	{
1423 		spp = SeqPortNewByLoc(private_slp, Seq_code_ncbistdaa);
1424                 SeqPortSet_do_virtual(spp, TRUE);
1425 		if (filter_slp && !mask_at_hash)
1426 			ValNodeAddPointer(&(search->mask), SEQLOC_MASKING_NOTSET, filter_slp);
1427 	}
1428 	else if (StringCmp(prog_name, "blastx") == 0 || StringCmp(prog_name, "tblastx") == 0 || StringCmp(prog_name, "blastn") == 0)
1429 	{
1430 		if (private_slp)
1431 		{
1432 			spp = SeqPortNewByLoc(private_slp, Seq_code_ncbi4na);
1433                 	SeqPortSet_do_virtual(spp, TRUE);
1434 		}
1435 		if (private_slp_rev)
1436 		{
1437 			spp_reverse = SeqPortNewByLoc(private_slp_rev, Seq_code_ncbi4na);
1438                 	SeqPortSet_do_virtual(spp_reverse, TRUE);
1439 		}
1440 	}
1441 	else
1442 	{
1443 	  	ErrPostEx(SEV_FATAL, 0, 0, "Only blastn, blastp, blastx, tblastn tblastx is allowed\n");
1444 		return 1;
1445 	}
1446 
1447 	if (spp)
1448 	{
1449 		query_seq_start = (Uint1Ptr) MemNew(((query_length)+2)*sizeof(Char));
1450 		query_seq_start[0] = NULLB;
1451 		query_seq = query_seq_start+1;
1452 		index=0;
1453 		while ((residue=SeqPortGetResidue(spp)) != SEQPORT_EOF)
1454 		{
1455 
1456 			if (IS_residue(residue))
1457 			{
1458 				query_seq[index] = residue;
1459 				index++;
1460 			}
1461 		}
1462 		query_seq[index] = NULLB;
1463 		spp = SeqPortFree(spp);
1464 		if (StringCmp(prog_name, "blastn") == 0)
1465 		{
1466 			if (filter_slp)
1467 			{
1468 				if (mask_at_hash)
1469                 			search->context[0].location =
1470                         			BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1471 				else
1472 					BlastMaskTheResidues(query_seq, full_query_length, 15, filter_slp, FALSE, SeqLocStart(private_slp));
1473 			}
1474 			for (index=0; index<query_length; index++)
1475 				query_seq[index] = ncbi4na_to_blastna[query_seq[index]];
1476 		}
1477 	}
1478 
1479 	if (spp_reverse)
1480 	{
1481 		query_seq_start_rev = (Uint1Ptr) MemNew(((query_length)+2)*sizeof(Char));
1482 		query_seq_start_rev[0] = NULLB;
1483 		query_seq_rev = query_seq_start_rev+1;
1484 		index=0;
1485 		while ((residue=SeqPortGetResidue(spp_reverse)) != SEQPORT_EOF)
1486 		{
1487 			if (IS_residue(residue))
1488 			{
1489 				query_seq_rev[index] = residue;
1490 				index++;
1491 			}
1492 		}
1493 		query_seq_rev[index] = NULLB;
1494 		spp_reverse = SeqPortFree(spp_reverse);
1495 		if (StringCmp(prog_name, "blastn") == 0)
1496 		{
1497 			if (filter_slp)
1498 			{
1499 				if (mask_at_hash)
1500                 			search->context[1].location =
1501                         			BlastSeqLocFillDoubleInt(filter_slp, query_length, TRUE);
1502 				else
1503 					BlastMaskTheResidues(query_seq_rev, full_query_length, 15, filter_slp, TRUE, full_query_length - SeqLocStop(private_slp_rev) - 1);
1504 			}
1505 			for (index=0; index<query_length; index++)
1506 				query_seq_rev[index] = ncbi4na_to_blastna[query_seq_rev[index]];
1507 		}
1508 	}
1509 
1510 /*
1511 	Set the context_factor, which specifies how many different
1512 	ways the query or db is examined (e.g., blastn looks at both
1513 	stands of query, context_factor is 2).
1514 */
1515 	if (StringCmp(prog_name, "blastp") == 0)
1516 	{
1517 		search->context_factor = 1;
1518 		length = query_length;
1519 	}
1520 	else if (StringCmp(prog_name, "blastn") == 0)
1521 	{	/* two strands */
1522 		search->context_factor = (search->last_context-search->first_context+1);
1523 		length = query_length;
1524 	}
1525 	else if (StringCmp(prog_name, "blastx") == 0)
1526 	{	/* query translated in six frames. */
1527 		search->context_factor = search->last_context-search->first_context+1;
1528 		length = query_length/3;
1529 	}
1530 	else if (StringCmp(prog_name, "tblastn") == 0)
1531 	{	/* db translated in six frames. */
1532 		search->context_factor = 6;
1533 		length = query_length;
1534 	}
1535 	else if (StringCmp(prog_name, "tblastx") == 0)
1536 	{	/* db and query each translated in six frames. */
1537 		search->context_factor = 6*CODON_LENGTH*(search->last_context-search->first_context+1);
1538 		length = query_length/3;
1539 	}
1540 
1541 	if (private_slp)
1542 		query_id = SeqIdFindBest(SeqLocId(private_slp), SEQID_GI);
1543 	else
1544 		query_id = SeqIdFindBest(SeqLocId(private_slp_rev), SEQID_GI);
1545 
1546 	search->query_id = SeqIdDup(query_id);
1547 
1548 /* Store the query sequence, or the translation thereof. */
1549 	if (StringCmp(prog_name, "blastp") == 0 || StringCmp(prog_name, "tblastn") == 0)
1550 	{	/* One blastp context for now. */
1551 		if (filter_slp)
1552 		{
1553 			if (mask_at_hash)
1554                 		search->context[0].location =
1555                         		BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1556 			else
1557 				BlastMaskTheResidues(query_seq, full_query_length, 21, filter_slp, FALSE, SeqLocStart(private_slp));
1558 		}
1559 		BlastSequenceAddSequence(search->context[0].query, NULL, query_seq_start, query_length, query_length, 0);
1560 	}
1561 	else if (StringCmp(prog_name, "blastx") == 0  || StringCmp(prog_name, "tblastx") == 0)
1562 	{
1563 
1564 		for (index=search->first_context; index<=search->last_context; index++)
1565 		{
1566 		   if (search->context[index].query->frame > 0)
1567 		   {
1568 			sequence = GetTranslation(query_seq, query_length, search->context[index].query->frame, &length, search->genetic_code);
1569 		   }
1570 		   else
1571 		   {
1572 			sequence = GetTranslation(query_seq_rev, query_length, search->context[index].query->frame, &length, search->genetic_code);
1573 		   }
1574 		   if (options->filter_string && length > 0)
1575 		   {
1576 		  	bsp_temp = (BioseqPtr) BlastMakeTempProteinBioseq(sequence+1, length, Seq_code_ncbistdaa);
1577 			bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1578 			/*
1579 			bsp_temp->id = search->query_id;
1580 			*/
1581 			oip = (ObjectIdPtr) UniqueLocalId();
1582 			ValNodeAddPointer(&(bsp_temp->id), SEQID_LOCAL, oip);
1583 			SeqMgrAddToBioseqIndex(bsp_temp);
1584 
1585 			filter_slp = BlastBioseqFilterEx(bsp_temp, options->filter_string, &mask_at_hash);
1586 			HackSeqLocId(filter_slp, search->query_id);
1587 
1588 			SeqMgrDeleteFromBioseqIndex(bsp_temp);
1589 
1590 			bsp_temp->id = SeqIdSetFree(bsp_temp->id);
1591 			bsp_temp = BioseqFree(bsp_temp);
1592 			if (mask_at_hash)
1593 			{
1594                 		search->context[index].location =
1595 					BlastSeqLocFillDoubleInt(filter_slp, query_length, FALSE);
1596 			}
1597 			else
1598 			{
1599 				BlastMaskTheResidues(sequence+1, length, 21, filter_slp, FALSE, SeqLocStart(private_slp));
1600 				BlastConvertProteinSeqLoc(filter_slp, search->context[index].query->frame, query_length);
1601 			}
1602 			if (filter_slp && !mask_at_hash)
1603 				ValNodeAddPointer(&(search->mask), FrameToDefine(search->context[index].query->frame), filter_slp);
1604 		   }
1605 		   BlastSequenceAddSequence(search->context[index].query, NULL, sequence, length, query_length, 0);
1606 		}
1607 		query_seq_start = MemFree(query_seq_start);
1608 		query_seq_start_rev = MemFree(query_seq_start_rev);
1609 	}
1610 	else if (StringCmp(prog_name, "blastn") == 0)
1611 	{
1612 		if (search->first_context == 0)
1613 			BlastSequenceAddSequence(search->context[0].query, NULL, query_seq_start, query_length, query_length, 0);
1614 		if (search->last_context == 1)
1615 			BlastSequenceAddSequence(search->context[1].query, NULL, query_seq_start_rev, query_length, query_length, 0);
1616 	}
1617 
1618 	if (mask_at_hash)
1619 	{ /* No longer needed. */
1620 		filter_slp = SeqLocSetFree(filter_slp);
1621 	}
1622 
1623 /* Set the ambiguous residue before the ScoreBlk is filled. */
1624 	if (StringCmp(prog_name, "blastn") != 0)
1625 	{
1626 		search->sbp->read_in_matrix = TRUE;
1627 		BlastScoreSetAmbigRes(search->sbp, 'X');
1628 	}
1629 	else
1630 	{
1631   	        if(options->matrix!=NULL) {
1632 		     search->sbp->read_in_matrix = TRUE;
1633 	        } else {
1634 		     search->sbp->read_in_matrix = FALSE;
1635 	        }
1636 		BlastScoreSetAmbigRes(search->sbp, 'N');
1637 	}
1638 
1639 
1640 	search->sbp->penalty = options->penalty;
1641 	search->sbp->reward = options->reward;
1642 
1643         /* option is to use alignments choosen by user in PSM computation API (used in WWW PSI-Blast); */
1644 	search->pbp->use_best_align = options->use_best_align;
1645 
1646 	/* Should culling be used at all? */
1647 	search->pbp->perform_culling = options->perform_culling;
1648 	search->pbp->hsp_range_max = options->hsp_range_max;
1649         /* This assures that search->pbp->max_pieces is at least one wide. */
1650         block_width = MIN(query_length, options->block_width);
1651         if (block_width > 0)
1652                 search->pbp->max_pieces = query_length/block_width;
1653 
1654 	search->sbp->query_length = query_length;
1655 
1656 	search->result_struct = BLASTResultsStructNew(search->result_size, search->pbp->max_pieces, search->pbp->hsp_range_max);
1657 	if (options->matrix != NULL)
1658 		status = BlastScoreBlkMatFill(search->sbp, options->matrix);
1659 	else
1660 		status = BlastScoreBlkMatFill(search->sbp, "BLOSUM62");
1661 	if (status != 0)
1662 	{
1663 		ErrPostEx(SEV_WARNING, 0, 0, "BlastScoreBlkMatFill returned non-zero status");
1664 		return 1;
1665 	}
1666 
1667 	/* This is used right below. */
1668 	search->pbp->gapped_calculation = options->gapped_calculation;
1669 	search->pbp->do_not_reevaluate = options->do_not_reevaluate;
1670 	search->pbp->do_sum_stats = options->do_sum_stats;
1671 	search->pbp->first_db_seq = options->first_db_seq;
1672 	search->pbp->final_db_seq = options->final_db_seq;
1673 
1674 	retval = 0;
1675 	for (index=search->first_context; index<=search->last_context; index++)
1676 	{
1677 		status = BlastScoreBlkFill(search->sbp, (CharPtr) search->context[index].query->sequence,search->context[index].query->length, index);
1678 		if (status != 0)
1679 		{
1680 			sprintf(buffer, "Unable to calculate Karlin-Altschul params, check query sequence");
1681 			BlastConstructErrorMessage("BLASTSetUpSearch", buffer, 2, &(search->error_return));
1682 			retval = 1;
1683 		}
1684 		if (search->pbp->gapped_calculation &&
1685 				StringCmp(search->prog_name, "blastn") != 0)
1686 		{
1687 			search->sbp->kbp_gap_std[index] = BlastKarlinBlkCreate();
1688                 	status = BlastKarlinBlkGappedCalc(search->sbp->kbp_gap_std[index], options->gap_open, options->gap_extend, search->sbp->name, &(search->error_return));
1689 			if (status != 0)
1690 			{
1691 				retval = 1;
1692 			}
1693 			search->sbp->kbp_gap_psi[index] = BlastKarlinBlkCreate();
1694                 	status = BlastKarlinBlkGappedCalc(search->sbp->kbp_gap_psi[index], options->gap_open, options->gap_extend, search->sbp->name, &(search->error_return));
1695 			if (status != 0)
1696 			{
1697 				retval = 1;
1698 			}
1699 		}
1700 	}
1701 
1702 	search->sbp->kbp_gap = search->sbp->kbp_gap_std;
1703         search->sbp->kbp = search->sbp->kbp_std;
1704 	if (StringCmp(prog_name, "blastn") != 0)
1705 	{
1706 		array_size = BlastKarlinGetMatrixValues(search->sbp->name, &open, &extend, &lambda, &K, &H, NULL);
1707 		if (array_size > 0)
1708 		{
1709 			for (index=0; index<array_size; index++)
1710 			{
1711 				if (open[index] == INT2_MAX && extend[index] == INT2_MAX)
1712 				{
1713 					search->sbp->kbp_ideal = BlastKarlinBlkCreate();
1714 					search->sbp->kbp_ideal->Lambda = lambda[index];
1715 					search->sbp->kbp_ideal->K = K[index];
1716 					search->sbp->kbp_ideal->H = H[index];
1717 				}
1718 			}
1719 			MemFree(open);
1720 			MemFree(extend);
1721 			MemFree(lambda);
1722 			MemFree(K);
1723 			MemFree(H);
1724 		}
1725 		if (search->sbp->kbp_ideal == NULL)
1726         		search->sbp->kbp_ideal = BlastKarlinBlkStandardCalcEx(search->sbp);
1727 	}
1728 
1729 	/* Adjust the Karlin parameters. */
1730 	if (StringCmp(prog_name, "blastx") == 0  || StringCmp(prog_name, "tblastx") == 0)
1731 	{
1732                 BlastKarlinBlkStandardCalc(search->sbp, search->first_context, search->last_context);
1733 	}
1734 
1735 	/* If retval was set non-zero above (by the routines calculating Karlin-Altschul params),
1736 	   return here before these values are used.
1737 	*/
1738 	if (retval)
1739 		return retval;
1740         if( 0 != BlastComputeLengthAdjustment(1/min_query_length,
1741                                          search->sbp->
1742                                          kbp[search->first_context]->logK,
1743                                          1/search->sbp->
1744                                          kbp[search->first_context]->H,
1745                                          0.0,
1746                                          length,
1747                                          search->dblen, search->dbseq_num,
1748                                          &length_adjustment) ) {
1749             ErrPostEx(SEV_WARNING, 0, 0,
1750                       "BlastComputeLengthAdjustment failed!");
1751         }
1752         search->length_adjustment = length_adjustment;
1753         search->dblen_eff =
1754             search->dblen - search->dbseq_num*search->length_adjustment;
1755 
1756         effective_query_length    = length - length_adjustment;
1757         for (index=search->first_context; index<=search->last_context; index++)
1758         {
1759             search->context[index].query->effective_length =
1760                 effective_query_length;
1761         }
1762 
1763         if (search->searchsp_eff == 0)
1764             search->searchsp_eff =
1765                 ((Nlm_FloatHi) search->dblen_eff)*
1766                 ((Nlm_FloatHi) effective_query_length);
1767 
1768 	/* The default is that cutoff_s was not set and is zero. */
1769 	if (options->cutoff_s == 0)
1770 	{
1771 		search->pbp->cutoff_e = options->expect_value;
1772 		search->pbp->cutoff_e_set = TRUE;
1773 		search->pbp->cutoff_s = options->cutoff_s;
1774 		search->pbp->cutoff_s_set = FALSE;
1775 	}
1776 	else
1777 	{
1778 		search->pbp->cutoff_e = options->expect_value;
1779 		search->pbp->cutoff_e_set = FALSE;
1780 		search->pbp->cutoff_s = options->cutoff_s;
1781 		search->pbp->cutoff_s_set = TRUE;
1782 	}
1783 /* For now e2 is set to 0.5 and cutoff_e2_set is FALSE.  This is then
1784 changed to the proper values in blast_set_parameters.  In the final version
1785 of this program (where more blast programs and command-line options are
1786 available) this needs to be set higher up. */
1787 	if (options->cutoff_s2 == 0)
1788 	{
1789 		search->pbp->cutoff_e2 = options->e2;
1790 		search->pbp->cutoff_e2_set = FALSE;
1791 		search->pbp->cutoff_s2 = options->cutoff_s2;
1792 		search->pbp->cutoff_s2_set = FALSE;
1793 	}
1794 	else
1795 	{
1796 		search->pbp->cutoff_e2 = options->e2;
1797 		search->pbp->cutoff_e2_set = FALSE;
1798 		search->pbp->cutoff_s2 = options->cutoff_s2;
1799 		search->pbp->cutoff_s2_set = TRUE;
1800 	}
1801 
1802 	search->pbp->discontinuous = options->discontinuous;
1803 
1804 
1805 	/* For postion based blast. */
1806 	search->pbp->ethresh = options->ethresh;
1807 	search->pbp->maxNumPasses = options->maxNumPasses;
1808 	search->pbp->pseudoCountConst = options->pseudoCountConst;
1809 
1810 	search->pbp->process_num = options->number_of_cpus;
1811 	search->pbp->cpu_limit = options->cpu_limit;
1812 	search->pbp->gap_decay_rate = options->gap_decay_rate;
1813 	search->pbp->gap_size = options->gap_size;
1814 	search->pbp->gap_prob = options->gap_prob;
1815 	search->pbp->old_stats = options->old_stats;
1816 	search->pbp->use_large_gaps = options->use_large_gaps;
1817 	search->pbp->number_of_bits = options->number_of_bits;
1818 	search->pbp->two_pass_method = options->two_pass_method;
1819 	search->pbp->multiple_hits_only = options->multiple_hits_only;
1820 	search->pbp->gap_open = options->gap_open;
1821 	search->pbp->gap_extend = options->gap_extend;
1822         search->pbp->decline_align = options->decline_align;
1823 
1824 	search->pbp->hsp_num_max = options->hsp_num_max;
1825 /* CHANGE HERE??? */
1826 	if (search->pbp->gapped_calculation && StringCmp(search->prog_name, "blastn"))
1827 	{
1828 		search->pbp->cutoff_s2_set = TRUE;
1829 		if (StringCmp(search->prog_name, "blastn") != 0)
1830 		{
1831 			search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
1832 			search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp_gap[search->first_context]->Lambda);
1833 			search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1834 		}
1835 		else
1836 		{
1837 			search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1838 			search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1839 			search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1840 		}
1841 		/* The trigger value sets the s2 cutoff. */
1842 		search->pbp->cutoff_s2 = search->pbp->gap_trigger;
1843 	}
1844 	else
1845 	{
1846 		search->pbp->gap_x_dropoff = (BLAST_Score) (options->gap_x_dropoff*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1847 		search->pbp->gap_x_dropoff_final = (BLAST_Score) (options->gap_x_dropoff_final*NCBIMATH_LN2 / search->sbp->kbp[search->first_context]->Lambda);
1848 		search->pbp->gap_trigger = (BLAST_Score) ((options->gap_trigger*NCBIMATH_LN2+search->sbp->kbp[search->first_context]->logK)/ search->sbp->kbp[search->first_context]->Lambda);
1849 		/* Set S and S2 equal if not sum stats. */
1850 		if (search->pbp->do_sum_stats == FALSE)
1851 			search->pbp->cutoff_s2 = search->pbp->cutoff_s;
1852 	}
1853 	/* Ensures that gap_x_dropoff_final is at least as large as gap_x_dropoff. */
1854 	search->pbp->gap_x_dropoff_final = MAX(search->pbp->gap_x_dropoff_final, search->pbp->gap_x_dropoff);
1855 
1856 /* "threshold" (first and second) must be set manually for two-pass right now.*/
1857 	search->pbp->threshold_set = TRUE;
1858 	search->pbp->threshold_first = options->threshold_first;
1859 	search->pbp->threshold_second = options->threshold_second;
1860 
1861 	search->pbp->window_size = options->window_size;
1862 	search->pbp->window_size_set = TRUE;
1863 
1864 	search->whole_query = TRUE;
1865 	if (options->required_start != 0 || options->required_end != -1)
1866 	{
1867 		search->whole_query = FALSE;
1868 		search->required_start = options->required_start;
1869 		if (options->required_end != -1)
1870 			search->required_end = options->required_end;
1871 		else
1872 			search->required_end = query_length;
1873 	}
1874 
1875 	if (qlen <= 0)
1876 		qlen = query_length;
1877 
1878 	/* Use DROPOFF_NUMBER_OF_BITS as the default if it's set to zero. */
1879 	if (options->dropoff_1st_pass == 0)
1880 		options->dropoff_1st_pass = DROPOFF_NUMBER_OF_BITS;
1881 
1882 	if (options->dropoff_2nd_pass == 0)
1883 		options->dropoff_2nd_pass = DROPOFF_NUMBER_OF_BITS;
1884 
1885 	if (StringCmp(search->prog_name, "blastn") != 0)
1886 	{
1887 		avglen = BLAST_AA_AVGLEN;
1888 	}
1889 	else
1890 	{
1891 		avglen = BLAST_NT_AVGLEN;
1892 		/* Use only one type of gap for blastn */
1893 		search->pbp->ignore_small_gaps = TRUE;
1894 	}
1895 
1896 	if (blast_set_parameters(search, options->dropoff_1st_pass, options->dropoff_2nd_pass, avglen, search->searchsp_eff, options->window_size) != 0)
1897 		return 1;
1898 
1899 	search->thr_info->awake_index = FALSE;
1900 
1901 	/* Only do this if this is not a pattern search. */
1902 	if (options->isPatternSearch == FALSE)
1903 	{
1904 	     for (index=search->first_context; index<=search->last_context; index++)
1905 	     {
1906 		if (options->threshold_first > 0)
1907 		{
1908 			search->wfp = search->wfp_first;
1909 			if (search->whole_query == TRUE)
1910 			  if (!(search->positionBased)) /*AAS*/
1911 			    status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_first, (Uint1) index);
1912 			  else
1913 			    status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_first, (Uint1) index);
1914 			else
1915 			  if (!(search->positionBased)) /*AAS*/
1916 			    status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_first, (Uint1) index);
1917 			  else
1918 			    status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_first, (Uint1) index);
1919 			if (status != 0) {
1920                             search->thr_info->awake_index = FALSE;
1921                             ErrPostEx(SEV_WARNING, 0, 0, "BlastFindWords returned non-zero status");
1922                             return 1;
1923 			}
1924 		}
1925 		search->wfp = search->wfp_second;
1926 		if (StringCmp(prog_name, "blastn") != 0)
1927 		{
1928 		    if (search->allocated & BLAST_SEARCH_ALLOC_WFP_SECOND)
1929 		    {
1930 			if (search->whole_query == TRUE)
1931 			  if (!(search->positionBased))
1932 			    status = BlastFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
1933 			  else
1934 			    status = BlastNewFindWords(search, 0, search->context[index].query->length, options->threshold_second, (Uint1) index);
1935 			else
1936 			  if (!(search->positionBased))
1937 			    status = BlastFindWords(search, search->required_start, search->required_end, options->threshold_second, (Uint1) index);
1938 			  else
1939 			    status = BlastNewFindWords(search, search->required_start, search->required_end, options->threshold_second, (Uint1) index);
1940 		    }
1941 		}
1942 		else
1943 		{
1944 			status = BlastNtFindWords(search, 0, search->context[index].query->length,
1945 				                      (Uint1) index);
1946 		}
1947 
1948 		if (status > 0)
1949 		{
1950 			search->thr_info->awake_index = FALSE;
1951 			sprintf(buffer, "No valid letters to be indexed");
1952 			BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1953 			return 1;
1954 		}
1955 		else if (status < 0)
1956 		{
1957 			search->thr_info->awake_index = FALSE;
1958 			sprintf(buffer, "Error finding words");
1959 			BlastConstructErrorMessage("Blast", buffer, 2, &(search->error_return));
1960 			return 1;
1961 		}
1962 	    }
1963 	    lookup_position_aux_destruct(search->wfp->lookup);
1964 	}
1965 	/*
1966 	Turn off the index thread by setting this flag.  Don't wait for a join, as the
1967 	search will take much longer than the one second for this to die.
1968 	*/
1969 	search->thr_info->awake_index = FALSE;
1970 
1971 
1972 	if (private_slp && private_slp_delete)
1973 		private_slp = SeqLocFree(private_slp);
1974 	if (private_slp_rev)
1975 		private_slp_rev = SeqLocFree(private_slp_rev);
1976 
1977 	return 0;
1978 }
1979 
1980 /*---------------------------------------------------------------------------*/
1981 /*---------------------------------------------------------------------------*/
CddSetUpSearchWithReadDb(SeqLocPtr query_slp,BioseqPtr query_bsp,CharPtr prog_name,Int4 qlen,CharPtr dbname,BLAST_OptionsBlkPtr options,int (LIBCALLBACK * callback)PROTO ((Int4 done,Int4 positives)),SeqIdPtr seqid_list,BlastDoubleInt4Ptr gi_list,Int4 gi_list_total,ReadDBFILEPtr rdfp)1982 BlastSearchBlkPtr LIBCALL CddSetUpSearchWithReadDb(SeqLocPtr query_slp,
1983                                                    BioseqPtr query_bsp,
1984                                                    CharPtr prog_name,
1985                                                    Int4 qlen,
1986                                                    CharPtr dbname,
1987                                                    BLAST_OptionsBlkPtr options,
1988                                                    int (LIBCALLBACK *callback)PROTO((Int4 done, Int4 positives)),
1989                                                    SeqIdPtr seqid_list,
1990                                                    BlastDoubleInt4Ptr gi_list,
1991                                                    Int4 gi_list_total,
1992                                                    ReadDBFILEPtr rdfp)
1993 
1994 {
1995 	BlastSearchBlkPtr search;
1996 	Boolean           multiple_hits, options_alloc=FALSE;
1997 	Int2              status, first_context, last_context;
1998   Int8	            dblen;
1999 	Int4	            query_length;
2000   ValNodePtr	      vnp;
2001   Int4		          i;
2002 	Nlm_FloatHi	      searchsp_eff=0;
2003 	Boolean		        use_private_gilist = FALSE;
2004 	OIDListPtr	      alias_oidlist;
2005 	Int4		          mask_index, virtual_mask_index;
2006 	Uint4		          oid_bit, virtual_oid_bit;
2007 	ReadDBFILEPtr	    tmprdfp;
2008         Int4                hitlist_size;
2009 
2010 	if (options == NULL) {
2011 		options = BLASTOptionNew(prog_name, FALSE);	options_alloc = TRUE;
2012 	}
2013 
2014 	if (options->window_size != 0) multiple_hits = TRUE;
2015 	else multiple_hits = FALSE;
2016 
2017 	BlastGetFirstAndLastContext(prog_name, query_slp, &first_context, &last_context, options->strand_option);
2018 
2019 	if (query_slp) query_length = SeqLocLen(query_slp);
2020 	else query_length = query_bsp->length;
2021 
2022         hitlist_size = BlastSingleQueryResultSize(options);
2023         /* On the first call query length is used for the subject length. */
2024         search =
2025             BlastSearchBlkNewExtra(options->wordsize, query_length, dbname,
2026                                    multiple_hits, options->threshold_first,
2027                                    options->threshold_second, hitlist_size,
2028                                    prog_name, NULL, first_context,
2029                                    last_context, rdfp, options->window_size);
2030 
2031 	if (search) {
2032 		readdb_get_totals(search->rdfp, &(dblen), &(search->dbseq_num));
2033 		if (seqid_list) BlastAdjustDbNumbers(search->rdfp, &(dblen), &(search->dbseq_num), seqid_list, NULL, NULL, NULL, 0);
2034 
2035 		if (!gi_list) {
2036 /* Ok, we do not have a gi-list specified, but maybe
2037 	we have an a mask database in the list of databases,
2038 	we need to create one mask for all such databases */
2039       OIDListPtr		virtual_oidlist = NULL;
2040       Int4		      final_virtual_db_seq=0, final_db_seq=0;
2041       Int4		      mask, oid, virtual_oid, maskindex, virtual_mask_index, total_virtual_mask, base;
2042       Uint4		      virtual_oid_bit;
2043 
2044       tmprdfp = search->rdfp;
2045       while (tmprdfp) {
2046         final_virtual_db_seq = tmprdfp->stop;
2047 			  if (!tmprdfp->oidlist) final_db_seq = tmprdfp->stop;
2048 			  tmprdfp = tmprdfp->next;
2049 		  }
2050       tmprdfp = search->rdfp;
2051       while (tmprdfp) {
2052 			  if (tmprdfp->oidlist) {
2053 			    if (!virtual_oidlist) {
2054 /* create new oidlist for virtual database */
2055 				    virtual_oidlist = (OIDListPtr) MemNew(sizeof(OIDList));
2056 				    virtual_oidlist->total = final_virtual_db_seq + 1;
2057 				    total_virtual_mask = final_virtual_db_seq/MASK_WORD_SIZE + 2;
2058 				    virtual_oidlist->list = (Uint4Ptr) MemNew (total_virtual_mask*sizeof(Int4));
2059 			    }
2060 /* Now populate the virtual_oidlist */
2061 			    maskindex = 0;
2062 			    base = 0;
2063 			    while (maskindex < (tmprdfp->oidlist->total/MASK_WORD_SIZE +1)) {
2064 /* for each long-word mask */
2065 				    mask = Nlm_SwapUint4(tmprdfp->oidlist->list[maskindex]);
2066             i = 0;
2067 				    while (mask) {
2068 				      if (mask & (((Uint4)0x1)<<(MASK_WORD_SIZE-1))) {
2069 					    oid = base + i;
2070 					    virtual_oid = oid + tmprdfp->start;
2071               virtual_mask_index = virtual_oid/MASK_WORD_SIZE;
2072 					    virtual_oid_bit = 0x1 << (MASK_WORD_SIZE - 1 - virtual_oid % MASK_WORD_SIZE);
2073 					    virtual_oidlist->list[virtual_mask_index] |= virtual_oid_bit;
2074 				    }
2075 				    mask <<= 1;
2076 				    i++;
2077 				  }
2078 				  maskindex++;
2079 				  base += MASK_WORD_SIZE;
2080         }
2081 /* free old mask */
2082 			  tmprdfp->oidlist = OIDListFree(tmprdfp->oidlist);
2083       }
2084 			tmprdfp = tmprdfp->next;
2085     }
2086     if (virtual_oidlist) {
2087       for (i=0; i<total_virtual_mask; i++) {
2088         virtual_oidlist->list[i] = Nlm_SwapUint4(virtual_oidlist->list[i]);
2089       }
2090     }
2091     search->rdfp->oidlist = virtual_oidlist;
2092     readdb_get_totals_ex(search->rdfp, &(dblen), &(search->dbseq_num), TRUE);
2093   }
2094 #if 0
2095 /* Intended for use when a list of gi's is sent in, but the real size is needed. */
2096 /* It's probably still necessary to call BlastAdjustDbNumbers, but it would be nice
2097 	 if this were not required. */
2098     if (options->use_real_db_size)
2099 			readdb_get_totals(search->rdfp, &(dblen), &(search->dbseq_num));
2100 /* use length and num of seqs of the database from alias file */
2101 		if (search->rdfp->aliaslen && !gi_list) dblen = search->rdfp->aliaslen;
2102 		if (search->rdfp->aliasnseq && !gi_list) search->dbseq_num = search->rdfp->aliasnseq;
2103 /* command-line/options trump alias file. */
2104 		if (options->db_length > 0)	dblen = options->db_length;
2105 		if (options->dbseq_num > 0) search->dbseq_num = options->dbseq_num;
2106 #endif
2107 		if (options->searchsp_eff > 0) searchsp_eff = options->searchsp_eff;
2108     if (StringCmp(prog_name, "tblastn") == 0 || StringCmp(prog_name, "tblastx") == 0) {
2109       dblen /= 3;
2110       searchsp_eff /= 3.0;
2111     }
2112 		search->dblen = dblen;
2113 		search->searchsp_eff = searchsp_eff;
2114 		status = CddSetUpSearchInternalByLoc (search, query_slp, query_bsp, prog_name, qlen, options, callback);
2115 		if (status != 0) {
2116 	  	ErrPostEx(SEV_WARNING, 0, 0, "SetUpBlastSearch failed.");
2117 			search->query_invalid = TRUE;
2118 		}
2119 	}
2120 	if (options_alloc) options = BLASTOptionDelete(options);
2121 	return search;
2122 }
2123 
2124 #define  putCkptNlm_FloatHi(d, ckptFile)  (putCkptNumber(&(d),sizeof(Nlm_FloatHi),ckptFile))
2125 #define  putCkptInt4(i, ckptFile)         (putCkptNumber(&(i),sizeof(Int4),ckptFile))
2126 #define  putCkptChar(c, ckptFile)         (putCkptNumber(&(c),sizeof(Char),ckptFile))
2127 
2128 /*---------------------------------------------------------------------------*/
2129 /* General routine for putting the internal representation of a number. */
2130 /*---------------------------------------------------------------------------*/
putCkptNumber(void * numberPtr,Int4 numberSize,FILE * ckptFile)2131 static void LIBCALL putCkptNumber(void * numberPtr, Int4 numberSize, FILE * ckptFile)
2132 {
2133   FileWrite(numberPtr,numberSize,1,ckptFile) ;
2134 }
2135 
2136 /*---------------------------------------------------------------------------*/
2137 /*Code to put a vector of frequencies; put only the interesting entries*/
2138 /*---------------------------------------------------------------------------*/
putFreqVector(Nlm_FloatHi * theVector,Int4 length,FILE * ckptFile)2139 static void LIBCALL putFreqVector(Nlm_FloatHi * theVector, Int4 length, FILE * ckptFile)
2140 {
2141    int  vectorRef;
2142    Int4 charOrder[EFFECTIVE_ALPHABET]; /*standard order of letters according to S. Altschul*/
2143 
2144 
2145    charOrder[0] =  1;  /*A*/
2146    charOrder[1] =  16; /*R*/
2147    charOrder[2] =  13; /*N*/
2148    charOrder[3] =  4;  /*D*/
2149    charOrder[4] =  3;  /*C*/
2150    charOrder[5] =  15; /*Q*/
2151    charOrder[6] =  5;  /*E*/
2152    charOrder[7] =  7;  /*G*/
2153    charOrder[8] =  8;  /*H*/
2154    charOrder[9] =  9;  /*I*/
2155    charOrder[10] = 11; /*L*/
2156    charOrder[11] = 10; /*K*/
2157    charOrder[12] = 12; /*M*/
2158    charOrder[13] =  6; /*F*/
2159    charOrder[14] = 14; /*P*/
2160    charOrder[15] = 17; /*S*/
2161    charOrder[16] = 18; /*T*/
2162    charOrder[17] = 20; /*W*/
2163    charOrder[18] = 22; /*Y*/
2164    charOrder[19] = 19; /*V*/
2165 
2166    for(vectorRef = 0; vectorRef < EFFECTIVE_ALPHABET; vectorRef++)
2167      putCkptNlm_FloatHi(theVector[charOrder[vectorRef]],ckptFile);
2168 }
2169 
2170 /*---------------------------------------------------------------------------*/
2171 /* Code to put a matrix, vector-by-vector. */
2172 /*---------------------------------------------------------------------------*/
putCkptFreqMatrix(Nlm_FloatHi ** theMatrix,Int4 length,Int4 width,FILE * ckptFile)2173 static void LIBCALL putCkptFreqMatrix (Nlm_FloatHi **theMatrix, Int4 length, Int4 width, FILE * ckptFile)
2174 {
2175   int  matrixRef;  /*loop index*/
2176 
2177   for (matrixRef = 0; matrixRef < length ; matrixRef++ )
2178     putFreqVector(theMatrix[matrixRef], width, ckptFile);
2179 }
2180 
2181 
2182 /*---------------------------------------------------------------------------*/
2183 /*Take a checkpoint at the end of the current PSI-BLAST round, stores
2184 query length, query, and position-specific target frequencies.
2185 Returns TRUE if checkpoint was sucessful and FALSE otherwise. */
2186 /*---------------------------------------------------------------------------*/
CddposTakeCheckpoint(posSearchItems * posSearch,compactSearchItems * compactSearch,CharPtr fileName,ValNodePtr * error_return)2187 Boolean LIBCALL CddposTakeCheckpoint(posSearchItems * posSearch, compactSearchItems * compactSearch, CharPtr fileName, ValNodePtr *error_return)
2188 {
2189   FILE * checkFile; /*file in which to take the checkpoint*/
2190   Int4 length; /*length of query sequence, and an index for it*/
2191   Int4 i; /*indices to position and alphabet */
2192   Char localChar; /*temporary character*/
2193 
2194   checkFile = FileOpen(fileName, "w");
2195   if (NULL == checkFile) {
2196     BlastConstructErrorMessage("CddposTakeCheckpoint", "Could not open checkpoint file", 1, error_return);
2197     return(FALSE);
2198   }
2199   length = compactSearch->qlength;
2200   putCkptInt4(length, checkFile);
2201   for(i = 0; i < length; i++) {
2202     localChar = getRes(compactSearch->query[i]);
2203     putCkptChar(localChar, checkFile);
2204   }
2205   putCkptFreqMatrix(posSearch->posFreqs,length,compactSearch->alphabetSize, checkFile);
2206   FileClose(checkFile);
2207   return(TRUE);
2208 }
2209 
2210 /*---------------------------------------------------------------------------*/
2211 /*---------------------------------------------------------------------------*/
2212 /* modelled after posReadAlignment (posit.c, as of 12/28/2000)               */
2213 /* instead of reading in a text alignment from a file, the posDesc array is  */
2214 /* populated from the DenseDiag alignment                                    */
2215 /*---------------------------------------------------------------------------*/
2216 /*---------------------------------------------------------------------------*/
CddposReadAlignment(compactSearchItems * compactSearch,SeqAlignPtr salp,Int4 numSeqs,Int4 alignLength,BioseqPtr bsp)2217 static posDesc** CddposReadAlignment(compactSearchItems *compactSearch,
2218                                      SeqAlignPtr salp, Int4 numSeqs,
2219 				     Int4 alignLength, BioseqPtr bsp)
2220 {
2221   posDesc    **returnArray;                    /*array of sequences to return*/
2222   SeqAlignPtr  salpThis;
2223   DenseDiagPtr ddp;
2224   SeqPortPtr   spp;
2225   Int4         i,j,c;                                          /*loop indices*/
2226   Int4         alignPos, queryOffset, subjectOffset, matchLength;
2227   Int4         index, splace, qplace, retrievalOffset, subjectLength;
2228   Uint1Ptr     s;
2229 
2230   returnArray = (posDesc**) MemNew(numSeqs * sizeof(posDesc *));
2231   if (NULL == returnArray) exit(EXIT_FAILURE);
2232   for (i = 0; i < numSeqs; i++) {
2233     returnArray[i] = (posDesc *) MemNew(alignLength * sizeof(posDesc));
2234     if (NULL == returnArray[i]) exit(EXIT_FAILURE);
2235     for(j = 0; j < alignLength; j++) {
2236       returnArray[i][j].letter = GAP_CHAR;
2237       returnArray[i][j].used = TRUE;
2238     }
2239   }
2240 
2241   alignPos = 0; i = 0;
2242   ASSERT(NULL != bsp);
2243   spp = SeqPortNew(bsp,0,bsp->length-1,Seq_strand_unknown,Seq_code_ncbistdaa);
2244   for (index=0; index<alignLength; index++) {
2245     returnArray[i][index].letter = SeqPortGetResidue(spp);
2246     returnArray[i][index].used   = TRUE;
2247   }
2248   spp = SeqPortFree(spp);
2249   salpThis = salp;
2250 
2251   while (salpThis) {
2252     i++;
2253     ddp = (DenseDiagPtr) salpThis->segs;
2254     s = CddGetSequenceWithDenseDiag(ddp,FALSE, &retrievalOffset, &subjectLength);
2255     while (ddp) {
2256       queryOffset = ddp->starts[0];
2257       subjectOffset = ddp->starts[1] - retrievalOffset;
2258       matchLength = ddp->len;
2259       for(c = 0, qplace = queryOffset, splace = subjectOffset; c < matchLength; c++, qplace++, splace++) {
2260         ASSERT(qplace < alignLength);
2261         ASSERT(i < numSeqs);
2262         returnArray[i][qplace].letter = (Int1) s[splace];
2263         returnArray[i][qplace].used = TRUE;
2264       }
2265       ddp = ddp->next;
2266     }
2267     s = MemFree(s);
2268     salpThis = salpThis->next;
2269   }
2270   return(returnArray);
2271 }
2272 
2273 
2274 
2275 /*---------------------------------------------------------------------------*/
2276 /*---------------------------------------------------------------------------*/
2277 /* modelled after posProcessAlignment (posit.c, as of 12/28/2000)            */
2278 /*---------------------------------------------------------------------------*/
2279 /*---------------------------------------------------------------------------*/
CddposProcessAlignment(posSearchItems * posSearch,compactSearchItems * compactSearch,SeqAlignPtr salp,Int4 numSeqs,Int4 alignLength,BioseqPtr bsp)2280 void LIBCALL CddposProcessAlignment(posSearchItems *posSearch,
2281                                     compactSearchItems *compactSearch,
2282 				    SeqAlignPtr salp, Int4 numSeqs,
2283 				    Int4 alignLength, BioseqPtr bsp)
2284 {
2285   Int4 queryPos, alignPos, linePos; /*placeholder for pos. in query & alignmt*/
2286   Int4 *queryDesc;      /*position correspondence between alignment and query*/
2287   Int4 seqIndex;                                      /*counter for sequences*/
2288   posDesc ** alignArray;
2289   Int4 queryLength;                                /*length of query sequence*/
2290 
2291   alignArray = CddposReadAlignment(compactSearch,salp,numSeqs,alignLength,bsp);
2292   queryDesc = (Int4 *) MemNew(alignLength * sizeof(Int4));
2293   if (NULL == queryDesc) exit(EXIT_FAILURE);
2294   for(alignPos = 0; alignPos < alignLength; alignPos++) queryDesc[alignPos] = GAP_HERE;
2295   alignPos = 0;
2296   queryPos = 0;
2297   for(linePos = 0; linePos < alignLength; linePos++) {
2298     queryDesc[alignPos] = queryPos;
2299     posSearch->posDescMatrix[0][queryPos].letter = alignArray[0][linePos].letter;
2300     posSearch->posDescMatrix[0][queryPos].used = TRUE;
2301     posSearch->posDescMatrix[0][queryPos].e_value = compactSearch->ethresh/2;
2302     posSearch->posDescMatrix[0][queryPos].leftExtent = 0;
2303     posSearch->posDescMatrix[0][queryPos].rightExtent = compactSearch->qlength - 1;
2304     posSearch->posC[queryPos][alignArray[0][linePos].letter]++;
2305     posSearch->posCount[queryPos]++;
2306     queryPos++;
2307     alignPos++;
2308   }
2309 
2310 
2311   queryLength = queryPos;
2312   for(seqIndex = 1; seqIndex < numSeqs; seqIndex++) {
2313     for(linePos = 0; linePos < alignLength; linePos++) {
2314       if (!(posSearch->posDescMatrix[0][queryDesc[linePos]].used)) {
2315         /*mark column as not participating*/
2316         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].used = FALSE;
2317       }
2318       else {
2319         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].letter = alignArray[seqIndex][linePos].letter;
2320         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].used = TRUE;
2321         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].e_value = compactSearch->ethresh/2;
2322         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].leftExtent = 0;
2323         posSearch->posDescMatrix[seqIndex][queryDesc[linePos]].rightExtent = compactSearch->qlength;
2324       }
2325     }
2326   }
2327 
2328   /*make terminal gaps unused*/
2329   for(seqIndex = 1; seqIndex < numSeqs; seqIndex++) {
2330     linePos = 0;
2331     while((linePos < queryLength) && (posSearch->posDescMatrix[seqIndex][linePos].letter == GAP_CHAR)) {
2332       posSearch->posDescMatrix[seqIndex][linePos].used = FALSE;
2333       linePos++;
2334     }
2335     linePos = queryLength - 1;
2336     while((linePos >= 0) && (posSearch->posDescMatrix[seqIndex][linePos].letter == GAP_CHAR)) {
2337       posSearch->posDescMatrix[seqIndex][linePos].used = FALSE;
2338       linePos--;
2339     }
2340   }
2341 
2342 /* clean up */
2343   for (seqIndex=0;seqIndex<numSeqs;seqIndex++) {
2344     MemFree(alignArray[seqIndex]);
2345   }
2346   MemFree(alignArray);
2347   MemFree(queryDesc);
2348 }
2349 
2350