1 static char const rcsid[] = "$Id: impatool.c,v 6.16 2006/06/30 18:44:40 camacho Exp $";
2 
3 /* $Id: impatool.c,v 6.16 2006/06/30 18:44:40 camacho Exp $
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 
29 /*****************************************************************************
30 
31 File name: impatool.c
32 
33 Author: Alejandro Schaffer
34 
35 Contents: utility routines for IMPALA.
36 
37  $Revision: 6.16 $
38 
39  $Log: impatool.c,v $
40  Revision 6.16  2006/06/30 18:44:40  camacho
41  Add support for O and J to getRes, ResToInt
42 
43  Revision 6.15  2004/10/04 13:35:53  camacho
44  Do not use hard coded constants in IMPALAfindUngappedLambda
45 
46  Revision 6.14  2004/06/30 14:14:50  madden
47  Remove add_string_to_bufferEx, already defined in blfmtutl.c
48 
49  Revision 6.13  2004/06/30 13:57:27  kans
50  add_string_to_bufferEx had to be LIBCALL because of blfmtutl.h prototype
51 
52  Revision 6.12  2004/06/30 13:42:27  kans
53  include <blfmtutl.h> to clear up Mac compiler missing prototype errors
54 
55  Revision 6.11  2003/05/30 17:25:36  coulouri
56  add rcsid
57 
58  Revision 6.10  2001/12/28 18:01:54  dondosha
59  Modified sorting routines to break ties by score when E-value is (mis)-reported as 0.0 due to underflow in BlastKarlinStoE_simple
60 
61  Revision 6.9  2000/07/26 16:54:07  lewisg
62  add LIBCALLs
63 
64  Revision 6.8  2000/07/25 18:12:05  shavirin
65  WARNING: This is no-turning-back changed related to S&W Blast from
66  Alejandro Schaffer
67 
68 
69 *****************************************************************************/
70 
71 
72 
73 #include <ncbi.h>
74 #include <objseq.h>
75 #include <objsset.h>
76 #include <sequtil.h>
77 #include <seqport.h>
78 #include <tofasta.h>
79 #include <blast.h>
80 #include <blastpri.h>
81 #include <txalign.h>
82 #include <simutil.h>
83 #include <posit.h>
84 #include <profiles.h>
85 #include <blfmtutl.h>
86 
87 
88 /*convert a residue character to its integer representation*/
getRes(Char input)89 Char LIBCALL getRes(Char input)
90 {
91     switch(input)
92       {
93       case 0:
94 	return('-');
95       case 1:
96 	return('A');
97       case 2:
98 	return('B');
99       case 3:
100 	return('C');
101       case 4:
102 	return('D');
103       case 5:
104 	return('E');
105       case 6:
106 	return('F');
107       case 7:
108 	return('G');
109       case 8:
110 	return('H');
111       case 9:
112 	return('I');
113       case 10:
114 	return('K');
115       case 11:
116 	return('L');
117       case 12:
118 	return('M');
119       case 13:
120 	return('N');
121       case 14:
122 	return('P');
123       case 15:
124 	return('Q');
125       case 16:
126 	return('R');
127       case 17:
128 	return('S');
129       case 18:
130 	return('T');
131       case 19:
132 	return('V');
133       case 20:
134 	return('W');
135       case 21:
136 	return('X');
137       case 22:
138 	return('Y');
139       case 23:
140 	return('Z');
141       case 24:
142 	return('U');
143       case 25:
144 	return('*');
145       case 26:
146 	return('O');
147       case 27:
148 	return('J');
149       default:
150         return('?');
151     }
152 }
153 
154 
155 /*get the citation for IMPALA*/
156 
157 static CharPtr
IMPALAGetReference(Boolean html)158 IMPALAGetReference(Boolean html)
159 
160 {
161 	CharPtr ret_buffer;
162 	Int2 ret_buffer_length;
163 
164 	ret_buffer = NULL;
165 	ret_buffer_length = 0;
166 
167 
168 	if (html) {
169           ;
170 	} else
171 		add_string_to_bufferEx("Reference: Alejandro A. Schaffer, Yuri I. Wolf, Chris P. Ponting, ", &ret_buffer, &ret_buffer_length, TRUE);
172 	add_string_to_bufferEx("Eugene V. Koonin, L. Aravind, Stephen F. Altschul (1999), ", &ret_buffer, &ret_buffer_length, TRUE);
173 	add_string_to_bufferEx("\"IMPALA: Matching a Protein Sequence Against a Collection of ", &ret_buffer, &ret_buffer_length, TRUE);
174 	add_string_to_bufferEx("\"PSI-BLAST-Constructed Position-Specific Score Matrices\",", &ret_buffer, &ret_buffer_length, TRUE);
175 	add_string_to_bufferEx("Bioinformatics 15:1000-1011.", &ret_buffer, &ret_buffer_length, TRUE);
176 	return ret_buffer;
177 }
178 
179 /*print the citation for IMPALA*/
180 Boolean LIBCALL
IMPALAPrintReference(Boolean html,Int4 line_length,FILE * outfp)181 IMPALAPrintReference(Boolean html, Int4 line_length, FILE *outfp)
182 
183 {
184 	CharPtr ret_buffer;
185 
186 	if (outfp == NULL)
187 		return FALSE;
188 
189         ret_buffer = IMPALAGetReference(html);
190         PrintTildeSepLines(ret_buffer, line_length, outfp);
191         ret_buffer = MemFree(ret_buffer);
192 
193 	return TRUE;
194 }
195 
196 static CharPtr
makematGetHelp(Boolean html)197 makematGetHelp(Boolean html)
198 {
199 
200 	CharPtr ret_buffer;
201 	Int2 ret_buffer_length;
202 
203 	ret_buffer = NULL;
204 	ret_buffer_length = 0;
205 
206 
207 	if (html) {
208           ;
209 	} else {
210 	  add_string_to_bufferEx("makemat is the first profile preprocessor in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
211 	  add_string_to_bufferEx("makemat  converts a collection of byte-encoded profiles, created by the -C option of PSI-BLAST, into portable ASCII form.", &ret_buffer, &ret_buffer_length, TRUE);
212 	  add_string_to_bufferEx("Prepare the following files: ", &ret_buffer, &ret_buffer_length, TRUE);
213 	  add_string_to_bufferEx("i. a collection of PSI-BLAST-generated profiles with arbitrary   names and suffix .chk  ", &ret_buffer, &ret_buffer_length, TRUE);
214 	  add_string_to_bufferEx("ii. a collection of master sequences, associated with the profiles, each in a separate file with arbitrary name and a 3 character suffix starting with c", &ret_buffer, &ret_buffer_length, TRUE);
215 	  add_string_to_bufferEx("iii. a list of profile file names, one per line, named <database_name>.pn", &ret_buffer, &ret_buffer_length, TRUE);
216 	  add_string_to_bufferEx("iv. a list of master sequence file names, one per line, in the same order as a list of profile names, named <database_name>.sn", &ret_buffer, &ret_buffer_length, TRUE);
217 	  add_string_to_bufferEx("The following files will be created:", &ret_buffer, &ret_buffer_length, TRUE);
218 	  add_string_to_bufferEx("i. a collection of ASCII files, corresponding to each of the original profiles, named  <profile_name>.mtx", &ret_buffer, &ret_buffer_length, TRUE);
219 	  add_string_to_bufferEx("ii. a list of ASCII matrix files, named <database_name>.mn", &ret_buffer, &ret_buffer_length, TRUE);
220 	  add_string_to_bufferEx("iii. an ASCII file with auxiliary information, named <database_name>.aux", &ret_buffer, &ret_buffer_length, TRUE);
221 	  add_string_to_bufferEx("For the list of arguments to makemat enter makemat -", &ret_buffer, &ret_buffer_length, TRUE);
222         }
223 	return ret_buffer;
224 }
225 
226 static CharPtr
copymatGetHelp(Boolean html)227 copymatGetHelp(Boolean html)
228 {
229 
230 
231 	CharPtr ret_buffer;
232 	Int2 ret_buffer_length;
233 
234 	ret_buffer = NULL;
235 	ret_buffer_length = 0;
236 
237 
238 	if (html) {
239           ;
240 	} else {
241 	  add_string_to_bufferEx("copymat is the second profile preprocessor in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
242 	  add_string_to_bufferEx("copymat converts ASCII matrices, produced by the primary preprocessor, makemat into a database that can be read into memory quickly", &ret_buffer, &ret_buffer_length, TRUE);
243 	  add_string_to_bufferEx("Prepare the following files:", &ret_buffer, &ret_buffer_length, TRUE);
244 	  add_string_to_bufferEx("i. a collection of ASCII files, correspondingto each of the original profiles, named <profile_name>.mtx (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
245 	  add_string_to_bufferEx("ii. a collection of profile master sequences, associated with the profiles, each in a separate file with arbitrary name and a 3-charecter suffix starting with c", &ret_buffer, &ret_buffer_length, TRUE);
246 	  add_string_to_bufferEx("iii. a list of ASCII_matrix files, named <database_name>.mn  (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
247 	  add_string_to_bufferEx("iv. a list of master sequence file names, one per  line, in the same order as a list of matrix names, named <database_name>.sn;", &ret_buffer, &ret_buffer_length, TRUE);
248 	  add_string_to_bufferEx("v. ASCII file with auxiliary information, named  <database_name>.aux (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
249 	  add_string_to_bufferEx("The files input to copymatices are in ASCII format and thus portable between machines with different encodings for machine-readable files", &ret_buffer, &ret_buffer_length, TRUE);
250 	  add_string_to_bufferEx("The following file will be created:", &ret_buffer, &ret_buffer_length, TRUE);
251 	  add_string_to_bufferEx("i. a huge binary file, containing all profile matrices, named <database_name>.mat", &ret_buffer, &ret_buffer_length, TRUE);
252         }
253 	return ret_buffer;
254 }
255 
256 static CharPtr
impalaGetHelp(Boolean html)257 impalaGetHelp(Boolean html)
258 {
259 	CharPtr ret_buffer;
260 	Int2 ret_buffer_length;
261 
262 	ret_buffer = NULL;
263 	ret_buffer_length = 0;
264 
265 
266 	if (html) {
267           ;
268 	} else {
269 	  add_string_to_bufferEx("impala is the main program in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
270 	  add_string_to_bufferEx("impala searches for matches between a query sequence and a library of score matrices, prepared by makemat and copymat.", &ret_buffer, &ret_buffer_length, TRUE);
271 	  add_string_to_bufferEx("impala produces BLAST-formatted output", &ret_buffer, &ret_buffer_length, TRUE);
272 	  add_string_to_bufferEx("Before you start searching, check that in the directory with the library you have the following files.", &ret_buffer, &ret_buffer_length, TRUE);
273 	  add_string_to_bufferEx("If the library has K matrices, you should have:", &ret_buffer, &ret_buffer_length, TRUE);
274 	  add_string_to_bufferEx("K files with names ending in .mtx", &ret_buffer, &ret_buffer_length, TRUE);
275 	  add_string_to_bufferEx("K files with names ending in a 3-letter extension starting with c", &ret_buffer, &ret_buffer_length, TRUE);
276 	  add_string_to_bufferEx("1 file with name ending in .sn", &ret_buffer, &ret_buffer_length, TRUE);
277 	  add_string_to_bufferEx("1 file with name ending in .aux", &ret_buffer, &ret_buffer_length, TRUE);
278 	  add_string_to_bufferEx("1 file with name ending in .mn", &ret_buffer, &ret_buffer_length, TRUE);
279 	  add_string_to_bufferEx("1 file with name ending in .mat", &ret_buffer, &ret_buffer_length, TRUE);
280         }
281 	return ret_buffer;
282 }
283 
284 Boolean LIBCALL
IMPALAPrintHelp(Boolean html,Int4 line_length,Char * programName,FILE * outfp)285 IMPALAPrintHelp(Boolean html, Int4 line_length, Char * programName, FILE *outfp)
286 
287 {
288 	CharPtr ret_buffer;
289 
290 	if (outfp == NULL)
291 		return FALSE;
292 
293         if (0 == strcmp(programName,"makemat"))
294 	  ret_buffer = makematGetHelp(html);
295         if (0 == strcmp(programName,"copymat"))
296 	  ret_buffer = copymatGetHelp(html);
297         if (0 == strcmp(programName,"impala"))
298           ret_buffer = impalaGetHelp(html);
299 
300         PrintTildeSepLines(ret_buffer, line_length, outfp);
301         ret_buffer = MemFree(ret_buffer);
302 
303 	return TRUE;
304 }
305 
306 
307 
308 /*prefix and suffix are strings, returns a string with prefix
309   concatenated to suffix. Used to build multiple distinct
310   file names with a common prefix*/
addSuffixToName(Char * prefix,Char * suffix)311 Char * LIBCALL addSuffixToName(Char *prefix, Char *suffix)
312 {
313   Char *returnName; /*string to return*/
314   Int4 i,j; /*loop indices*/
315   Int4 prefixLength, suffixLength, totalLength; /*length of pieces and whole*/
316 
317 
318   prefixLength = strlen(prefix);
319   suffixLength = strlen(suffix);
320   totalLength = prefixLength + suffixLength;
321   returnName = (Char *) MemNew((totalLength + 1) * sizeof(Char));
322   for(i = 0; i < prefixLength; i++)
323     returnName[i] = prefix[i];
324   for(j = 0, i = prefixLength; j < suffixLength; i++, j++)
325     returnName[i] = suffix[j];
326   returnName[totalLength] = '\0';
327   return(returnName);
328 }
329 
330 /*Use the specified name of the database to make file names
331   for the auxiliary information, mmap'd score matrices, list of
332   sequence files, and list of matrices, and lists of checkpoints.
333   Also extract a directory prefix that is to be prepended
334   to all file names for sequences and matrices and checkpoints */
impalaMakeFileNames(Char * matrixDbName,Char * auxiliaryFileName,Char * mmapFileName,Char * seqFileName,Char * matrixFileName,Char * ckptFileName,Char * directoryPrefix)335 void  LIBCALL impalaMakeFileNames(Char * matrixDbName,
336 		    Char * auxiliaryFileName, Char * mmapFileName,
337 		    Char * seqFileName, Char *matrixFileName,
338                     Char * ckptFileName,
339 		    Char *directoryPrefix)
340 {
341   Int4 commonLength;  /*length of common name prefix*/
342   Int4 c; /*loop index*/
343 
344   commonLength = strlen(matrixDbName);
345   strcpy(directoryPrefix,matrixDbName);
346   /*remove file name for profile library to peel back to last '/' */
347   for(c = commonLength; c >= 0; c--)
348     if ('/' == directoryPrefix[c])
349       break;
350   directoryPrefix[c+1] = '\0';
351   if (NULL != auxiliaryFileName)
352      strcpy(auxiliaryFileName,matrixDbName);
353   if (NULL != mmapFileName)
354      strcpy(mmapFileName,matrixDbName);
355   if (NULL != seqFileName)
356      strcpy(seqFileName,matrixDbName);
357   if (NULL != matrixFileName)
358      strcpy(matrixFileName,matrixDbName);
359   if (NULL != ckptFileName)
360      strcpy(ckptFileName,matrixDbName);
361   if (NULL != auxiliaryFileName) {
362     auxiliaryFileName[commonLength] = '.';
363     auxiliaryFileName[commonLength +1] = 'a';
364     auxiliaryFileName[commonLength + 2] = 'u';
365     auxiliaryFileName[commonLength + 3] = 'x';
366     auxiliaryFileName[commonLength + 4] = '\0';
367   }
368   if (NULL != mmapFileName) {
369     mmapFileName[commonLength] = '.';
370     mmapFileName[commonLength + 1] = 'm';
371     mmapFileName[commonLength + 2] = 'a';
372     mmapFileName[commonLength + 3] = 't';
373     mmapFileName[commonLength + 4] = '\0';
374   }
375   if (NULL != seqFileName) {
376     seqFileName[commonLength] = '.';
377     seqFileName[commonLength + 1] = 's';
378     seqFileName[commonLength+2] = 'n';
379     seqFileName[commonLength+3] = '\0';
380   }
381   if (NULL != matrixFileName) {
382     matrixFileName[commonLength] = '.';
383     matrixFileName[commonLength + 1] = 'm';
384     matrixFileName[commonLength+2] = 'n';
385     matrixFileName[commonLength+3] = '\0';
386   }
387   if (NULL != ckptFileName) {
388     ckptFileName[commonLength] = '.';
389     ckptFileName[commonLength + 1] = 'p';
390     ckptFileName[commonLength+2] = 'n';
391     ckptFileName[commonLength+3] = '\0';
392   }
393 }
394 
395 Nlm_FloatHi LIBCALL
IMPALAfindUngappedLambda(Char * matrixName)396 IMPALAfindUngappedLambda(Char *matrixName)
397 {
398     Nlm_FloatHi* lambda_array = NULL;
399     int num_lambdas = BlastKarlinGetMatrixValues(matrixName, NULL, NULL,
400                                                  &lambda_array, NULL, NULL,
401                                                  NULL);
402     if (num_lambdas > 0) {
403         Nlm_FloatHi retval = lambda_array[0];
404         lambda_array = MemFree(lambda_array);
405         return retval;
406     } else {
407         return 0.0;
408     }
409 }
410 
411 /*Given a sequence of 'length' amino acid residues, compute the
412   probability of each residue and put that in the array resProb*/
413 void LIBCALL
IMPALAfillResidueProbability(Uint1Ptr sequence,Int4 length,Nlm_FloatHi * resProb)414 IMPALAfillResidueProbability(Uint1Ptr sequence, Int4 length, Nlm_FloatHi * resProb)
415 {
416   Int4 frequency[PRO_ALPHABET_SIZE]; /*frequency of each letter*/
417   Int4 i; /*index*/
418   Int4 denominator; /*length not including X's*/
419 
420   denominator = length;
421   for(i = 0; i < PRO_ALPHABET_SIZE; i++)
422     frequency[i] = 0;
423   for(i = 0; i < length; i++)
424     if (Xchar != sequence[i])
425       frequency[sequence[i]]++;
426     else
427       denominator--;
428   for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
429     if (frequency[i] == 0)
430       resProb[i] = 0.0;
431     else
432       resProb[i] = ((Nlm_FloatHi) (frequency[i])) /((Nlm_FloatHi) denominator);
433   }
434 }
435 
436 /*matrix is a position-specific score matrix with matrixLength positions
437   queryProbArray is an array containing the probability of occurrence
438   of each residue in the query
439   scoreArray is an array of probabilities for each score that is
440     to be used as a field in return_sfp
441   return_sfp is a the structure to be filled in and returned
442   range is the size of scoreArray and is an upper bound on the
443    difference between maximum score and minimum score in the matrix
444   the routine fillSfp computes the probability of each score weighted
445    by the probability of each query residue and fills those probabilities
446    into scoreArray and puts scoreArray as a field in
447    that in the structure that is returned
448    for indexing convenience the field storing scoreArray points to the
449    entry for score 0, so that referring to the -k index corresponds to
450    score -k */
451 BLAST_ScoreFreqPtr LIBCALL
IMPALAfillSfp(BLAST_Score ** matrix,Int4 matrixLength,Nlm_FloatHi * queryProbArray,Nlm_FloatHi * scoreArray,BLAST_ScoreFreqPtr return_sfp,Int4 range)452 IMPALAfillSfp(BLAST_Score **matrix, Int4 matrixLength, Nlm_FloatHi *queryProbArray, Nlm_FloatHi *scoreArray,  BLAST_ScoreFreqPtr return_sfp, Int4 range)
453 {
454   Int4 minScore, maxScore; /*observed minimum and maximum scores*/
455   Int4 i,j; /* indices */
456   Nlm_FloatHi onePosFrac; /*1/matrix length as a double*/
457 
458   minScore = maxScore = 0;
459 
460   for(i = 0; i < matrixLength; i++) {
461     for(j = 0 ; j < PRO_ALPHABET_SIZE; j++) {
462       if (Xchar == j)
463         continue;
464       if ((matrix[i][j] != BLAST_SCORE_MIN) && (matrix[i][j] < minScore))
465 	minScore = matrix[i][j];
466       if (matrix[i][j] > maxScore)
467         maxScore = matrix[i][j];
468     }
469   }
470   return_sfp->obs_min = minScore;
471   return_sfp->obs_max = maxScore;
472   for (i = 0; i < range; i++)
473     scoreArray[i] = 0.0;
474   return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
475   onePosFrac = 1.0/ ((Nlm_FloatHi) matrixLength);
476   for(i = 0; i < matrixLength; i++) {
477     for (j = 0; j < PRO_ALPHABET_SIZE; j++) {
478       if (Xchar == j)
479         continue;
480       if(matrix[i][j] >= minScore) {
481         return_sfp->sprob[matrix[i][j]] += (onePosFrac * queryProbArray[j]);
482       }
483     }
484   }
485   return_sfp->score_avg = 0;
486   for(i = minScore; i <= maxScore; i++)
487     return_sfp->score_avg += i * return_sfp->sprob[i];
488   return(return_sfp);
489 }
490 
491 /*Gets the scores of an alignment together into a ScorePtr;
492   adapted from similar code with a different formula in pseed3.c*/
addScoresToSeqAlign(Int4 rawScore,Nlm_FloatHi eValue,Nlm_FloatHi Lambda,Nlm_FloatHi logK)493 ScorePtr LIBCALL addScoresToSeqAlign(Int4 rawScore, Nlm_FloatHi eValue,
494            Nlm_FloatHi Lambda, Nlm_FloatHi logK)
495 {
496   Nlm_FloatHi bitScoreUnrounded; /*conversion for raw score to bit score*/
497   ScorePtr returnScore = NULL;
498 
499   MakeBlastScore(&returnScore,"score",0.0, rawScore);
500   MakeBlastScore(&returnScore,"e_value",eValue,0);
501   bitScoreUnrounded = ((rawScore * Lambda) - logK)/NCBIMATH_LN2;
502   MakeBlastScore(&returnScore,"bit_score",bitScoreUnrounded, 0);
503   return(returnScore);
504 }
505 
506 /*Bubble sort the entries in qs from index i through j*/
pro_bbsort(SWResults ** qs,Int4 i,Int4 j)507 static void pro_bbsort(SWResults **qs, Int4 i, Int4 j)
508 {
509     Int4 x, y; /*loop bounds for the two ends of the array to be sorted*/
510     SWResults *sp; /*temporary pointer for swapping*/
511 
512     for (x = j; x > i; x--) {
513       for (y = i; y < x; y++) {
514 	if ((qs[y]->eValue < qs[y+1]->eValue) ||
515 	    ((qs[y]->eValue == qs[y+1]->eValue) &&
516              (0.0 == qs[y]->eValue) &&
517              (qs[y]->score > qs[y+1]->score)) ||
518             ((qs[y]->eValue == qs[y+1]->eValue) &&
519              ((0.0 != qs[y]->eValue) || (qs[y]->score == qs[y+1]->score)) &&
520              (qs[y]->subject_index < qs[y+1]->subject_index)) ||
521             ((qs[y]->eValue == qs[y+1]->eValue) &&
522              (qs[y]->subject_index == qs[y+1]->subject_index) &&
523                (qs[y]->eValueThisAlign < qs[y+1]->eValueThisAlign)) ||
524             ((qs[y]->eValue == qs[y+1]->eValue) &&
525              (qs[y]->subject_index == qs[y+1]->subject_index) &&
526                (qs[y]->eValueThisAlign == qs[y+1]->eValueThisAlign) &&
527 	     (qs[y]->scoreThisAlign > qs[y+1]-> scoreThisAlign))) {
528 	  /*swap pointers for inverted adjacent elements*/
529 	  sp = qs[y];
530 	  qs[y] = qs[y+1];
531 	  qs[y+1] = sp;
532 	}
533       }
534     }
535 }
536 
537 /*choose the median of  elemnts indexed i, i+j/2, j for
538 partitioning in quicksort, if median is not in
539 position i to start with, swap it to position i*/
medianOfThree(SWResults ** qs,Int4 i,Int4 j)540 static void  medianOfThree(SWResults **qs, Int4 i, Int4 j)
541 {
542   Int4 middle;
543   Int4 swapIndex;
544   SWResults *swapTemp;
545 
546   middle = (i+j)/2;
547   if (qs[i]->eValue < qs[middle]->eValue) {
548     if (qs[middle]->eValue < qs[j]->eValue)
549       swapIndex = middle;
550     else
551       if (qs[i]->eValue < qs[j]->eValue)
552         swapIndex = j;
553       else
554         swapIndex = i;
555   }
556   else {
557     if (qs[j]->eValue < qs[middle]->eValue)
558       swapIndex = middle;
559     else
560       if (qs[j]->eValue < qs[i]->eValue)
561         swapIndex = j;
562       else
563         swapIndex = i;
564   }
565   if (i != swapIndex) {
566     swapTemp = qs[i];
567     qs[i] = qs[swapIndex];
568     qs[swapIndex] = swapTemp;
569   }
570 }
571 
572 /*quicksort the entries in qs from qs[i] through qs[j] */
pro_quicksort(SWResults ** qs,Int4 i,Int4 j)573 static void pro_quicksort(SWResults **qs, Int4 i, Int4 j)
574 {
575     Int4 lf, rt;  /*left and right fingers into the array*/
576     Nlm_FloatHi partitionEvalue; /*Evalue to partition around*/
577     Int4  secondaryPartitionValue; /*for breaking ties*/
578     Int4 scorePartitionValue; /*for breaking ties with 0.0 eValue*/
579     Nlm_FloatHi tertiaryPartitionValue; /*for breaking ties*/
580     SWResults * tp; /*temporary pointer for swapping*/
581     if (j-i <= SORT_THRESHOLD) {
582       pro_bbsort(qs, i,j);
583       return;
584     }
585 
586     lf = i+1;
587     rt = j;
588     /*use median of three since array may be nearly sorted*/
589     medianOfThree(qs, i, j);
590     /*implicitly choose qs[i] as the partition element*/
591     partitionEvalue = qs[i]->eValue;
592     secondaryPartitionValue = qs[i]->subject_index;
593     tertiaryPartitionValue = qs[i]->eValueThisAlign;
594     scorePartitionValue = qs[i]->score;
595     /*partititon around partitionEvalue = qs[i]*/
596     while (lf <= rt) {
597       while ((qs[lf]->eValue >  partitionEvalue)  ||
598               ((qs[lf]->eValue == partitionEvalue) && (0.0 == qs[lf]->eValue)
599                 && (qs[lf]->score < scorePartitionValue)) ||
600               ((qs[lf]->eValue == partitionEvalue) &&
601                ((0.0 != qs[lf]->eValue) || (qs[lf]->score == scorePartitionValue)) &&
602                (qs[lf]->subject_index > secondaryPartitionValue)) ||
603 	      ((qs[lf]->eValue == partitionEvalue) &&
604                (qs[lf]->subject_index == secondaryPartitionValue) &&
605                (qs[lf]->eValueThisAlign > tertiaryPartitionValue)))
606 	lf++;
607       while ((qs[rt]->eValue <  partitionEvalue)  ||
608               ((qs[rt]->eValue == partitionEvalue) && (0.0 == qs[rt]->eValue)
609                && (qs[rt]->score > scorePartitionValue)) ||
610               ((qs[rt]->eValue == partitionEvalue) &&
611 	       ((0.0 != qs[rt]->eValue) || (qs[rt]->score == scorePartitionValue)) &&
612                (qs[rt]->subject_index < secondaryPartitionValue)) ||
613 	      ((qs[rt]->eValue == partitionEvalue) &&
614                (qs[rt]->subject_index == secondaryPartitionValue) &&
615                (qs[rt]->eValueThisAlign < tertiaryPartitionValue)))
616 	rt--;
617 
618 
619       if (lf < rt) {
620 	/*swap elements on wrong side of partition*/
621 	tp = qs[lf];
622 	qs[lf] = qs[rt];
623 	qs[rt] = tp;
624 	rt--;
625 	lf++;
626       }
627       else
628 	break;
629     }
630     /*swap partition element into middle position*/
631     tp = qs[i];
632     qs[i] = qs[rt];
633     qs[rt] = tp;
634     /*call recursively on two parts*/
635     pro_quicksort(qs, i,rt-1);
636     pro_quicksort(qs, rt+1, j);
637 }
638 
639 
640 
641 /*Sort the sequences that hit the query by increasing score;
642   This routine converts the list in proResultsList to
643   an array for sorting and then converts back to a singly-linked list
644   adapted from quicksort_hits of pseed3.c
645   no_of_seq is the number of entries in the results list*/
pro_quicksort_hits(Int4 no_of_seq,SWResults ** proResultsList)646 void LIBCALL pro_quicksort_hits(Int4 no_of_seq, SWResults **proResultsList)
647 {
648   Int4 i; /*loop index for the resutls list*/
649     SWResults *sp; /*pointer to one entry in the array*/
650     SWResults sentinel; /*sentinel to add at the end of the array*/
651     SWResults **qs; /*local array for sorting*/
652 
653     /*Copy the list starting from proResultsList
654       into the array qs*/
655     qs = (SWResults **) MemNew(sizeof(SWResults*)*(no_of_seq+1));
656     for (i = 0, sp = (*proResultsList);
657 	 i < no_of_seq; i++, sp = sp->next)
658       qs[i] = sp;
659     /*Put sentinel at the end of the array*/
660     qs[i] = &sentinel;
661     sentinel.eValue = -0.1;
662     sentinel.eValueThisAlign = -0.1;
663     pro_quicksort(qs, 0, no_of_seq-1);
664     /*Copy back to the list starting at seedResults->listOfMatchingSequences*/
665     for (i = no_of_seq-1; i > 0; i--)
666       qs[i]->next = qs[i-1];
667     qs[0]->next = NULL;
668     (*proResultsList) = qs[no_of_seq-1];
669     MemFree(qs);
670 }
671 
672