1 static char const rcsid[] = "$Id: impatool.c,v 6.16 2006/06/30 18:44:40 camacho Exp $";
2
3 /* $Id: impatool.c,v 6.16 2006/06/30 18:44:40 camacho Exp $
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28
29 /*****************************************************************************
30
31 File name: impatool.c
32
33 Author: Alejandro Schaffer
34
35 Contents: utility routines for IMPALA.
36
37 $Revision: 6.16 $
38
39 $Log: impatool.c,v $
40 Revision 6.16 2006/06/30 18:44:40 camacho
41 Add support for O and J to getRes, ResToInt
42
43 Revision 6.15 2004/10/04 13:35:53 camacho
44 Do not use hard coded constants in IMPALAfindUngappedLambda
45
46 Revision 6.14 2004/06/30 14:14:50 madden
47 Remove add_string_to_bufferEx, already defined in blfmtutl.c
48
49 Revision 6.13 2004/06/30 13:57:27 kans
50 add_string_to_bufferEx had to be LIBCALL because of blfmtutl.h prototype
51
52 Revision 6.12 2004/06/30 13:42:27 kans
53 include <blfmtutl.h> to clear up Mac compiler missing prototype errors
54
55 Revision 6.11 2003/05/30 17:25:36 coulouri
56 add rcsid
57
58 Revision 6.10 2001/12/28 18:01:54 dondosha
59 Modified sorting routines to break ties by score when E-value is (mis)-reported as 0.0 due to underflow in BlastKarlinStoE_simple
60
61 Revision 6.9 2000/07/26 16:54:07 lewisg
62 add LIBCALLs
63
64 Revision 6.8 2000/07/25 18:12:05 shavirin
65 WARNING: This is no-turning-back changed related to S&W Blast from
66 Alejandro Schaffer
67
68
69 *****************************************************************************/
70
71
72
73 #include <ncbi.h>
74 #include <objseq.h>
75 #include <objsset.h>
76 #include <sequtil.h>
77 #include <seqport.h>
78 #include <tofasta.h>
79 #include <blast.h>
80 #include <blastpri.h>
81 #include <txalign.h>
82 #include <simutil.h>
83 #include <posit.h>
84 #include <profiles.h>
85 #include <blfmtutl.h>
86
87
88 /*convert a residue character to its integer representation*/
getRes(Char input)89 Char LIBCALL getRes(Char input)
90 {
91 switch(input)
92 {
93 case 0:
94 return('-');
95 case 1:
96 return('A');
97 case 2:
98 return('B');
99 case 3:
100 return('C');
101 case 4:
102 return('D');
103 case 5:
104 return('E');
105 case 6:
106 return('F');
107 case 7:
108 return('G');
109 case 8:
110 return('H');
111 case 9:
112 return('I');
113 case 10:
114 return('K');
115 case 11:
116 return('L');
117 case 12:
118 return('M');
119 case 13:
120 return('N');
121 case 14:
122 return('P');
123 case 15:
124 return('Q');
125 case 16:
126 return('R');
127 case 17:
128 return('S');
129 case 18:
130 return('T');
131 case 19:
132 return('V');
133 case 20:
134 return('W');
135 case 21:
136 return('X');
137 case 22:
138 return('Y');
139 case 23:
140 return('Z');
141 case 24:
142 return('U');
143 case 25:
144 return('*');
145 case 26:
146 return('O');
147 case 27:
148 return('J');
149 default:
150 return('?');
151 }
152 }
153
154
155 /*get the citation for IMPALA*/
156
157 static CharPtr
IMPALAGetReference(Boolean html)158 IMPALAGetReference(Boolean html)
159
160 {
161 CharPtr ret_buffer;
162 Int2 ret_buffer_length;
163
164 ret_buffer = NULL;
165 ret_buffer_length = 0;
166
167
168 if (html) {
169 ;
170 } else
171 add_string_to_bufferEx("Reference: Alejandro A. Schaffer, Yuri I. Wolf, Chris P. Ponting, ", &ret_buffer, &ret_buffer_length, TRUE);
172 add_string_to_bufferEx("Eugene V. Koonin, L. Aravind, Stephen F. Altschul (1999), ", &ret_buffer, &ret_buffer_length, TRUE);
173 add_string_to_bufferEx("\"IMPALA: Matching a Protein Sequence Against a Collection of ", &ret_buffer, &ret_buffer_length, TRUE);
174 add_string_to_bufferEx("\"PSI-BLAST-Constructed Position-Specific Score Matrices\",", &ret_buffer, &ret_buffer_length, TRUE);
175 add_string_to_bufferEx("Bioinformatics 15:1000-1011.", &ret_buffer, &ret_buffer_length, TRUE);
176 return ret_buffer;
177 }
178
179 /*print the citation for IMPALA*/
180 Boolean LIBCALL
IMPALAPrintReference(Boolean html,Int4 line_length,FILE * outfp)181 IMPALAPrintReference(Boolean html, Int4 line_length, FILE *outfp)
182
183 {
184 CharPtr ret_buffer;
185
186 if (outfp == NULL)
187 return FALSE;
188
189 ret_buffer = IMPALAGetReference(html);
190 PrintTildeSepLines(ret_buffer, line_length, outfp);
191 ret_buffer = MemFree(ret_buffer);
192
193 return TRUE;
194 }
195
196 static CharPtr
makematGetHelp(Boolean html)197 makematGetHelp(Boolean html)
198 {
199
200 CharPtr ret_buffer;
201 Int2 ret_buffer_length;
202
203 ret_buffer = NULL;
204 ret_buffer_length = 0;
205
206
207 if (html) {
208 ;
209 } else {
210 add_string_to_bufferEx("makemat is the first profile preprocessor in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
211 add_string_to_bufferEx("makemat converts a collection of byte-encoded profiles, created by the -C option of PSI-BLAST, into portable ASCII form.", &ret_buffer, &ret_buffer_length, TRUE);
212 add_string_to_bufferEx("Prepare the following files: ", &ret_buffer, &ret_buffer_length, TRUE);
213 add_string_to_bufferEx("i. a collection of PSI-BLAST-generated profiles with arbitrary names and suffix .chk ", &ret_buffer, &ret_buffer_length, TRUE);
214 add_string_to_bufferEx("ii. a collection of master sequences, associated with the profiles, each in a separate file with arbitrary name and a 3 character suffix starting with c", &ret_buffer, &ret_buffer_length, TRUE);
215 add_string_to_bufferEx("iii. a list of profile file names, one per line, named <database_name>.pn", &ret_buffer, &ret_buffer_length, TRUE);
216 add_string_to_bufferEx("iv. a list of master sequence file names, one per line, in the same order as a list of profile names, named <database_name>.sn", &ret_buffer, &ret_buffer_length, TRUE);
217 add_string_to_bufferEx("The following files will be created:", &ret_buffer, &ret_buffer_length, TRUE);
218 add_string_to_bufferEx("i. a collection of ASCII files, corresponding to each of the original profiles, named <profile_name>.mtx", &ret_buffer, &ret_buffer_length, TRUE);
219 add_string_to_bufferEx("ii. a list of ASCII matrix files, named <database_name>.mn", &ret_buffer, &ret_buffer_length, TRUE);
220 add_string_to_bufferEx("iii. an ASCII file with auxiliary information, named <database_name>.aux", &ret_buffer, &ret_buffer_length, TRUE);
221 add_string_to_bufferEx("For the list of arguments to makemat enter makemat -", &ret_buffer, &ret_buffer_length, TRUE);
222 }
223 return ret_buffer;
224 }
225
226 static CharPtr
copymatGetHelp(Boolean html)227 copymatGetHelp(Boolean html)
228 {
229
230
231 CharPtr ret_buffer;
232 Int2 ret_buffer_length;
233
234 ret_buffer = NULL;
235 ret_buffer_length = 0;
236
237
238 if (html) {
239 ;
240 } else {
241 add_string_to_bufferEx("copymat is the second profile preprocessor in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
242 add_string_to_bufferEx("copymat converts ASCII matrices, produced by the primary preprocessor, makemat into a database that can be read into memory quickly", &ret_buffer, &ret_buffer_length, TRUE);
243 add_string_to_bufferEx("Prepare the following files:", &ret_buffer, &ret_buffer_length, TRUE);
244 add_string_to_bufferEx("i. a collection of ASCII files, correspondingto each of the original profiles, named <profile_name>.mtx (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
245 add_string_to_bufferEx("ii. a collection of profile master sequences, associated with the profiles, each in a separate file with arbitrary name and a 3-charecter suffix starting with c", &ret_buffer, &ret_buffer_length, TRUE);
246 add_string_to_bufferEx("iii. a list of ASCII_matrix files, named <database_name>.mn (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
247 add_string_to_bufferEx("iv. a list of master sequence file names, one per line, in the same order as a list of matrix names, named <database_name>.sn;", &ret_buffer, &ret_buffer_length, TRUE);
248 add_string_to_bufferEx("v. ASCII file with auxiliary information, named <database_name>.aux (created by makemat)", &ret_buffer, &ret_buffer_length, TRUE);
249 add_string_to_bufferEx("The files input to copymatices are in ASCII format and thus portable between machines with different encodings for machine-readable files", &ret_buffer, &ret_buffer_length, TRUE);
250 add_string_to_bufferEx("The following file will be created:", &ret_buffer, &ret_buffer_length, TRUE);
251 add_string_to_bufferEx("i. a huge binary file, containing all profile matrices, named <database_name>.mat", &ret_buffer, &ret_buffer_length, TRUE);
252 }
253 return ret_buffer;
254 }
255
256 static CharPtr
impalaGetHelp(Boolean html)257 impalaGetHelp(Boolean html)
258 {
259 CharPtr ret_buffer;
260 Int2 ret_buffer_length;
261
262 ret_buffer = NULL;
263 ret_buffer_length = 0;
264
265
266 if (html) {
267 ;
268 } else {
269 add_string_to_bufferEx("impala is the main program in the IMPALA software package.", &ret_buffer, &ret_buffer_length, TRUE);
270 add_string_to_bufferEx("impala searches for matches between a query sequence and a library of score matrices, prepared by makemat and copymat.", &ret_buffer, &ret_buffer_length, TRUE);
271 add_string_to_bufferEx("impala produces BLAST-formatted output", &ret_buffer, &ret_buffer_length, TRUE);
272 add_string_to_bufferEx("Before you start searching, check that in the directory with the library you have the following files.", &ret_buffer, &ret_buffer_length, TRUE);
273 add_string_to_bufferEx("If the library has K matrices, you should have:", &ret_buffer, &ret_buffer_length, TRUE);
274 add_string_to_bufferEx("K files with names ending in .mtx", &ret_buffer, &ret_buffer_length, TRUE);
275 add_string_to_bufferEx("K files with names ending in a 3-letter extension starting with c", &ret_buffer, &ret_buffer_length, TRUE);
276 add_string_to_bufferEx("1 file with name ending in .sn", &ret_buffer, &ret_buffer_length, TRUE);
277 add_string_to_bufferEx("1 file with name ending in .aux", &ret_buffer, &ret_buffer_length, TRUE);
278 add_string_to_bufferEx("1 file with name ending in .mn", &ret_buffer, &ret_buffer_length, TRUE);
279 add_string_to_bufferEx("1 file with name ending in .mat", &ret_buffer, &ret_buffer_length, TRUE);
280 }
281 return ret_buffer;
282 }
283
284 Boolean LIBCALL
IMPALAPrintHelp(Boolean html,Int4 line_length,Char * programName,FILE * outfp)285 IMPALAPrintHelp(Boolean html, Int4 line_length, Char * programName, FILE *outfp)
286
287 {
288 CharPtr ret_buffer;
289
290 if (outfp == NULL)
291 return FALSE;
292
293 if (0 == strcmp(programName,"makemat"))
294 ret_buffer = makematGetHelp(html);
295 if (0 == strcmp(programName,"copymat"))
296 ret_buffer = copymatGetHelp(html);
297 if (0 == strcmp(programName,"impala"))
298 ret_buffer = impalaGetHelp(html);
299
300 PrintTildeSepLines(ret_buffer, line_length, outfp);
301 ret_buffer = MemFree(ret_buffer);
302
303 return TRUE;
304 }
305
306
307
308 /*prefix and suffix are strings, returns a string with prefix
309 concatenated to suffix. Used to build multiple distinct
310 file names with a common prefix*/
addSuffixToName(Char * prefix,Char * suffix)311 Char * LIBCALL addSuffixToName(Char *prefix, Char *suffix)
312 {
313 Char *returnName; /*string to return*/
314 Int4 i,j; /*loop indices*/
315 Int4 prefixLength, suffixLength, totalLength; /*length of pieces and whole*/
316
317
318 prefixLength = strlen(prefix);
319 suffixLength = strlen(suffix);
320 totalLength = prefixLength + suffixLength;
321 returnName = (Char *) MemNew((totalLength + 1) * sizeof(Char));
322 for(i = 0; i < prefixLength; i++)
323 returnName[i] = prefix[i];
324 for(j = 0, i = prefixLength; j < suffixLength; i++, j++)
325 returnName[i] = suffix[j];
326 returnName[totalLength] = '\0';
327 return(returnName);
328 }
329
330 /*Use the specified name of the database to make file names
331 for the auxiliary information, mmap'd score matrices, list of
332 sequence files, and list of matrices, and lists of checkpoints.
333 Also extract a directory prefix that is to be prepended
334 to all file names for sequences and matrices and checkpoints */
impalaMakeFileNames(Char * matrixDbName,Char * auxiliaryFileName,Char * mmapFileName,Char * seqFileName,Char * matrixFileName,Char * ckptFileName,Char * directoryPrefix)335 void LIBCALL impalaMakeFileNames(Char * matrixDbName,
336 Char * auxiliaryFileName, Char * mmapFileName,
337 Char * seqFileName, Char *matrixFileName,
338 Char * ckptFileName,
339 Char *directoryPrefix)
340 {
341 Int4 commonLength; /*length of common name prefix*/
342 Int4 c; /*loop index*/
343
344 commonLength = strlen(matrixDbName);
345 strcpy(directoryPrefix,matrixDbName);
346 /*remove file name for profile library to peel back to last '/' */
347 for(c = commonLength; c >= 0; c--)
348 if ('/' == directoryPrefix[c])
349 break;
350 directoryPrefix[c+1] = '\0';
351 if (NULL != auxiliaryFileName)
352 strcpy(auxiliaryFileName,matrixDbName);
353 if (NULL != mmapFileName)
354 strcpy(mmapFileName,matrixDbName);
355 if (NULL != seqFileName)
356 strcpy(seqFileName,matrixDbName);
357 if (NULL != matrixFileName)
358 strcpy(matrixFileName,matrixDbName);
359 if (NULL != ckptFileName)
360 strcpy(ckptFileName,matrixDbName);
361 if (NULL != auxiliaryFileName) {
362 auxiliaryFileName[commonLength] = '.';
363 auxiliaryFileName[commonLength +1] = 'a';
364 auxiliaryFileName[commonLength + 2] = 'u';
365 auxiliaryFileName[commonLength + 3] = 'x';
366 auxiliaryFileName[commonLength + 4] = '\0';
367 }
368 if (NULL != mmapFileName) {
369 mmapFileName[commonLength] = '.';
370 mmapFileName[commonLength + 1] = 'm';
371 mmapFileName[commonLength + 2] = 'a';
372 mmapFileName[commonLength + 3] = 't';
373 mmapFileName[commonLength + 4] = '\0';
374 }
375 if (NULL != seqFileName) {
376 seqFileName[commonLength] = '.';
377 seqFileName[commonLength + 1] = 's';
378 seqFileName[commonLength+2] = 'n';
379 seqFileName[commonLength+3] = '\0';
380 }
381 if (NULL != matrixFileName) {
382 matrixFileName[commonLength] = '.';
383 matrixFileName[commonLength + 1] = 'm';
384 matrixFileName[commonLength+2] = 'n';
385 matrixFileName[commonLength+3] = '\0';
386 }
387 if (NULL != ckptFileName) {
388 ckptFileName[commonLength] = '.';
389 ckptFileName[commonLength + 1] = 'p';
390 ckptFileName[commonLength+2] = 'n';
391 ckptFileName[commonLength+3] = '\0';
392 }
393 }
394
395 Nlm_FloatHi LIBCALL
IMPALAfindUngappedLambda(Char * matrixName)396 IMPALAfindUngappedLambda(Char *matrixName)
397 {
398 Nlm_FloatHi* lambda_array = NULL;
399 int num_lambdas = BlastKarlinGetMatrixValues(matrixName, NULL, NULL,
400 &lambda_array, NULL, NULL,
401 NULL);
402 if (num_lambdas > 0) {
403 Nlm_FloatHi retval = lambda_array[0];
404 lambda_array = MemFree(lambda_array);
405 return retval;
406 } else {
407 return 0.0;
408 }
409 }
410
411 /*Given a sequence of 'length' amino acid residues, compute the
412 probability of each residue and put that in the array resProb*/
413 void LIBCALL
IMPALAfillResidueProbability(Uint1Ptr sequence,Int4 length,Nlm_FloatHi * resProb)414 IMPALAfillResidueProbability(Uint1Ptr sequence, Int4 length, Nlm_FloatHi * resProb)
415 {
416 Int4 frequency[PRO_ALPHABET_SIZE]; /*frequency of each letter*/
417 Int4 i; /*index*/
418 Int4 denominator; /*length not including X's*/
419
420 denominator = length;
421 for(i = 0; i < PRO_ALPHABET_SIZE; i++)
422 frequency[i] = 0;
423 for(i = 0; i < length; i++)
424 if (Xchar != sequence[i])
425 frequency[sequence[i]]++;
426 else
427 denominator--;
428 for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
429 if (frequency[i] == 0)
430 resProb[i] = 0.0;
431 else
432 resProb[i] = ((Nlm_FloatHi) (frequency[i])) /((Nlm_FloatHi) denominator);
433 }
434 }
435
436 /*matrix is a position-specific score matrix with matrixLength positions
437 queryProbArray is an array containing the probability of occurrence
438 of each residue in the query
439 scoreArray is an array of probabilities for each score that is
440 to be used as a field in return_sfp
441 return_sfp is a the structure to be filled in and returned
442 range is the size of scoreArray and is an upper bound on the
443 difference between maximum score and minimum score in the matrix
444 the routine fillSfp computes the probability of each score weighted
445 by the probability of each query residue and fills those probabilities
446 into scoreArray and puts scoreArray as a field in
447 that in the structure that is returned
448 for indexing convenience the field storing scoreArray points to the
449 entry for score 0, so that referring to the -k index corresponds to
450 score -k */
451 BLAST_ScoreFreqPtr LIBCALL
IMPALAfillSfp(BLAST_Score ** matrix,Int4 matrixLength,Nlm_FloatHi * queryProbArray,Nlm_FloatHi * scoreArray,BLAST_ScoreFreqPtr return_sfp,Int4 range)452 IMPALAfillSfp(BLAST_Score **matrix, Int4 matrixLength, Nlm_FloatHi *queryProbArray, Nlm_FloatHi *scoreArray, BLAST_ScoreFreqPtr return_sfp, Int4 range)
453 {
454 Int4 minScore, maxScore; /*observed minimum and maximum scores*/
455 Int4 i,j; /* indices */
456 Nlm_FloatHi onePosFrac; /*1/matrix length as a double*/
457
458 minScore = maxScore = 0;
459
460 for(i = 0; i < matrixLength; i++) {
461 for(j = 0 ; j < PRO_ALPHABET_SIZE; j++) {
462 if (Xchar == j)
463 continue;
464 if ((matrix[i][j] != BLAST_SCORE_MIN) && (matrix[i][j] < minScore))
465 minScore = matrix[i][j];
466 if (matrix[i][j] > maxScore)
467 maxScore = matrix[i][j];
468 }
469 }
470 return_sfp->obs_min = minScore;
471 return_sfp->obs_max = maxScore;
472 for (i = 0; i < range; i++)
473 scoreArray[i] = 0.0;
474 return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
475 onePosFrac = 1.0/ ((Nlm_FloatHi) matrixLength);
476 for(i = 0; i < matrixLength; i++) {
477 for (j = 0; j < PRO_ALPHABET_SIZE; j++) {
478 if (Xchar == j)
479 continue;
480 if(matrix[i][j] >= minScore) {
481 return_sfp->sprob[matrix[i][j]] += (onePosFrac * queryProbArray[j]);
482 }
483 }
484 }
485 return_sfp->score_avg = 0;
486 for(i = minScore; i <= maxScore; i++)
487 return_sfp->score_avg += i * return_sfp->sprob[i];
488 return(return_sfp);
489 }
490
491 /*Gets the scores of an alignment together into a ScorePtr;
492 adapted from similar code with a different formula in pseed3.c*/
addScoresToSeqAlign(Int4 rawScore,Nlm_FloatHi eValue,Nlm_FloatHi Lambda,Nlm_FloatHi logK)493 ScorePtr LIBCALL addScoresToSeqAlign(Int4 rawScore, Nlm_FloatHi eValue,
494 Nlm_FloatHi Lambda, Nlm_FloatHi logK)
495 {
496 Nlm_FloatHi bitScoreUnrounded; /*conversion for raw score to bit score*/
497 ScorePtr returnScore = NULL;
498
499 MakeBlastScore(&returnScore,"score",0.0, rawScore);
500 MakeBlastScore(&returnScore,"e_value",eValue,0);
501 bitScoreUnrounded = ((rawScore * Lambda) - logK)/NCBIMATH_LN2;
502 MakeBlastScore(&returnScore,"bit_score",bitScoreUnrounded, 0);
503 return(returnScore);
504 }
505
506 /*Bubble sort the entries in qs from index i through j*/
pro_bbsort(SWResults ** qs,Int4 i,Int4 j)507 static void pro_bbsort(SWResults **qs, Int4 i, Int4 j)
508 {
509 Int4 x, y; /*loop bounds for the two ends of the array to be sorted*/
510 SWResults *sp; /*temporary pointer for swapping*/
511
512 for (x = j; x > i; x--) {
513 for (y = i; y < x; y++) {
514 if ((qs[y]->eValue < qs[y+1]->eValue) ||
515 ((qs[y]->eValue == qs[y+1]->eValue) &&
516 (0.0 == qs[y]->eValue) &&
517 (qs[y]->score > qs[y+1]->score)) ||
518 ((qs[y]->eValue == qs[y+1]->eValue) &&
519 ((0.0 != qs[y]->eValue) || (qs[y]->score == qs[y+1]->score)) &&
520 (qs[y]->subject_index < qs[y+1]->subject_index)) ||
521 ((qs[y]->eValue == qs[y+1]->eValue) &&
522 (qs[y]->subject_index == qs[y+1]->subject_index) &&
523 (qs[y]->eValueThisAlign < qs[y+1]->eValueThisAlign)) ||
524 ((qs[y]->eValue == qs[y+1]->eValue) &&
525 (qs[y]->subject_index == qs[y+1]->subject_index) &&
526 (qs[y]->eValueThisAlign == qs[y+1]->eValueThisAlign) &&
527 (qs[y]->scoreThisAlign > qs[y+1]-> scoreThisAlign))) {
528 /*swap pointers for inverted adjacent elements*/
529 sp = qs[y];
530 qs[y] = qs[y+1];
531 qs[y+1] = sp;
532 }
533 }
534 }
535 }
536
537 /*choose the median of elemnts indexed i, i+j/2, j for
538 partitioning in quicksort, if median is not in
539 position i to start with, swap it to position i*/
medianOfThree(SWResults ** qs,Int4 i,Int4 j)540 static void medianOfThree(SWResults **qs, Int4 i, Int4 j)
541 {
542 Int4 middle;
543 Int4 swapIndex;
544 SWResults *swapTemp;
545
546 middle = (i+j)/2;
547 if (qs[i]->eValue < qs[middle]->eValue) {
548 if (qs[middle]->eValue < qs[j]->eValue)
549 swapIndex = middle;
550 else
551 if (qs[i]->eValue < qs[j]->eValue)
552 swapIndex = j;
553 else
554 swapIndex = i;
555 }
556 else {
557 if (qs[j]->eValue < qs[middle]->eValue)
558 swapIndex = middle;
559 else
560 if (qs[j]->eValue < qs[i]->eValue)
561 swapIndex = j;
562 else
563 swapIndex = i;
564 }
565 if (i != swapIndex) {
566 swapTemp = qs[i];
567 qs[i] = qs[swapIndex];
568 qs[swapIndex] = swapTemp;
569 }
570 }
571
572 /*quicksort the entries in qs from qs[i] through qs[j] */
pro_quicksort(SWResults ** qs,Int4 i,Int4 j)573 static void pro_quicksort(SWResults **qs, Int4 i, Int4 j)
574 {
575 Int4 lf, rt; /*left and right fingers into the array*/
576 Nlm_FloatHi partitionEvalue; /*Evalue to partition around*/
577 Int4 secondaryPartitionValue; /*for breaking ties*/
578 Int4 scorePartitionValue; /*for breaking ties with 0.0 eValue*/
579 Nlm_FloatHi tertiaryPartitionValue; /*for breaking ties*/
580 SWResults * tp; /*temporary pointer for swapping*/
581 if (j-i <= SORT_THRESHOLD) {
582 pro_bbsort(qs, i,j);
583 return;
584 }
585
586 lf = i+1;
587 rt = j;
588 /*use median of three since array may be nearly sorted*/
589 medianOfThree(qs, i, j);
590 /*implicitly choose qs[i] as the partition element*/
591 partitionEvalue = qs[i]->eValue;
592 secondaryPartitionValue = qs[i]->subject_index;
593 tertiaryPartitionValue = qs[i]->eValueThisAlign;
594 scorePartitionValue = qs[i]->score;
595 /*partititon around partitionEvalue = qs[i]*/
596 while (lf <= rt) {
597 while ((qs[lf]->eValue > partitionEvalue) ||
598 ((qs[lf]->eValue == partitionEvalue) && (0.0 == qs[lf]->eValue)
599 && (qs[lf]->score < scorePartitionValue)) ||
600 ((qs[lf]->eValue == partitionEvalue) &&
601 ((0.0 != qs[lf]->eValue) || (qs[lf]->score == scorePartitionValue)) &&
602 (qs[lf]->subject_index > secondaryPartitionValue)) ||
603 ((qs[lf]->eValue == partitionEvalue) &&
604 (qs[lf]->subject_index == secondaryPartitionValue) &&
605 (qs[lf]->eValueThisAlign > tertiaryPartitionValue)))
606 lf++;
607 while ((qs[rt]->eValue < partitionEvalue) ||
608 ((qs[rt]->eValue == partitionEvalue) && (0.0 == qs[rt]->eValue)
609 && (qs[rt]->score > scorePartitionValue)) ||
610 ((qs[rt]->eValue == partitionEvalue) &&
611 ((0.0 != qs[rt]->eValue) || (qs[rt]->score == scorePartitionValue)) &&
612 (qs[rt]->subject_index < secondaryPartitionValue)) ||
613 ((qs[rt]->eValue == partitionEvalue) &&
614 (qs[rt]->subject_index == secondaryPartitionValue) &&
615 (qs[rt]->eValueThisAlign < tertiaryPartitionValue)))
616 rt--;
617
618
619 if (lf < rt) {
620 /*swap elements on wrong side of partition*/
621 tp = qs[lf];
622 qs[lf] = qs[rt];
623 qs[rt] = tp;
624 rt--;
625 lf++;
626 }
627 else
628 break;
629 }
630 /*swap partition element into middle position*/
631 tp = qs[i];
632 qs[i] = qs[rt];
633 qs[rt] = tp;
634 /*call recursively on two parts*/
635 pro_quicksort(qs, i,rt-1);
636 pro_quicksort(qs, rt+1, j);
637 }
638
639
640
641 /*Sort the sequences that hit the query by increasing score;
642 This routine converts the list in proResultsList to
643 an array for sorting and then converts back to a singly-linked list
644 adapted from quicksort_hits of pseed3.c
645 no_of_seq is the number of entries in the results list*/
pro_quicksort_hits(Int4 no_of_seq,SWResults ** proResultsList)646 void LIBCALL pro_quicksort_hits(Int4 no_of_seq, SWResults **proResultsList)
647 {
648 Int4 i; /*loop index for the resutls list*/
649 SWResults *sp; /*pointer to one entry in the array*/
650 SWResults sentinel; /*sentinel to add at the end of the array*/
651 SWResults **qs; /*local array for sorting*/
652
653 /*Copy the list starting from proResultsList
654 into the array qs*/
655 qs = (SWResults **) MemNew(sizeof(SWResults*)*(no_of_seq+1));
656 for (i = 0, sp = (*proResultsList);
657 i < no_of_seq; i++, sp = sp->next)
658 qs[i] = sp;
659 /*Put sentinel at the end of the array*/
660 qs[i] = &sentinel;
661 sentinel.eValue = -0.1;
662 sentinel.eValueThisAlign = -0.1;
663 pro_quicksort(qs, 0, no_of_seq-1);
664 /*Copy back to the list starting at seedResults->listOfMatchingSequences*/
665 for (i = no_of_seq-1; i > 0; i--)
666 qs[i]->next = qs[i-1];
667 qs[0]->next = NULL;
668 (*proResultsList) = qs[no_of_seq-1];
669 MemFree(qs);
670 }
671
672