1 static char const rcsid[] = "$Id: copymat.c,v 6.50 2016/06/21 14:55:29 madden Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28
29 /*****************************************************************************
30
31 File name: copymat.c
32
33 Authors: Alejandro Schaffer, Sergei Shavirin
34
35 Contents: main routines for copymatrices program to convert
36 score matrices output by makematrices into a single byte-encoded file.
37
38 $Log: copymat.c,v $
39 Revision 6.50 2016/06/21 14:55:29 madden
40 Add one more argument for jumper changes, JIRA SB-1713
41
42 Revision 6.49 2008/11/04 16:44:38 maning
43 add type cast to fix compilation error
44
45 Revision 6.48 2008/02/01 14:04:25 madden
46 LookupTableWrapInit prototype change
47
48 Revision 6.47 2006/11/24 19:06:15 kans
49 added include of blast_filter.h
50
51 Revision 6.46 2006/11/21 17:24:20 papadopo
52 1. rearrange headers
53 2. change lookup table type
54
55 Revision 6.45 2006/09/15 15:45:35 madden
56 Change to LookupTableWrapInit
57
58 Revision 6.44 2005/12/22 14:22:19 papadopo
59 change signature of BLAST_FillLookupTableOptions
60
61 Revision 6.43 2005/12/20 15:36:39 papadopo
62 change name of structure field
63
64 Revision 6.42 2005/05/20 18:57:51 camacho
65 Update to use new signature to BLAST_FillLookupTableOptions
66
67 Revision 6.41 2005/02/14 14:11:55 camacho
68 Changes to use SBlastScoreMatrix
69
70 Revision 6.40 2005/01/10 13:48:20 madden
71 Change to BLAST_FillInitialWordOptions prototype
72
73 Revision 6.39 2004/09/15 17:40:21 papadopo
74 change use of ListNode to use of BlastSeqLoc for lookup table creation
75
76 Revision 6.38 2004/07/12 16:30:44 papadopo
77 LookupTable->BlastLookupTable
78
79 Revision 6.37 2004/06/22 16:45:56 camacho
80 Changed the blast_type_* definitions for the EBlastProgramType enumeration from
81 algo/blast.
82
83 Revision 6.36 2004/04/23 21:11:31 papadopo
84 force the thick backbone (dumped to the RPS .loo file) to contain the number of cells assumed by RPS blast
85
86 Revision 6.35 2004/04/16 14:48:04 papadopo
87 remove unneeded argument to FillLookupTableOptions
88
89 Revision 6.34 2004/04/07 21:48:48 camacho
90 Add missing header file
91
92 Revision 6.33 2004/04/06 12:15:44 camacho
93 Rename DoubleInt -> SSeqRange
94
95 Revision 6.32 2004/03/10 20:21:27 papadopo
96 add (unused) RPS blast parameters to FillLookupTableOptions
97
98 Revision 6.31 2004/03/04 21:16:10 papadopo
99 add (unused) RPS blast parameter to FillLookupTable call
100
101 Revision 6.30 2004/01/30 20:34:45 coulouri
102 fix minor nit to FileWrite call
103
104 Revision 6.29 2004/01/26 19:40:48 coulouri
105 * Correct buffer overrun
106 * Use offset rather than pointer in LookupBackboneCell
107
108 Revision 6.28 2003/11/24 18:18:47 coulouri
109 Correction to previous fix for 64-bit irix
110
111 Revision 6.27 2003/11/21 18:01:15 ivanov
112 Added extern definition for impalaMakeFileNames()
113
114 Revision 6.26 2003/11/20 15:44:32 camacho
115 Tom Madden's changes to use lookup table contruction code from algo/blast.
116
117 Revision 6.25 2003/05/30 17:31:09 coulouri
118 add rcsid
119
120 Revision 6.24 2003/05/13 16:02:42 coulouri
121 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
122
123 Revision 6.23 2002/11/06 21:26:47 ucko
124 RPSConcatSequences: provide useful error messages, ignore all trailing space.
125
126 Revision 6.22 2002/04/08 19:02:31 madden
127 Allow float for threshold
128
129 Revision 6.21 2001/06/07 16:45:08 shavirin
130 Removed bug related to 64bit address structure on SGI platform.
131
132 Revision 6.20 2001/04/12 19:50:12 madden
133 Comment out unrescaling of matrix
134
135 Revision 6.19 2000/11/14 23:17:52 shavirin
136 Removed serious bug under NT platform related to diffence in "w" and "wb"
137 flag when opening file on PC NT computer. Removed unused header files.
138
139 Revision 6.18 2000/11/13 21:25:22 shavirin
140 Fixed possible bug in the function RPSUpdatePointers (64 bit architecture
141 specific).
142
143 Revision 6.17 2000/11/08 18:34:19 kans
144 commented out UNIX-specific headers, included by ncbilcl.h for UNIX anyway
145
146 Revision 6.16 2000/10/20 21:46:37 shavirin
147 Added additional parameters for creating RPS database.
148
149 Revision 6.15 2000/02/29 16:27:39 shavirin
150 Added protection against matrix with scaleFactor != 1 for RPS Blast
151
152 Revision 6.14 2000/02/28 21:08:34 shavirin
153 This fixes DEC Alpha problems of RPS Blast.
154
155 Revision 6.13 2000/02/28 19:06:47 shavirin
156 Added comments for RPS Blast functions.
157 Removed unused code.
158
159 Revision 6.12 2000/02/22 19:29:06 shavirin
160 Fixed DEC Alpha specific bug in the function RPSCreateLookupFile().
161
162 Revision 6.11 2000/02/17 19:11:15 shavirin
163 Removed reference to theCacheSize.
164
165 Revision 6.10 2000/01/13 15:27:10 shavirin
166 Added concatenation of files into single file (for later formatdb).
167
168 Revision 6.9 2000/01/12 14:39:46 shavirin
169 Added parameter to set cache size in lookup table foe RPS Blast.
170
171 Revision 6.8 2000/01/07 22:31:47 shavirin
172 Lookup table header now has notice, that this is single table.
173
174 Revision 6.7 1999/12/30 18:34:20 shavirin
175 Last row in the matrix for every sequence will be gap-row (-INT2_MAX)
176
177 Revision 6.6 1999/12/29 18:49:29 shavirin
178 Changed a little format of RPS lookup tables file.
179
180
181 *****************************************************************************/
182
183
184 #include <ncbi.h>
185 #include <sequtil.h>
186 #include <seqport.h>
187 #include <tofasta.h>
188 #include <algo/blast/core/blast_aalookup.h>
189 #include <algo/blast/core/blast_stat.h>
190 #include <algo/blast/core/blast_encoding.h>
191 #include <algo/blast/core/lookup_wrap.h>
192 #include <algo/blast/core/blast_filter.h>
193
194 #ifndef MAXLINELEN
195 # define MAXLINELEN 2000
196 #endif
197 #ifndef MAX_NAME_LENGTH
198 # define MAX_NAME_LENGTH 500
199 #endif
200 #ifndef PRO_ALPHABET_SIZE
201 # define PRO_ALPHABET_SIZE 26
202 #endif
203 #ifndef SORT_THRESHOLD
204 # define SORT_THRESHOLD 20
205 #endif
206 #ifndef RPS_MAGIC_NUMBER
207 # define RPS_MAGIC_NUMBER 7702
208 #endif
209 #ifndef RPS_ARRAY_SIZE
210 # define RPS_ARRAY_SIZE 32768
211 #endif
212 /*factor used to multiply the gapped K parameter to make it
213 more accurate in most cases*/
214 #ifndef PRO_K_MULTIPLIER
215 # define PRO_K_MULTIPLIER 1.2
216 #endif
217 #include <algo/blast/core/blast_lookup.h>
218 #include <algo/blast/core/blast_options.h>
219
220 typedef Int4 ScoreRow[PRO_ALPHABET_SIZE];
221 extern Boolean LIBCALL
222 IMPALAPrintHelp PROTO((Boolean html, Int4 line_length, Char * programName,
223 FILE *outfp));
224 extern void LIBCALL
225 impalaMakeFileNames PROTO((Char * matrixDbName, Char * auxiliaryFileName,
226 Char * mmapFileName, Char * seqFileName,
227 Char *matrixFileName, Char * ckptFileName,
228 Char *directoryPrefix));
229
230 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
231
232 static Args myargs [] = {
233 { "Database for matrix profiles", /* 0 */
234 "stdin", NULL, NULL, FALSE, 'P', ARG_FILE_IN, 0.0, 0, NULL},
235 { "Print help; overrides all other arguments", /* 1 */
236 "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
237 { "Create RPS mem map file(s)", /* 2 */
238 "T", NULL, NULL, FALSE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
239 { "Threshold for extending hits for RPS database", /* 3 */
240 "11", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
241 { "Word size for RPS database", /* 4 */
242 "3", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
243 };
244
245 /*counts the number of items in sequencesFile and matricesFile, assumed to
246 be one per line, and checks that the numbers are equal.
247 returns the number if equal, 0 if unequal, rewinds the file descriptors
248 before returning*/
countProfiles(FILE * sequencesFile,FILE * matricesFile)249 static Int4 countProfiles(FILE *sequencesFile, FILE *matricesFile)
250 {
251 Int4 sequencesCount = 0; /*count for sequencesFile*/
252 Int4 matricesCount = 0; /*count for matricesFile*/
253 Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
254
255 while (fgets(oneFileName,MAXLINELEN,sequencesFile))
256 sequencesCount++;
257 while (fgets(oneFileName,MAXLINELEN,matricesFile))
258 matricesCount++;
259 rewind(matricesFile);
260 rewind(sequencesFile);
261 if (sequencesCount == matricesCount)
262 return(sequencesCount);
263 else {
264 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Sequences file has %d entries; Matrices file has %d entries; these should be equal\n", sequencesCount,matricesCount);
265 return(0);
266 }
267 }
268
269 /*free the memory associated with the position-specific score matrices*/
freeMatrix(ScoreRow * posMatrix)270 static void freeMatrix(ScoreRow *posMatrix)
271 {
272
273 MemFree(posMatrix);
274 }
275
276 /*allocate memory for the position-specific score matrices
277 enough memory is allocated to hold the largest matrix
278 the memory is reused for each different matrix*/
allocateMatrix(Int4 maxSequenceLength)279 static ScoreRow * allocateMatrix(Int4 maxSequenceLength)
280 {
281 ScoreRow *returnMatrix; /*matrix to return*/
282
283 returnMatrix = (ScoreRow *) MemNew(maxSequenceLength * sizeof(ScoreRow));
284 return(returnMatrix);
285 }
286
287 /* read in a position-specific score matrix from thisMatrixFile
288 the number of positions is dbSequenceLength
289 kbp keeps the Karlin-ALtschul parameters
290 returnMatrix is the memory address where the matrix is to be stored*/
readNextMatrix(FILE * thisMatrixFile,Int4 startPos,Int4 * endPos,ScoreRow * bigMatrix)291 static void readNextMatrix(FILE * thisMatrixFile,
292 Int4 startPos, Int4 *endPos,
293 ScoreRow *bigMatrix)
294 {
295 Int4 i, r; /*row indices for sequence and matrix*/
296 Int4 lengthInFile; /*length of query*/
297 Nlm_FloatHi junkLambda, junkK, junklogK, junkH; /*used to read in useless
298 Karlin blocks*/
299 Char *sequence; /*sequence to read in*/
300 Char rowOfScores[MAXLINELEN]; /*one row of scores to be read in*/
301
302 fscanf(thisMatrixFile, "%d", &lengthInFile);
303 sequence = (Char *) MemNew((lengthInFile + 2) * sizeof(Char));
304 fscanf(thisMatrixFile,"%s",sequence);
305 MemFree(sequence);
306 /*read in useless Karlin block*/
307 fscanf(thisMatrixFile,"%le", &junkLambda);
308 fscanf(thisMatrixFile,"%le", &junkK);
309 fscanf(thisMatrixFile,"%le", &junklogK);
310 fscanf(thisMatrixFile,"%le", &junkH);
311 /*read in useless Karlin block*/
312 fscanf(thisMatrixFile,"%le", &junkLambda);
313 fscanf(thisMatrixFile,"%le", &junkK);
314 fscanf(thisMatrixFile,"%le", &junklogK);
315 fscanf(thisMatrixFile,"%le", &junkH);
316 /*read in useless Karlin block*/
317 fscanf(thisMatrixFile,"%le", &junkLambda);
318 fscanf(thisMatrixFile,"%le", &junkK);
319 fscanf(thisMatrixFile,"%le", &junklogK);
320 fscanf(thisMatrixFile,"%le\n", &junkH);
321 for(i = 0, r = startPos; i < lengthInFile; i++, r++) {
322 fgets(rowOfScores, MAXLINELEN, thisMatrixFile);
323 sscanf(rowOfScores, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
324 &(bigMatrix[r][0]),
325 &(bigMatrix[r][1]),
326 &(bigMatrix[r][2]),
327 &(bigMatrix[r][3]),
328 &(bigMatrix[r][4]),
329 &(bigMatrix[r][5]),
330 &(bigMatrix[r][6]),
331 &(bigMatrix[r][7]),
332 &(bigMatrix[r][8]),
333 &(bigMatrix[r][9]),
334 &(bigMatrix[r][10]),
335 &(bigMatrix[r][11]),
336 &(bigMatrix[r][12]),
337 &(bigMatrix[r][13]),
338 &(bigMatrix[r][14]),
339 &(bigMatrix[r][15]),
340 &(bigMatrix[r][16]),
341 &(bigMatrix[r][17]),
342 &(bigMatrix[r][18]),
343 &(bigMatrix[r][19]),
344 &(bigMatrix[r][20]),
345 &(bigMatrix[r][21]),
346 &(bigMatrix[r][22]),
347 &(bigMatrix[r][23]),
348 &(bigMatrix[r][24]),
349 &(bigMatrix[r][25]));
350 }
351
352 if((Boolean) myargs[2].intvalue) {
353 /* Last row in the matrix will be gap-row (-INT2_MAX) */
354
355 for(i = 0; i < 26; i++) {
356 bigMatrix[r][i] = -INT2_MAX;
357 }
358 r++;
359 }
360
361 *endPos = r;
362 }
363
364 /*read each matrix in turn and store its scores in combinedMatrix*/
readAllMatrices(FILE * matrixnamefp,ScoreRow * combinedMatrix,Int4 numProfiles,CharPtr directoryPrefix,Int4Ptr seqlens)365 static void readAllMatrices(FILE *matrixnamefp, ScoreRow *combinedMatrix,
366 Int4 numProfiles, CharPtr directoryPrefix,
367 Int4Ptr seqlens)
368 {
369 Int4 i; /*loop index*/
370 Char oneMatrixFileName[MAXLINELEN]; /*name of matrix file to read*/
371 FILE *thisMatrixFile; /*descriptor for one matrix file*/
372 Int4 startPos; /*starting row in big matrix for next small matrix*/
373 Int4 endPos; /*ending row + 1 in big matrix for this small matrix*/
374 Int4 prefixLength; /*length of directoryPrefix*/
375 Int4 c1,c2; /*loop indices over characters*/
376 Char relativeMatrixFileName[MAXLINELEN];
377
378 startPos = 0;
379 endPos = 0;
380 if ('\0' != directoryPrefix[0]) {
381 strcpy(oneMatrixFileName, directoryPrefix);
382 prefixLength = strlen(directoryPrefix);
383 }
384 for (i = 0; i < numProfiles; i++) {
385 if ('\0' == directoryPrefix[0])
386 fscanf(matrixnamefp,"%s", oneMatrixFileName);
387 else {
388 fscanf(matrixnamefp,"%s", relativeMatrixFileName);
389 for(c1 = prefixLength, c2 = 0; relativeMatrixFileName[c2] != '\0';
390 c1++, c2++)
391 oneMatrixFileName[c1] = relativeMatrixFileName[c2];
392 oneMatrixFileName[c1] = '\0';
393 }
394
395 if ((thisMatrixFile = FileOpen(oneMatrixFileName, "r")) == NULL) {
396 ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open matrix file %s\n", oneMatrixFileName);
397 return;
398 }
399 readNextMatrix(thisMatrixFile, startPos, &endPos,
400 combinedMatrix);
401
402 if(seqlens != NULL) {
403 seqlens[i] = startPos;
404 }
405
406 startPos = endPos;
407 FileClose(thisMatrixFile);
408 }
409
410 if(seqlens != NULL) { /* Last entry - is the end of last sequence */
411 seqlens[i] = startPos;
412 }
413
414 return;
415 }
416
417 /*findTotalLength scans matrixAuxiliaryFile to find the
418 total number of positions among all the position-specific matrices*/
findTotalLength(FILE * matrixAuxiliaryFile,Int4 numProfiles,Nlm_FloatHiPtr scalingFactor)419 static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles,
420 Nlm_FloatHiPtr scalingFactor)
421 {
422 Int4 maxLength; /*maximum length of sequence*/
423 Int4 thisLength; /*length of next sequence*/
424 Int4 totalLength; /*total length to return*/
425 Int4 dbLength; /*length of database*/
426 Int4 i; /*loop index*/
427 Nlm_FloatHi Kungapped, Hungapped; /*two values to read*/
428 Char * underlyingMatrixName; /*name of matrix to read*/
429 Int4 gap_open, gap_extend; /*gap costs to skip over in reading*/
430
431 underlyingMatrixName = MemNew(MAXLINELEN * sizeof(Char));
432 fscanf(matrixAuxiliaryFile,"%s",underlyingMatrixName);
433 fscanf(matrixAuxiliaryFile,"%d\n", &gap_open);
434 fscanf(matrixAuxiliaryFile,"%d\n", &gap_extend);
435 fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
436 fscanf(matrixAuxiliaryFile, "%le", &Hungapped);
437 fscanf(matrixAuxiliaryFile, "%d", &maxLength);
438 fscanf(matrixAuxiliaryFile, "%d", &dbLength);
439 fscanf(matrixAuxiliaryFile, "%lf", scalingFactor);
440 totalLength = 0;
441 for (i = 0; i < numProfiles; i++) {
442 fscanf(matrixAuxiliaryFile, "%d", &thisLength);
443 fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
444 totalLength += thisLength;
445 }
446 rewind(matrixAuxiliaryFile);
447 MemFree(underlyingMatrixName);
448 return(totalLength);
449 }
450
RPSUpdateOffsets(BlastAaLookupTable * lookup)451 static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup)
452 {
453 Uint4 len;
454 Int4 index;
455 Int4 num_used;
456 Int4 offset_diff;
457 AaLookupBackboneCell *bbc;
458 Int4 *ovf;
459
460 len = lookup->backbone_size;
461 offset_diff = lookup->word_length - 1;
462
463 // database assumes backbone type of lookup table
464 ASSERT(lookup->bone_type == eBackbone);
465 bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
466 ovf = (Int4 *)(lookup->overflow);
467
468 /* Walk through table, copying info into mod_lt[] */
469 for(index = 0; index < len; index++) {
470
471 if((num_used=bbc[index].num_used) <= 3)
472 {
473 while (num_used > 0)
474 {
475 num_used--;
476 bbc[index].payload.entries[num_used] += offset_diff;
477 }
478 }
479 else
480 {
481 while (num_used > 0)
482 {
483 num_used--;
484 ovf[ bbc[index].payload.overflow_cursor + num_used] += offset_diff;
485 }
486 }
487 }
488 return TRUE;
489 }
490
491
492 /* #define RPS_THRESHOLD 11 */
493 /* #define RPS_WORDSIZE 3 */
494
495 /* -- SSH --
496 Updates absolute pointers of the lookup table to relative pointers -
497 pointers relative to the start of "mod_lookup_table_memory" chunk
498 RPS Blast will calculate real pointers in run time using these values
499 */
RPSUpdatePointers(BlastAaLookupTable * lookup,Uint4 * new_overflow,Uint4 * new_overflow_size)500 Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 *new_overflow_size)
501 {
502 Uint4 len;
503 Int4 index;
504 Uint4 *start_address;
505 long mlpp_address;
506 Uint4 *new_overflow_cursor;
507 Int4 *src;
508 Int4 first_hit;
509 AaLookupBackboneCell *bbc;
510 Int4 *ovf;
511
512 // database assumes backbone type of lookup table
513 ASSERT(lookup->bone_type == eBackbone);
514 bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
515 ovf = (Int4 *)(lookup->overflow);
516
517 len = lookup->backbone_size;
518
519 start_address = new_overflow_cursor = new_overflow;
520
521 /* Walk through table, copying info into mod_lt[] */
522 for(index = 0; index < len; index++) {
523
524 if(bbc[index].num_used <= 3)
525 continue;
526
527 src = &(ovf[bbc[index].payload.overflow_cursor]);
528 MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(bbc[index].num_used-1));
529
530 mlpp_address = (long) new_overflow_cursor;
531
532 new_overflow_cursor += bbc[index].num_used-1;
533 first_hit = src[0];
534
535 mlpp_address -= (long) start_address;
536
537 /* Now this is new relative address - usually small */
538 bbc[index].payload.entries[1] = (Int4) mlpp_address;
539 bbc[index].payload.entries[0] = first_hit;
540
541 }
542
543 *new_overflow_size = new_overflow_cursor - new_overflow;
544
545 return TRUE;
546 }
547
548 /* -- SSH --
549 Write lookup table to the disk into file "*.loo", which will be
550 used memory-mapped during RPS Blast search
551 */
RPSDumpLookupTable(BlastAaLookupTable * lookup,FILE * fd)552 Boolean RPSDumpLookupTable(BlastAaLookupTable *lookup, FILE *fd)
553 {
554 Uint4 *new_overflow;
555 Uint4 new_overflow_size;
556 AaLookupBackboneCell empty_cell;
557 Int4 index;
558
559 RPSUpdateOffsets(lookup);
560
561 new_overflow = malloc(lookup->overflow_size*sizeof(Uint4));
562 RPSUpdatePointers(lookup, new_overflow, &new_overflow_size);
563
564 FileWrite(lookup->thick_backbone, sizeof(AaLookupBackboneCell), lookup->backbone_size, fd);
565
566 /* write empty cells out to the thick backbone size that
567 RPS blast expects */
568
569 memset(&empty_cell, 0, sizeof(empty_cell));
570 for (index = lookup->backbone_size; index < RPS_ARRAY_SIZE + 1; index++)
571 FileWrite(&empty_cell, sizeof(empty_cell), 1, fd);
572
573 if(new_overflow_size)
574 FileWrite(new_overflow,
575 sizeof(Uint4),
576 new_overflow_size,
577 fd);
578
579 sfree(new_overflow);
580
581 return TRUE;
582 }
583
584 /* Copied verbatim from algo/blast/core/blast_traceback.c */
RPSPsiMatrixAttach(BlastScoreBlk * sbp,Int4 ** rps_pssm)585 void RPSPsiMatrixAttach(BlastScoreBlk* sbp, Int4** rps_pssm)
586 {
587 ASSERT(sbp);
588
589 /* Create a dummy PSI-BLAST matrix structure, only to then free it as we'd
590 * like to piggy back on the already created structure to use the gapped
591 * alignment routines */
592 sbp->psi_matrix = (SPsiBlastScoreMatrix*)
593 calloc(1, sizeof(SPsiBlastScoreMatrix));
594 ASSERT(sbp->psi_matrix);
595
596 sbp->psi_matrix->pssm = (SBlastScoreMatrix*)
597 calloc(1, sizeof(SBlastScoreMatrix));
598 ASSERT(sbp->psi_matrix->pssm);
599
600 /* The only data field that RPS-BLAST really needs */
601 sbp->psi_matrix->pssm->data = rps_pssm;
602 }
603
RPSPsiMatrixDetach(BlastScoreBlk * sbp)604 void RPSPsiMatrixDetach(BlastScoreBlk* sbp)
605 {
606 ASSERT(sbp);
607 sbp->psi_matrix->pssm->data = NULL;
608 sfree(sbp->psi_matrix->pssm);
609 sfree(sbp->psi_matrix);
610 }
611
612
613 /* -- SSH --
614 Create lookup table for the large sequence, that represented
615 by all collection of PSSM matrixes and dump this table to disk
616 Used by RPS Blast.
617 */
RPSCreateLookupFile(ScoreRow * combinedMatrix,Int4 numProfiles,Int4Ptr seqlens,CharPtr filename,Nlm_FloatHi scalingFactor)618 Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles,
619 Int4Ptr seqlens, CharPtr filename,
620 Nlm_FloatHi scalingFactor)
621 {
622 BlastScoreBlk *sbp;
623 FILE *fd;
624 Int4 **posMatrix;
625 Int4 start, i, header_size, all_length, magicNumber;
626 Int4Ptr offsets;
627 Int4 num_lookups;
628 BlastSeqLoc *lookup_segment=NULL;
629 BlastAaLookupTable *lookup;
630 LookupTableWrap* lookup_wrap_ptr=NULL;
631 LookupTableOptions* lookup_options;
632
633
634 if((fd = FileOpen(filename, "wb")) == NULL)
635 return FALSE;
636
637 num_lookups = 1; /* Single lookup table for all set */
638
639 all_length = seqlens[numProfiles] - seqlens[0];
640
641 posMatrix = MemNew((all_length + 1) * sizeof(Int4 *));
642 for (i = 0; i < all_length; i++) {
643 posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]);
644 }
645
646 /* Last row is necessary */
647 posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE);
648
649 for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
650 posMatrix[all_length][i] = -INT2_MAX;
651 }
652
653 sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1);
654 RPSPsiMatrixAttach(sbp, posMatrix);
655 LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options);
656 BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE,
657 (Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue);
658
659
660 BlastSeqLocNew(&lookup_segment, 0, all_length);
661
662 /* Need query for psi-blast?? where to put the PSSM? */
663 LookupTableWrapInit(NULL, lookup_options, NULL, lookup_segment, sbp, &lookup_wrap_ptr, NULL, NULL, NULL);
664
665 RPSPsiMatrixDetach(sbp);
666 sbp = BlastScoreBlkFree(sbp);
667 lookup_options = LookupTableOptionsFree(lookup_options);
668 lookup_segment = BlastSeqLocFree(lookup_segment);
669
670 lookup = (BlastAaLookupTable*) lookup_wrap_ptr->lut;
671
672 /* Only Uint4 maximum length for lookup file allowed in current
673 implementation */
674 header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4);
675
676 /* Beginning of file will be allocated for lookup offsets */
677 fseek(fd, header_size, SEEK_SET);
678
679 offsets = MemNew(sizeof(Int4) * (num_lookups + 1));
680
681
682 offsets[0] = ftell(fd);
683
684 start = seqlens[0]; /* 0 */
685
686 RPSDumpLookupTable(lookup, fd);
687
688 i = 1;
689
690 offsets[i] = ftell(fd); /* Last offset also recorded */
691
692 fseek(fd, 0, SEEK_SET);
693 magicNumber = RPS_MAGIC_NUMBER;
694 FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */
695 FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */
696 FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */
697 FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */
698 FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */
699
700 /* Now writing recorded offsets in the beginning of the file */
701
702 fseek(fd, 8*sizeof(Int4), SEEK_SET);
703 FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd);
704 FileClose(fd);
705
706 /* Final memory cleenup */
707
708 MemFree(posMatrix[all_length]);
709 MemFree(posMatrix);
710
711 return TRUE;
712 }
713
714 /* -- SSH --
715 Create file <database_name> (without extention), which is concatenation
716 of all FASTA files used. Used by RPS Blast.
717 */
RPSConcatSequences(FILE * sfp,CharPtr fastaname)718 Boolean RPSConcatSequences(FILE *sfp, CharPtr fastaname)
719 {
720 FILE *fasta_fp, *fd;
721 Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
722 Char buffer[1024];
723 Int4 bytes;
724 CharPtr chptr, last_non_space;
725
726 if((fasta_fp = FileOpen(fastaname, "w")) == NULL) {
727 ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
728 "Unable to open target fasta file %s: %s\n",
729 fastaname, strerror(errno));
730 return FALSE;
731 }
732
733 rewind(sfp);
734
735 while (fgets(oneFileName, MAXLINELEN, sfp)) {
736
737 /* Remove trailing whitespace */
738 last_non_space = NULL;
739 for(chptr = oneFileName; *chptr != NULLB; chptr++) {
740 if (!isspace(*chptr))
741 last_non_space = chptr;
742 }
743 if (last_non_space != NULL)
744 last_non_space[1] = NULLB;
745
746 if((fd = FileOpen(oneFileName, "r")) == NULL) {
747 ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
748 "Unable to open source fasta file %s: %s\n",
749 oneFileName, strerror(errno));
750 FileClose(fasta_fp);
751 return FALSE;
752 }
753
754 /* Now concatenating this file into set */
755 while((bytes = FileRead(buffer, 1, 1024, fd)) > 0)
756 FileWrite(buffer, 1, bytes, fasta_fp);
757 FileClose(fd);
758 }
759
760 FileClose(fasta_fp);
761
762 return TRUE;
763 }
764
Main(void)765 Int2 Main(void)
766
767 {
768
769 Char *profilesFileName; /*file name for list of profile file names*/
770 Char sequencesFileName[MAX_NAME_LENGTH]; /*file anme for list of sequence file names*/
771 Char matrixFileName[MAX_NAME_LENGTH]; /*file name for list of matrix file names*/
772 Char auxFileName[MAX_NAME_LENGTH]; /*file name for file containing auxiliary information*/
773 Char bigFileName[MAX_NAME_LENGTH]; /*file name to store byte-encoded coalesced matrix*/
774 Char lookupName[MAX_NAME_LENGTH]; /*file name to store precalculated lookup table */
775 FILE *auxiliaryfp; /*file descriptor for matrix auxiliary file*/
776 FILE *sequencesfp; /*files descriptor for file containing list of sequences*/
777 FILE *matrixnamefp; /*file descriptor for file containing matrix names*/
778 FILE *bigmatrixfile; /*file descriptor for file containing single big matrix*/
779 Int4 numProfiles; /*number of profiles*/
780 Int4 totalProfileLength; /*total length of all profiles*/
781 ScoreRow *combinedMatrix; /*combined matrix for all profiles*/
782 Char *directoryPrefix; /*directory where profile library is kept, used
783 to reach other directories indirectly*/
784
785 Int4Ptr seqlens;
786 Nlm_FloatHi scalingFactor; /*matrix scale to skip over in reading*/
787
788 if (! GetArgs ("copymatrices", NUMARG, myargs)) {
789 return (1);
790 }
791
792 if ((Boolean) myargs[1].intvalue) {
793 IMPALAPrintHelp(FALSE, 80, "copymat", stdout);
794 return(1);
795 }
796 profilesFileName = myargs[0].strvalue;
797 directoryPrefix = (Char *) MemNew(MAX_NAME_LENGTH *sizeof(char));
798 strcpy(directoryPrefix,profilesFileName);
799
800 impalaMakeFileNames(profilesFileName, auxFileName, bigFileName,
801 sequencesFileName, matrixFileName, NULL,
802 directoryPrefix);
803
804 if ((matrixnamefp = FileOpen(matrixFileName, "r")) == NULL) {
805 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with matrix file names %s\n", matrixFileName);
806 return (1);
807 }
808
809 if ((sequencesfp = FileOpen(sequencesFileName, "r")) == NULL) {
810 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with sequence file names %s\n", sequencesFileName);
811 return (1);
812 }
813
814 if ((auxiliaryfp = FileOpen(auxFileName, "r")) == NULL) {
815 ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open auxiliary file %s\n", auxFileName);
816 return (1);
817 }
818
819 /* -- SSH -- Name of matrix file depends on program - RPS or Impala */
820
821 if((Boolean) myargs[2].intvalue) {
822 sprintf(bigFileName, "%s.rps", profilesFileName);
823 }
824
825 if ((bigmatrixfile = FileOpen(bigFileName, "wb")) == NULL) {
826 ErrPostEx(SEV_FATAL, 1, 0, "rps-blast: Unable to open big matrix file %s\n", bigFileName);
827 return (1);
828 }
829
830 numProfiles = countProfiles(sequencesfp, matrixnamefp);
831 totalProfileLength = findTotalLength(auxiliaryfp, numProfiles,
832 &scalingFactor);
833
834 /* -- SSH -- Additional line in matrix with -INT2_MAX values */
835 if((Boolean) myargs[2].intvalue) {
836 totalProfileLength += numProfiles;
837 }
838
839 combinedMatrix = allocateMatrix(totalProfileLength);
840 if (NULL == combinedMatrix) {
841 ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to allocate matrix with%d rows\n", totalProfileLength);
842 return (1);
843
844 }
845 /* -- SSH -- RPS Blast data */
846 if ((Boolean) myargs[2].intvalue) {
847 seqlens = (Int4Ptr) MemNew((numProfiles +1) * sizeof(Int4));
848 } else {
849 seqlens = NULL;
850 }
851
852 readAllMatrices(matrixnamefp, combinedMatrix, numProfiles,
853 directoryPrefix, seqlens);
854
855 /* -- SSH -- For RPS Blast additional info will be added to the file */
856 if ((Boolean) myargs[2].intvalue) {
857 Int4 magicNumber = RPS_MAGIC_NUMBER;
858 FileWrite(&magicNumber, sizeof(Int4), 1, bigmatrixfile);
859 FileWrite(&numProfiles, sizeof(Int4), 1, bigmatrixfile);
860 FileWrite(seqlens, sizeof(Int4), numProfiles + 1, bigmatrixfile);
861
862 sprintf(lookupName, "%s.loo", profilesFileName);
863 RPSCreateLookupFile(combinedMatrix, numProfiles, seqlens, lookupName,
864 scalingFactor);
865
866 if(!RPSConcatSequences(sequencesfp, profilesFileName)) {
867 ErrPostEx(SEV_ERROR, 0,0, "Failure to concatenate sequences");
868 return 1;
869 }
870
871 }
872
873 FileWrite((void *) combinedMatrix[0], sizeof(ScoreRow),
874 (size_t) totalProfileLength, bigmatrixfile);
875 freeMatrix(combinedMatrix);
876 FileClose(bigmatrixfile);
877 FileClose(matrixnamefp);
878 FileClose(sequencesfp);
879 FileClose(auxiliaryfp);
880 return 0;
881 }
882