1 static char const rcsid[] = "$Id: copymat.c,v 6.50 2016/06/21 14:55:29 madden Exp $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 */
28 
29 /*****************************************************************************
30 
31 File name: copymat.c
32 
33 Authors: Alejandro Schaffer, Sergei Shavirin
34 
35 Contents: main routines for copymatrices program to convert
36 score matrices output by makematrices into a single byte-encoded file.
37 
38 $Log: copymat.c,v $
39 Revision 6.50  2016/06/21 14:55:29  madden
40 Add one more argument for jumper changes, JIRA SB-1713
41 
42 Revision 6.49  2008/11/04 16:44:38  maning
43 add type cast to fix compilation error
44 
45 Revision 6.48  2008/02/01 14:04:25  madden
46 LookupTableWrapInit prototype change
47 
48 Revision 6.47  2006/11/24 19:06:15  kans
49 added include of blast_filter.h
50 
51 Revision 6.46  2006/11/21 17:24:20  papadopo
52 1. rearrange headers
53 2. change lookup table type
54 
55 Revision 6.45  2006/09/15 15:45:35  madden
56 Change to LookupTableWrapInit
57 
58 Revision 6.44  2005/12/22 14:22:19  papadopo
59 change signature of BLAST_FillLookupTableOptions
60 
61 Revision 6.43  2005/12/20 15:36:39  papadopo
62 change name of structure field
63 
64 Revision 6.42  2005/05/20 18:57:51  camacho
65 Update to use new signature to BLAST_FillLookupTableOptions
66 
67 Revision 6.41  2005/02/14 14:11:55  camacho
68 Changes to use SBlastScoreMatrix
69 
70 Revision 6.40  2005/01/10 13:48:20  madden
71 Change to BLAST_FillInitialWordOptions prototype
72 
73 Revision 6.39  2004/09/15 17:40:21  papadopo
74 change use of ListNode to use of BlastSeqLoc for lookup table creation
75 
76 Revision 6.38  2004/07/12 16:30:44  papadopo
77 LookupTable->BlastLookupTable
78 
79 Revision 6.37  2004/06/22 16:45:56  camacho
80 Changed the blast_type_* definitions for the EBlastProgramType enumeration from
81 algo/blast.
82 
83 Revision 6.36  2004/04/23 21:11:31  papadopo
84 force the thick backbone (dumped to the RPS .loo file) to contain the number of cells assumed by RPS blast
85 
86 Revision 6.35  2004/04/16 14:48:04  papadopo
87 remove unneeded argument to FillLookupTableOptions
88 
89 Revision 6.34  2004/04/07 21:48:48  camacho
90 Add missing header file
91 
92 Revision 6.33  2004/04/06 12:15:44  camacho
93 Rename DoubleInt -> SSeqRange
94 
95 Revision 6.32  2004/03/10 20:21:27  papadopo
96 add (unused) RPS blast parameters to FillLookupTableOptions
97 
98 Revision 6.31  2004/03/04 21:16:10  papadopo
99 add (unused) RPS blast parameter to FillLookupTable call
100 
101 Revision 6.30  2004/01/30 20:34:45  coulouri
102 fix minor nit to FileWrite call
103 
104 Revision 6.29  2004/01/26 19:40:48  coulouri
105 * Correct buffer overrun
106 * Use offset rather than pointer in LookupBackboneCell
107 
108 Revision 6.28  2003/11/24 18:18:47  coulouri
109 Correction to previous fix for 64-bit irix
110 
111 Revision 6.27  2003/11/21 18:01:15  ivanov
112 Added extern definition for impalaMakeFileNames()
113 
114 Revision 6.26  2003/11/20 15:44:32  camacho
115 Tom Madden's changes to use lookup table contruction code from algo/blast.
116 
117 Revision 6.25  2003/05/30 17:31:09  coulouri
118 add rcsid
119 
120 Revision 6.24  2003/05/13 16:02:42  coulouri
121 make ErrPostEx(SEV_FATAL, ...) exit with nonzero status
122 
123 Revision 6.23  2002/11/06 21:26:47  ucko
124 RPSConcatSequences: provide useful error messages, ignore all trailing space.
125 
126 Revision 6.22  2002/04/08 19:02:31  madden
127 Allow float for threshold
128 
129 Revision 6.21  2001/06/07 16:45:08  shavirin
130 Removed bug related to 64bit address structure on SGI platform.
131 
132 Revision 6.20  2001/04/12 19:50:12  madden
133 Comment out unrescaling of matrix
134 
135 Revision 6.19  2000/11/14 23:17:52  shavirin
136 Removed serious bug under NT platform related to diffence in "w" and "wb"
137 flag when opening file on PC NT computer. Removed unused header files.
138 
139 Revision 6.18  2000/11/13 21:25:22  shavirin
140 Fixed possible bug in the function RPSUpdatePointers (64 bit architecture
141 specific).
142 
143 Revision 6.17  2000/11/08 18:34:19  kans
144 commented out UNIX-specific headers, included by ncbilcl.h for UNIX anyway
145 
146 Revision 6.16  2000/10/20 21:46:37  shavirin
147 Added additional parameters for creating RPS database.
148 
149 Revision 6.15  2000/02/29 16:27:39  shavirin
150 Added protection against matrix with scaleFactor != 1 for RPS Blast
151 
152 Revision 6.14  2000/02/28 21:08:34  shavirin
153 This fixes DEC Alpha problems of RPS Blast.
154 
155 Revision 6.13  2000/02/28 19:06:47  shavirin
156 Added comments for RPS Blast functions.
157 Removed unused code.
158 
159 Revision 6.12  2000/02/22 19:29:06  shavirin
160 Fixed DEC Alpha specific bug in the function RPSCreateLookupFile().
161 
162 Revision 6.11  2000/02/17 19:11:15  shavirin
163 Removed reference to theCacheSize.
164 
165 Revision 6.10  2000/01/13 15:27:10  shavirin
166 Added concatenation of files into single file (for later formatdb).
167 
168 Revision 6.9  2000/01/12 14:39:46  shavirin
169 Added parameter to set cache size in lookup table foe RPS Blast.
170 
171 Revision 6.8  2000/01/07 22:31:47  shavirin
172 Lookup table header now has notice, that this is single table.
173 
174 Revision 6.7  1999/12/30 18:34:20  shavirin
175 Last row in the matrix for every sequence will be gap-row (-INT2_MAX)
176 
177 Revision 6.6  1999/12/29 18:49:29  shavirin
178 Changed a little format of RPS lookup tables file.
179 
180 
181 *****************************************************************************/
182 
183 
184 #include <ncbi.h>
185 #include <sequtil.h>
186 #include <seqport.h>
187 #include <tofasta.h>
188 #include <algo/blast/core/blast_aalookup.h>
189 #include <algo/blast/core/blast_stat.h>
190 #include <algo/blast/core/blast_encoding.h>
191 #include <algo/blast/core/lookup_wrap.h>
192 #include <algo/blast/core/blast_filter.h>
193 
194 #ifndef MAXLINELEN
195 #   define MAXLINELEN 2000
196 #endif
197 #ifndef MAX_NAME_LENGTH
198 #   define MAX_NAME_LENGTH 500
199 #endif
200 #ifndef PRO_ALPHABET_SIZE
201 #   define PRO_ALPHABET_SIZE  26
202 #endif
203 #ifndef SORT_THRESHOLD
204 #   define SORT_THRESHOLD 20
205 #endif
206 #ifndef RPS_MAGIC_NUMBER
207 #   define RPS_MAGIC_NUMBER 7702
208 #endif
209 #ifndef RPS_ARRAY_SIZE
210 #   define RPS_ARRAY_SIZE 32768
211 #endif
212 /*factor used to multiply the gapped K parameter to make it
213   more accurate in most cases*/
214 #ifndef PRO_K_MULTIPLIER
215 #   define PRO_K_MULTIPLIER 1.2
216 #endif
217 #include <algo/blast/core/blast_lookup.h>
218 #include <algo/blast/core/blast_options.h>
219 
220 typedef Int4 ScoreRow[PRO_ALPHABET_SIZE];
221 extern Boolean LIBCALL
222 IMPALAPrintHelp PROTO((Boolean html, Int4 line_length, Char * programName,
223                        FILE *outfp));
224 extern void  LIBCALL
225 impalaMakeFileNames PROTO((Char * matrixDbName, Char * auxiliaryFileName,
226                            Char * mmapFileName, Char * seqFileName,
227                            Char *matrixFileName, Char * ckptFileName,
228                            Char *directoryPrefix));
229 
230 #define NUMARG (sizeof(myargs)/sizeof(myargs[0]))
231 
232 static Args myargs [] = {
233     { "Database for matrix profiles", /* 0 */
234       "stdin", NULL, NULL, FALSE, 'P', ARG_FILE_IN, 0.0, 0, NULL},
235     { "Print help; overrides all other arguments", /* 1 */
236       "F", NULL, NULL, FALSE, 'H', ARG_BOOLEAN, 0.0, 0, NULL},
237     { "Create RPS mem map file(s)", /* 2 */
238       "T", NULL, NULL, FALSE, 'r', ARG_BOOLEAN, 0.0, 0, NULL},
239     { "Threshold for extending hits for RPS database", /* 3 */
240       "11", NULL, NULL, FALSE, 'f', ARG_FLOAT, 0.0, 0, NULL},
241     { "Word size for RPS database", /* 4 */
242       "3", NULL, NULL, FALSE, 'W', ARG_INT, 0.0, 0, NULL},
243 };
244 
245 /*counts the number of items in sequencesFile and matricesFile, assumed to
246   be one per line, and checks that the numbers are equal.
247   returns the number if equal, 0 if unequal, rewinds the file descriptors
248   before returning*/
countProfiles(FILE * sequencesFile,FILE * matricesFile)249 static Int4 countProfiles(FILE *sequencesFile, FILE *matricesFile)
250 {
251     Int4 sequencesCount = 0; /*count for sequencesFile*/
252     Int4 matricesCount = 0; /*count for matricesFile*/
253     Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
254 
255     while (fgets(oneFileName,MAXLINELEN,sequencesFile))
256         sequencesCount++;
257     while (fgets(oneFileName,MAXLINELEN,matricesFile))
258         matricesCount++;
259     rewind(matricesFile);
260     rewind(sequencesFile);
261     if (sequencesCount == matricesCount)
262         return(sequencesCount);
263     else {
264         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Sequences file has %d entries; Matrices file has %d entries; these should be equal\n", sequencesCount,matricesCount);
265         return(0);
266     }
267 }
268 
269 /*free the memory associated with the position-specific score matrices*/
freeMatrix(ScoreRow * posMatrix)270 static void  freeMatrix(ScoreRow *posMatrix)
271 {
272 
273   MemFree(posMatrix);
274 }
275 
276 /*allocate memory for the position-specific score matrices
277   enough memory is allocated to hold the largest matrix
278   the memory is reused for each different matrix*/
allocateMatrix(Int4 maxSequenceLength)279 static ScoreRow * allocateMatrix(Int4 maxSequenceLength)
280 {
281   ScoreRow *returnMatrix; /*matrix to return*/
282 
283   returnMatrix = (ScoreRow *) MemNew(maxSequenceLength * sizeof(ScoreRow));
284   return(returnMatrix);
285 }
286 
287 /* read in a position-specific score matrix from thisMatrixFile
288    the number of positions is dbSequenceLength
289    kbp keeps the Karlin-ALtschul parameters
290    returnMatrix is the memory address where the matrix is to be stored*/
readNextMatrix(FILE * thisMatrixFile,Int4 startPos,Int4 * endPos,ScoreRow * bigMatrix)291 static void readNextMatrix(FILE * thisMatrixFile,
292               Int4 startPos, Int4 *endPos,
293               ScoreRow *bigMatrix)
294 {
295   Int4 i, r; /*row indices for sequence and matrix*/
296   Int4 lengthInFile; /*length of query*/
297   Nlm_FloatHi junkLambda, junkK, junklogK, junkH; /*used to read in useless
298 						    Karlin blocks*/
299   Char *sequence;  /*sequence to read in*/
300   Char rowOfScores[MAXLINELEN]; /*one row of scores to be read in*/
301 
302   fscanf(thisMatrixFile, "%d", &lengthInFile);
303   sequence = (Char *) MemNew((lengthInFile + 2) * sizeof(Char));
304   fscanf(thisMatrixFile,"%s",sequence);
305   MemFree(sequence);
306   /*read in useless Karlin block*/
307   fscanf(thisMatrixFile,"%le", &junkLambda);
308   fscanf(thisMatrixFile,"%le", &junkK);
309   fscanf(thisMatrixFile,"%le", &junklogK);
310   fscanf(thisMatrixFile,"%le", &junkH);
311   /*read in useless Karlin block*/
312   fscanf(thisMatrixFile,"%le", &junkLambda);
313   fscanf(thisMatrixFile,"%le", &junkK);
314   fscanf(thisMatrixFile,"%le", &junklogK);
315   fscanf(thisMatrixFile,"%le", &junkH);
316   /*read in useless Karlin block*/
317   fscanf(thisMatrixFile,"%le", &junkLambda);
318   fscanf(thisMatrixFile,"%le", &junkK);
319   fscanf(thisMatrixFile,"%le", &junklogK);
320   fscanf(thisMatrixFile,"%le\n", &junkH);
321   for(i = 0, r = startPos; i < lengthInFile; i++, r++) {
322     fgets(rowOfScores, MAXLINELEN, thisMatrixFile);
323     sscanf(rowOfScores, "%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
324                         &(bigMatrix[r][0]),
325                         &(bigMatrix[r][1]),
326                         &(bigMatrix[r][2]),
327                         &(bigMatrix[r][3]),
328                         &(bigMatrix[r][4]),
329                         &(bigMatrix[r][5]),
330                         &(bigMatrix[r][6]),
331                         &(bigMatrix[r][7]),
332                         &(bigMatrix[r][8]),
333                         &(bigMatrix[r][9]),
334                         &(bigMatrix[r][10]),
335                         &(bigMatrix[r][11]),
336                         &(bigMatrix[r][12]),
337                         &(bigMatrix[r][13]),
338                         &(bigMatrix[r][14]),
339                         &(bigMatrix[r][15]),
340                         &(bigMatrix[r][16]),
341                         &(bigMatrix[r][17]),
342                         &(bigMatrix[r][18]),
343                         &(bigMatrix[r][19]),
344                         &(bigMatrix[r][20]),
345                         &(bigMatrix[r][21]),
346                         &(bigMatrix[r][22]),
347                         &(bigMatrix[r][23]),
348                         &(bigMatrix[r][24]),
349                         &(bigMatrix[r][25]));
350   }
351 
352   if((Boolean) myargs[2].intvalue) {
353       /* Last row in the matrix will be gap-row (-INT2_MAX) */
354 
355       for(i = 0; i < 26; i++) {
356           bigMatrix[r][i] = -INT2_MAX;
357       }
358       r++;
359   }
360 
361   *endPos = r;
362 }
363 
364 /*read each matrix in turn and store its scores in combinedMatrix*/
readAllMatrices(FILE * matrixnamefp,ScoreRow * combinedMatrix,Int4 numProfiles,CharPtr directoryPrefix,Int4Ptr seqlens)365 static void readAllMatrices(FILE *matrixnamefp, ScoreRow *combinedMatrix,
366                             Int4 numProfiles, CharPtr directoryPrefix,
367                             Int4Ptr seqlens)
368 {
369     Int4 i; /*loop index*/
370     Char oneMatrixFileName[MAXLINELEN]; /*name of matrix file to read*/
371     FILE *thisMatrixFile; /*descriptor for one matrix file*/
372     Int4 startPos; /*starting row in big matrix for next small matrix*/
373     Int4 endPos; /*ending row + 1 in big matrix for this small matrix*/
374     Int4 prefixLength; /*length of directoryPrefix*/
375     Int4 c1,c2; /*loop indices over characters*/
376     Char relativeMatrixFileName[MAXLINELEN];
377 
378     startPos = 0;
379     endPos = 0;
380     if ('\0' != directoryPrefix[0]) {
381         strcpy(oneMatrixFileName, directoryPrefix);
382         prefixLength = strlen(directoryPrefix);
383     }
384     for (i = 0; i < numProfiles; i++) {
385         if ('\0' == directoryPrefix[0])
386             fscanf(matrixnamefp,"%s", oneMatrixFileName);
387         else {
388             fscanf(matrixnamefp,"%s", relativeMatrixFileName);
389             for(c1 = prefixLength, c2 = 0; relativeMatrixFileName[c2] != '\0';
390                 c1++, c2++)
391                 oneMatrixFileName[c1] = relativeMatrixFileName[c2];
392             oneMatrixFileName[c1] = '\0';
393         }
394 
395         if ((thisMatrixFile = FileOpen(oneMatrixFileName, "r")) == NULL)  {
396             ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open matrix file %s\n", oneMatrixFileName);
397             return;
398         }
399         readNextMatrix(thisMatrixFile, startPos, &endPos,
400                        combinedMatrix);
401 
402         if(seqlens != NULL) {
403             seqlens[i] = startPos;
404         }
405 
406         startPos = endPos;
407         FileClose(thisMatrixFile);
408     }
409 
410     if(seqlens != NULL) {   /* Last entry - is the end of last sequence */
411         seqlens[i] = startPos;
412     }
413 
414     return;
415 }
416 
417 /*findTotalLength scans matrixAuxiliaryFile to find the
418   total  number of positions among all the position-specific matrices*/
findTotalLength(FILE * matrixAuxiliaryFile,Int4 numProfiles,Nlm_FloatHiPtr scalingFactor)419 static Int4 findTotalLength(FILE *matrixAuxiliaryFile, Int4 numProfiles,
420                             Nlm_FloatHiPtr scalingFactor)
421 {
422     Int4 maxLength; /*maximum length of sequence*/
423     Int4 thisLength; /*length of next sequence*/
424     Int4 totalLength; /*total length to return*/
425     Int4 dbLength; /*length of database*/
426     Int4 i; /*loop index*/
427     Nlm_FloatHi Kungapped, Hungapped; /*two values to read*/
428     Char * underlyingMatrixName; /*name of matrix to read*/
429     Int4 gap_open, gap_extend; /*gap costs to skip over in reading*/
430 
431     underlyingMatrixName = MemNew(MAXLINELEN * sizeof(Char));
432     fscanf(matrixAuxiliaryFile,"%s",underlyingMatrixName);
433     fscanf(matrixAuxiliaryFile,"%d\n", &gap_open);
434     fscanf(matrixAuxiliaryFile,"%d\n", &gap_extend);
435     fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
436     fscanf(matrixAuxiliaryFile, "%le", &Hungapped);
437     fscanf(matrixAuxiliaryFile, "%d", &maxLength);
438     fscanf(matrixAuxiliaryFile, "%d", &dbLength);
439     fscanf(matrixAuxiliaryFile, "%lf", scalingFactor);
440     totalLength = 0;
441     for (i = 0; i < numProfiles; i++) {
442         fscanf(matrixAuxiliaryFile, "%d", &thisLength);
443         fscanf(matrixAuxiliaryFile, "%le", &Kungapped);
444         totalLength += thisLength;
445     }
446     rewind(matrixAuxiliaryFile);
447     MemFree(underlyingMatrixName);
448     return(totalLength);
449 }
450 
RPSUpdateOffsets(BlastAaLookupTable * lookup)451 static Boolean RPSUpdateOffsets(BlastAaLookupTable *lookup)
452 {
453     Uint4 len;
454     Int4 index;
455     Int4 num_used;
456     Int4 offset_diff;
457     AaLookupBackboneCell *bbc;
458     Int4 *ovf;
459 
460     len = lookup->backbone_size;
461     offset_diff = lookup->word_length - 1;
462 
463     // database assumes backbone type of lookup table
464     ASSERT(lookup->bone_type == eBackbone);
465     bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
466     ovf = (Int4 *)(lookup->overflow);
467 
468     /* Walk through table, copying info into mod_lt[] */
469     for(index = 0; index < len; index++) {
470 
471         if((num_used=bbc[index].num_used) <= 3)
472         {
473             while (num_used > 0)
474             {
475                 num_used--;
476             	bbc[index].payload.entries[num_used] += offset_diff;
477             }
478         }
479         else
480         {
481             while (num_used > 0)
482             {
483                  num_used--;
484                  ovf[ bbc[index].payload.overflow_cursor + num_used] += offset_diff;
485             }
486         }
487     }
488     return TRUE;
489 }
490 
491 
492 /* #define RPS_THRESHOLD 11 */
493 /* #define RPS_WORDSIZE  3 */
494 
495 /* -- SSH --
496    Updates absolute pointers of the lookup table to relative pointers -
497    pointers relative to the start of "mod_lookup_table_memory" chunk
498    RPS Blast will calculate real pointers in run time using these values
499 */
RPSUpdatePointers(BlastAaLookupTable * lookup,Uint4 * new_overflow,Uint4 * new_overflow_size)500 Boolean RPSUpdatePointers(BlastAaLookupTable *lookup, Uint4 *new_overflow, Uint4 *new_overflow_size)
501 {
502     Uint4 len;
503     Int4 index;
504     Uint4 *start_address;
505     long mlpp_address;
506     Uint4 *new_overflow_cursor;
507     Int4 *src;
508     Int4 first_hit;
509     AaLookupBackboneCell *bbc;
510     Int4 *ovf;
511 
512     // database assumes backbone type of lookup table
513     ASSERT(lookup->bone_type == eBackbone);
514     bbc = (AaLookupBackboneCell *)(lookup->thick_backbone);
515     ovf = (Int4 *)(lookup->overflow);
516 
517     len = lookup->backbone_size;
518 
519     start_address = new_overflow_cursor = new_overflow;
520 
521     /* Walk through table, copying info into mod_lt[] */
522     for(index = 0; index < len; index++) {
523 
524         if(bbc[index].num_used <= 3)
525             continue;
526 
527         src = &(ovf[bbc[index].payload.overflow_cursor]);
528         MemCpy(new_overflow_cursor, &src[1], sizeof(Uint4)*(bbc[index].num_used-1));
529 
530         mlpp_address = (long) new_overflow_cursor;
531 
532 	new_overflow_cursor += bbc[index].num_used-1;
533 	first_hit = src[0];
534 
535         mlpp_address -= (long) start_address;
536 
537         /* Now this is new relative address - usually small  */
538         bbc[index].payload.entries[1] = (Int4) mlpp_address;
539         bbc[index].payload.entries[0] = first_hit;
540 
541     }
542 
543     *new_overflow_size = new_overflow_cursor - new_overflow;
544 
545     return TRUE;
546 }
547 
548 /* -- SSH --
549    Write lookup table to the disk into file "*.loo", which will be
550    used memory-mapped during RPS Blast search
551 */
RPSDumpLookupTable(BlastAaLookupTable * lookup,FILE * fd)552 Boolean RPSDumpLookupTable(BlastAaLookupTable *lookup, FILE *fd)
553 {
554     Uint4 *new_overflow;
555     Uint4 new_overflow_size;
556     AaLookupBackboneCell empty_cell;
557     Int4 index;
558 
559     RPSUpdateOffsets(lookup);
560 
561     new_overflow = malloc(lookup->overflow_size*sizeof(Uint4));
562     RPSUpdatePointers(lookup, new_overflow, &new_overflow_size);
563 
564     FileWrite(lookup->thick_backbone, sizeof(AaLookupBackboneCell), lookup->backbone_size, fd);
565 
566     /* write empty cells out to the thick backbone size that
567        RPS blast expects */
568 
569     memset(&empty_cell, 0, sizeof(empty_cell));
570     for (index = lookup->backbone_size; index < RPS_ARRAY_SIZE + 1; index++)
571         FileWrite(&empty_cell, sizeof(empty_cell), 1, fd);
572 
573     if(new_overflow_size)
574         FileWrite(new_overflow,
575                   sizeof(Uint4),
576                   new_overflow_size,
577                   fd);
578 
579     sfree(new_overflow);
580 
581     return TRUE;
582 }
583 
584 /* Copied verbatim from algo/blast/core/blast_traceback.c */
RPSPsiMatrixAttach(BlastScoreBlk * sbp,Int4 ** rps_pssm)585 void RPSPsiMatrixAttach(BlastScoreBlk* sbp, Int4** rps_pssm)
586 {
587     ASSERT(sbp);
588 
589     /* Create a dummy PSI-BLAST matrix structure, only to then free it as we'd
590      * like to piggy back on the already created structure to use the gapped
591      * alignment routines */
592     sbp->psi_matrix = (SPsiBlastScoreMatrix*)
593         calloc(1, sizeof(SPsiBlastScoreMatrix));
594     ASSERT(sbp->psi_matrix);
595 
596     sbp->psi_matrix->pssm = (SBlastScoreMatrix*)
597         calloc(1, sizeof(SBlastScoreMatrix));
598     ASSERT(sbp->psi_matrix->pssm);
599 
600     /* The only data field that RPS-BLAST really needs */
601     sbp->psi_matrix->pssm->data = rps_pssm;
602 }
603 
RPSPsiMatrixDetach(BlastScoreBlk * sbp)604 void RPSPsiMatrixDetach(BlastScoreBlk* sbp)
605 {
606     ASSERT(sbp);
607     sbp->psi_matrix->pssm->data = NULL;
608     sfree(sbp->psi_matrix->pssm);
609     sfree(sbp->psi_matrix);
610 }
611 
612 
613 /* -- SSH --
614    Create lookup table for the large sequence, that represented
615    by all collection of PSSM matrixes and dump this table to disk
616    Used by RPS Blast.
617 */
RPSCreateLookupFile(ScoreRow * combinedMatrix,Int4 numProfiles,Int4Ptr seqlens,CharPtr filename,Nlm_FloatHi scalingFactor)618 Boolean RPSCreateLookupFile(ScoreRow *combinedMatrix, Int4 numProfiles,
619                             Int4Ptr seqlens, CharPtr filename,
620                             Nlm_FloatHi scalingFactor)
621 {
622     BlastScoreBlk *sbp;
623     FILE *fd;
624     Int4  **posMatrix;
625     Int4 start, i, header_size, all_length, magicNumber;
626     Int4Ptr offsets;
627     Int4 num_lookups;
628     BlastSeqLoc *lookup_segment=NULL;
629     BlastAaLookupTable *lookup;
630     LookupTableWrap* lookup_wrap_ptr=NULL;
631     LookupTableOptions* lookup_options;
632 
633 
634     if((fd = FileOpen(filename, "wb")) == NULL)
635         return FALSE;
636 
637     num_lookups = 1; /* Single lookup table for all set */
638 
639     all_length = seqlens[numProfiles] - seqlens[0];
640 
641     posMatrix = MemNew((all_length + 1) * sizeof(Int4 *));
642     for (i = 0; i < all_length; i++) {
643         posMatrix[i] = (Int4 *) &(combinedMatrix[i][0]);
644     }
645 
646     /* Last row is necessary */
647     posMatrix[all_length] = MemNew(sizeof(Int4) * PRO_ALPHABET_SIZE);
648 
649     for(i = 0; i < PRO_ALPHABET_SIZE; i++) {
650         posMatrix[all_length][i] = -INT2_MAX;
651     }
652 
653     sbp = BlastScoreBlkNew(BLASTAA_SEQ_CODE, 1);
654     RPSPsiMatrixAttach(sbp, posMatrix);
655     LookupTableOptionsNew(eBlastTypeBlastp, &lookup_options);
656     BLAST_FillLookupTableOptions(lookup_options, eBlastTypePsiBlast, FALSE,
657 	(Int4) (myargs[3].floatvalue*scalingFactor), myargs[4].intvalue);
658 
659 
660     BlastSeqLocNew(&lookup_segment, 0, all_length);
661 
662     /* Need query for psi-blast??  where to put the PSSM? */
663     LookupTableWrapInit(NULL, lookup_options, NULL, lookup_segment, sbp, &lookup_wrap_ptr, NULL, NULL, NULL);
664 
665     RPSPsiMatrixDetach(sbp);
666     sbp = BlastScoreBlkFree(sbp);
667     lookup_options = LookupTableOptionsFree(lookup_options);
668     lookup_segment = BlastSeqLocFree(lookup_segment);
669 
670     lookup = (BlastAaLookupTable*) lookup_wrap_ptr->lut;
671 
672     /* Only Uint4 maximum length for lookup file allowed in current
673        implementation */
674     header_size = (numProfiles+1)*sizeof(Int4) + 8*sizeof(Int4);
675 
676     /* Beginning of file will be allocated for lookup offsets */
677     fseek(fd, header_size, SEEK_SET);
678 
679     offsets = MemNew(sizeof(Int4) * (num_lookups + 1));
680 
681 
682     offsets[0] = ftell(fd);
683 
684     start = seqlens[0]; /* 0 */
685 
686     RPSDumpLookupTable(lookup, fd);
687 
688     i = 1;
689 
690     offsets[i] = ftell(fd); /* Last offset also recorded */
691 
692     fseek(fd, 0, SEEK_SET);
693     magicNumber = RPS_MAGIC_NUMBER;
694     FileWrite(&magicNumber, sizeof(Int4), 1, fd); /* header[0] */
695     FileWrite(&num_lookups, sizeof(Int4), 1, fd); /* header[1] */
696     FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[2] */
697     FileWrite(&lookup->neighbor_matches, sizeof(Int4), 1, fd); /* header[3] */
698     FileWrite(&lookup->overflow_size, sizeof(Int4), 1, fd); /* header[4] */
699 
700     /* Now writing recorded offsets in the beginning of the file */
701 
702     fseek(fd, 8*sizeof(Int4), SEEK_SET);
703     FileWrite(offsets, sizeof(Int4), num_lookups + 1, fd);
704     FileClose(fd);
705 
706     /* Final memory cleenup */
707 
708     MemFree(posMatrix[all_length]);
709     MemFree(posMatrix);
710 
711     return TRUE;
712 }
713 
714 /* -- SSH --
715    Create file <database_name> (without extention), which is concatenation
716    of all FASTA files used. Used by RPS Blast.
717 */
RPSConcatSequences(FILE * sfp,CharPtr fastaname)718 Boolean RPSConcatSequences(FILE *sfp, CharPtr fastaname)
719 {
720     FILE *fasta_fp, *fd;
721     Char oneFileName[MAXLINELEN]; /*for reading one line per file*/
722     Char buffer[1024];
723     Int4 bytes;
724     CharPtr chptr, last_non_space;
725 
726     if((fasta_fp = FileOpen(fastaname, "w")) == NULL) {
727         ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
728                   "Unable to open target fasta file %s: %s\n",
729                   fastaname, strerror(errno));
730         return FALSE;
731     }
732 
733     rewind(sfp);
734 
735     while (fgets(oneFileName, MAXLINELEN, sfp)) {
736 
737         /* Remove trailing whitespace */
738         last_non_space = NULL;
739         for(chptr = oneFileName; *chptr != NULLB; chptr++) {
740             if (!isspace(*chptr))
741                 last_non_space = chptr;
742         }
743         if (last_non_space != NULL)
744             last_non_space[1] = NULLB;
745 
746         if((fd = FileOpen(oneFileName, "r")) == NULL) {
747             ErrPostEx(SEV_FATAL, 1, 0, "concatenate sequences: "
748                       "Unable to open source fasta file %s: %s\n",
749                       oneFileName, strerror(errno));
750             FileClose(fasta_fp);
751             return FALSE;
752         }
753 
754         /* Now concatenating this file into set */
755         while((bytes = FileRead(buffer, 1, 1024, fd)) > 0)
756             FileWrite(buffer, 1, bytes, fasta_fp);
757         FileClose(fd);
758     }
759 
760     FileClose(fasta_fp);
761 
762     return TRUE;
763 }
764 
Main(void)765 Int2  Main(void)
766 
767 {
768 
769     Char *profilesFileName; /*file name for list of profile file names*/
770     Char sequencesFileName[MAX_NAME_LENGTH]; /*file anme for list of sequence file names*/
771     Char matrixFileName[MAX_NAME_LENGTH]; /*file name for list of matrix file names*/
772     Char auxFileName[MAX_NAME_LENGTH]; /*file name for file containing auxiliary information*/
773     Char bigFileName[MAX_NAME_LENGTH]; /*file name to store byte-encoded coalesced matrix*/
774     Char lookupName[MAX_NAME_LENGTH]; /*file name to store precalculated lookup table */
775     FILE *auxiliaryfp; /*file descriptor for matrix auxiliary file*/
776     FILE *sequencesfp; /*files descriptor for file containing list of sequences*/
777     FILE *matrixnamefp; /*file descriptor for file containing matrix names*/
778     FILE *bigmatrixfile; /*file descriptor for file containing single big matrix*/
779     Int4 numProfiles; /*number of profiles*/
780     Int4 totalProfileLength; /*total length of all profiles*/
781     ScoreRow *combinedMatrix; /*combined matrix for all profiles*/
782     Char *directoryPrefix; /*directory where profile library is kept, used
783                              to reach other directories indirectly*/
784 
785     Int4Ptr seqlens;
786     Nlm_FloatHi scalingFactor; /*matrix scale to skip over in reading*/
787 
788     if (! GetArgs ("copymatrices", NUMARG, myargs)) {
789         return (1);
790     }
791 
792     if ((Boolean) myargs[1].intvalue) {
793         IMPALAPrintHelp(FALSE, 80, "copymat", stdout);
794         return(1);
795     }
796     profilesFileName = myargs[0].strvalue;
797     directoryPrefix = (Char *) MemNew(MAX_NAME_LENGTH *sizeof(char));
798     strcpy(directoryPrefix,profilesFileName);
799 
800     impalaMakeFileNames(profilesFileName, auxFileName, bigFileName,
801                         sequencesFileName, matrixFileName, NULL,
802                         directoryPrefix);
803 
804     if ((matrixnamefp = FileOpen(matrixFileName, "r")) == NULL) {
805         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with matrix file names %s\n", matrixFileName);
806         return (1);
807     }
808 
809     if ((sequencesfp = FileOpen(sequencesFileName, "r")) == NULL) {
810         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to open file with sequence file names %s\n", sequencesFileName);
811         return (1);
812     }
813 
814     if ((auxiliaryfp = FileOpen(auxFileName, "r")) == NULL) {
815         ErrPostEx(SEV_FATAL, 1, 0, "profiles: Unable to open auxiliary file %s\n", auxFileName);
816         return (1);
817     }
818 
819     /* -- SSH -- Name of matrix file depends on program - RPS or Impala */
820 
821     if((Boolean) myargs[2].intvalue) {
822         sprintf(bigFileName, "%s.rps", profilesFileName);
823     }
824 
825     if ((bigmatrixfile = FileOpen(bigFileName, "wb")) == NULL) {
826         ErrPostEx(SEV_FATAL, 1, 0, "rps-blast: Unable to open big matrix file %s\n", bigFileName);
827         return (1);
828     }
829 
830     numProfiles =  countProfiles(sequencesfp, matrixnamefp);
831     totalProfileLength = findTotalLength(auxiliaryfp, numProfiles,
832                                          &scalingFactor);
833 
834     /* -- SSH -- Additional line in matrix with -INT2_MAX values */
835     if((Boolean) myargs[2].intvalue) {
836         totalProfileLength += numProfiles;
837     }
838 
839     combinedMatrix = allocateMatrix(totalProfileLength);
840     if (NULL == combinedMatrix) {
841         ErrPostEx(SEV_FATAL, 1, 0, "copymatrices: Unable to allocate matrix with%d rows\n", totalProfileLength);
842         return (1);
843 
844     }
845     /* -- SSH -- RPS Blast data */
846     if ((Boolean) myargs[2].intvalue) {
847         seqlens = (Int4Ptr) MemNew((numProfiles +1) * sizeof(Int4));
848     } else {
849         seqlens = NULL;
850     }
851 
852     readAllMatrices(matrixnamefp, combinedMatrix, numProfiles,
853                     directoryPrefix, seqlens);
854 
855     /* -- SSH -- For RPS Blast additional info will be added to the file */
856     if ((Boolean) myargs[2].intvalue) {
857         Int4 magicNumber = RPS_MAGIC_NUMBER;
858         FileWrite(&magicNumber, sizeof(Int4), 1, bigmatrixfile);
859         FileWrite(&numProfiles, sizeof(Int4), 1, bigmatrixfile);
860         FileWrite(seqlens, sizeof(Int4), numProfiles + 1, bigmatrixfile);
861 
862         sprintf(lookupName, "%s.loo", profilesFileName);
863         RPSCreateLookupFile(combinedMatrix, numProfiles, seqlens, lookupName,
864                             scalingFactor);
865 
866         if(!RPSConcatSequences(sequencesfp, profilesFileName)) {
867             ErrPostEx(SEV_ERROR, 0,0, "Failure to concatenate sequences");
868             return 1;
869         }
870 
871     }
872 
873     FileWrite((void *) combinedMatrix[0], sizeof(ScoreRow),
874               (size_t) totalProfileLength, bigmatrixfile);
875     freeMatrix(combinedMatrix);
876     FileClose(bigmatrixfile);
877     FileClose(matrixnamefp);
878     FileClose(sequencesfp);
879     FileClose(auxiliaryfp);
880     return 0;
881 }
882