1 /* Sequence/subsequence indices: fast lookup in large sequence files by keyword.
2  *
3  * Contents:
4  *  1. Using (reading) an SSI index.
5  *  2. Creating (writing) new SSI files.
6  *  3. Portable binary i/o.
7  *  4. Unit tests.
8  *  5. Test driver.
9  *  6. Example code.
10  */
11 #include "esl_config.h"
12 
13 #include <stdio.h>
14 #include <string.h>
15 
16 #include "easel.h"
17 #include "esl_ssi.h"
18 
19 static uint32_t v30magic = 0xd3d3c9b3; /* SSI 3.0: "ssi3" + 0x80808080 */
20 static uint32_t v30swap  = 0xb3c9d3d3; /* byteswapped */
21 
22 
23 /*****************************************************************
24  *# 1. Using (reading) an SSI index.
25  *****************************************************************/
26 
27 static int  binary_search(ESL_SSI *ssi, const char *key, uint32_t klen, off_t base,
28 			  uint32_t recsize, uint64_t maxidx);
29 
30 /* Function:  esl_ssi_Open()
31  * Synopsis:  Open an SSI index as an <ESL_SSI>.
32  *
33  * Purpose:   Open the SSI index file <filename>, and returns a pointer
34  *            to the new <ESL_SSI> object in <ret_ssi>.
35  *
36  *            Caller is responsible for closing the SSI file with
37  *            <esl_ssi_Close()>.
38  *
39  * Args:      <filename>   - name of SSI index file to open.
40  *            <ret_ssi>    - RETURN: the new <ESL_SSI>.
41  *
42  * Returns:   <eslOK>        on success;
43  *            <eslENOTFOUND> if <filename> cannot be opened for reading;
44  *            <eslEFORMAT>   if it's not in correct SSI file format;
45  *            <eslERANGE>    if it uses 64-bit file offsets, and we're on a system
46  *                           that doesn't support 64-bit file offsets.
47  *
48  * Throws:    <eslEMEM> on allocation error.
49  */
50 int
esl_ssi_Open(const char * filename,ESL_SSI ** ret_ssi)51 esl_ssi_Open(const char *filename, ESL_SSI **ret_ssi)
52 {
53   ESL_SSI *ssi = NULL;
54   int      status;
55   uint32_t magic;	/* magic number that starts the SSI file */
56   uint16_t i;		/* counter over files */
57 
58   /* Initialize the SSI structure, null'ing so we can autocleanup.
59    */
60   ESL_ALLOC(ssi, sizeof(ESL_SSI));
61   ssi->fp         = NULL;
62   ssi->filename   = NULL;
63   ssi->fileformat = NULL;
64   ssi->fileflags  = NULL;
65   ssi->bpl        = NULL;
66   ssi->rpl        = NULL;
67   ssi->nfiles     = 0;
68 
69   /* Open the file.
70    */
71   status = eslENOTFOUND;
72   if ((ssi->fp = fopen(filename, "rb")) == NULL) goto ERROR;
73 
74   /* Read the magic number: make sure it's an SSI file, and determine
75    * whether it's byteswapped.
76    */
77   status = eslEFORMAT;
78   if (esl_fread_u32(ssi->fp, &magic)        != eslOK) goto ERROR;
79   if (magic != v30magic && magic != v30swap)          goto ERROR;
80   if (esl_fread_u32(ssi->fp, &(ssi->flags)) != eslOK) goto ERROR;
81   if (esl_fread_u32(ssi->fp, &(ssi->offsz)) != eslOK) goto ERROR;
82 
83   status = eslERANGE;
84   if (ssi->offsz != 4 && ssi->offsz != 8) goto ERROR;
85   if (ssi->offsz > sizeof(off_t))         goto ERROR;
86 
87   /* The header data. */
88   status = eslEFORMAT;
89   if (esl_fread_u16(ssi->fp, &(ssi->nfiles))     != eslOK) goto ERROR;
90   if (esl_fread_u64(ssi->fp, &(ssi->nprimary))   != eslOK) goto ERROR;
91   if (esl_fread_u64(ssi->fp, &(ssi->nsecondary)) != eslOK) goto ERROR;
92   if (esl_fread_u32(ssi->fp, &(ssi->flen))       != eslOK) goto ERROR;
93   if (esl_fread_u32(ssi->fp, &(ssi->plen))       != eslOK) goto ERROR;
94   if (esl_fread_u32(ssi->fp, &(ssi->slen))       != eslOK) goto ERROR;
95   if (esl_fread_u32(ssi->fp, &(ssi->frecsize))   != eslOK) goto ERROR;
96   if (esl_fread_u32(ssi->fp, &(ssi->precsize))   != eslOK) goto ERROR;
97   if (esl_fread_u32(ssi->fp, &(ssi->srecsize))   != eslOK) goto ERROR;
98 
99   if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->foffset)) != eslOK) goto ERROR;
100   if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->poffset)) != eslOK) goto ERROR;
101   if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->soffset)) != eslOK) goto ERROR;
102 
103   /* The file information.
104    * We expect the number of files to be small, so reading it once
105    * should be advantageous overall. If SSI ever had to deal with
106    * large numbers of files, you'd probably want to read file
107    * information on demand.
108    */
109   status = eslEFORMAT;
110   if (ssi->nfiles == 0) goto ERROR;
111 
112   ESL_ALLOC(ssi->filename,   sizeof(char *) * ssi->nfiles);
113   for (i = 0; i < ssi->nfiles; i++)  ssi->filename[i] = NULL;
114   ESL_ALLOC(ssi->fileformat, sizeof(uint32_t) * ssi->nfiles);
115   ESL_ALLOC(ssi->fileflags,  sizeof(uint32_t) * ssi->nfiles);
116   ESL_ALLOC(ssi->bpl,        sizeof(uint32_t) * ssi->nfiles);
117   ESL_ALLOC(ssi->rpl,        sizeof(uint32_t) * ssi->nfiles);
118 
119   /* (most) allocations done, now we read. */
120   for (i = 0; i < ssi->nfiles; i++)
121     {
122       ESL_ALLOC(ssi->filename[i], sizeof(char)* ssi->flen);
123       /* We do have to explicitly position, because header and file
124        * records may expand in the future; frecsize and foffset
125        * give us forwards compatibility.
126        */
127       status = eslEFORMAT;
128       if (fseeko(ssi->fp, ssi->foffset + (i * ssi->frecsize), SEEK_SET) != 0) goto ERROR;
129       if (fread(ssi->filename[i],sizeof(char),ssi->flen, ssi->fp)!=ssi->flen) goto ERROR;
130       if (esl_fread_u32(ssi->fp, &(ssi->fileformat[i])))                      goto ERROR;
131       if (esl_fread_u32(ssi->fp, &(ssi->fileflags[i])))                       goto ERROR;
132       if (esl_fread_u32(ssi->fp, &(ssi->bpl[i])))                             goto ERROR;
133       if (esl_fread_u32(ssi->fp, &(ssi->rpl[i])))                             goto ERROR;
134     }
135   *ret_ssi = ssi;
136   return eslOK;
137 
138  ERROR:
139   if (ssi != NULL) esl_ssi_Close(ssi);
140   *ret_ssi = NULL;
141   return status;
142 }
143 
144 
145 /* Function: esl_ssi_FindName()
146  * Synopsis: Look up a primary or secondary key.
147  *
148  * Purpose:  Looks up the string <key> in index <ssi>.
149  *           <key> can be either a primary or secondary key. If <key>
150  *           is found, <ret_fh> contains a unique handle on
151  *           the file that contains <key> (suitable for an <esl_ssi_FileInfo()>
152  *           call, or for comparison to the handle of the last file
153  *           that was opened for retrieval), and <ret_offset> contains
154  *           the offset of the sequence record in that file.
155  *
156  * Args:     <ssi>         - open index file
157  *           <key>         - name to search for
158  *           <ret_fh>      - RETURN: handle on file that key is in
159  *           <ret_roff>    - RETURN: offset of the start of that key's record
160  *           <opt_doff>    - optRETURN: data offset (may be 0 if unset)
161  *           <opt_L>       - optRETURN: length of data record (may be 0 if unset)
162  *
163  * Returns:  <eslOK>        on success;
164  *           <eslENOTFOUND> if no such key is in the index;
165  *           <eslEFORMAT>   if an fread() or fseeko() fails, which almost
166  *                          certainly reflects some kind of misformatting of
167  *                          the index.
168  *
169  * Throws:   <eslEMEM>      on allocation error.
170  */
171 int
esl_ssi_FindName(ESL_SSI * ssi,const char * key,uint16_t * ret_fh,off_t * ret_roff,off_t * opt_doff,int64_t * opt_L)172 esl_ssi_FindName(ESL_SSI *ssi, const char *key, uint16_t *ret_fh, off_t *ret_roff, off_t *opt_doff, int64_t *opt_L)
173 {
174   int       status;
175   off_t     doff;
176   int64_t   L;
177   char     *pkey   = NULL;
178 
179   /* Look in the primary keys.
180    */
181   status = binary_search(ssi, key, ssi->plen, ssi->poffset, ssi->precsize,
182 			 ssi->nprimary);
183 
184   if (status == eslOK)
185     { /* We found it as a primary key; get our data & return. */
186       status = eslEFORMAT;
187       if (esl_fread_u16(ssi->fp, ret_fh)                  != eslOK) goto ERROR;
188       if (esl_fread_offset(ssi->fp, ssi->offsz, ret_roff) != eslOK) goto ERROR;
189       if (esl_fread_offset(ssi->fp, ssi->offsz, &doff)    != eslOK) goto ERROR;
190       if (esl_fread_i64   (ssi->fp, &L)                   != eslOK) goto ERROR;
191     }
192   else if (status == eslENOTFOUND)
193     { /* Not in the primary keys? OK, try the secondary keys. */
194       if (ssi->nsecondary > 0) {
195 	if ((status = binary_search(ssi, key, ssi->slen, ssi->soffset, ssi->srecsize, ssi->nsecondary)) != eslOK) goto ERROR;
196 
197 	/* We have the secondary key; flip to its primary key, then look that up. */
198 	ESL_ALLOC(pkey, sizeof(char) * ssi->plen);
199 	status = eslEFORMAT;
200 	if (fread(pkey, sizeof(char), ssi->plen, ssi->fp) != ssi->plen) goto ERROR;
201 	if ((status = esl_ssi_FindName(ssi, pkey, ret_fh, ret_roff, &doff, &L)) != eslOK) goto ERROR;
202       } else goto ERROR;	/* no secondary keys? pass along the ENOTFOUND error. */
203     } else goto ERROR;	/* status from binary search was an error code. */
204 
205   if (pkey != NULL) free(pkey);
206   if (opt_doff != NULL) *opt_doff = doff;
207   if (opt_L    != NULL) *opt_L    = L;
208   return eslOK;
209 
210  ERROR:
211   if (pkey != NULL) free(pkey);
212   *ret_fh   = 0;
213   *ret_roff = 0;
214   if (opt_doff != NULL) *opt_doff = 0;
215   if (opt_L    != NULL) *opt_L    = 0;
216   return status;
217 }
218 
219 
220 
221 /* Function:  esl_ssi_FindNumber()
222  * Synopsis:  Look up the n'th primary key.
223  *
224  * Purpose:   Looks up primary key number <nkey> in the open index
225  *            <ssi>.  <nkey> ranges from <0..ssi->nprimary-1>. When
226  *            key <nkey> is found, any/all of several optional
227  *            arguments point to results. <*opt_fh> contains a unique
228  *            handle on the file that contains that key (suitable for
229  *            an <esl_ssi_FileInfo()> call, or for comparison to the
230  *            handle of the last file that was opened for retrieval).
231  *            <*opt_roff> contains the record offset; <*opt_doff>
232  *            contains the data offset; <*opt_L> contains the record
233  *            length; and <*opt_pkey> points to the primary key name
234  *            (a string, allocated here, that the caller becomes
235  *            responsible for free'ing).
236  *
237  * Args:      <ssi>        - open index file
238  *            <nkey>       - primary key number to retrieve (0..nprimary-1)
239  *            <opt_fh>     - optRETURN: handle on file that key is in
240  *            <opt_roff>   - optRETURN: offset of the start of that key's record
241  *            <opt_doff>   - optRETURN: data offset (may be 0 if unset)
242  *            <opt_L>      - optRETURN: length of data record (may be 0 if unset)
243  *            <opt_pkey>   - optRETURN: primary key name (allocated here; caller must free)
244  *
245  * Returns:   <eslOK>        on success;
246  *            <eslENOTFOUND> if there is no sequence record <nkey>;
247  *            <eslEFORMAT>   if a read or a seek fails, probably indicating
248  *                           some kind of file misformatting.
249  *
250  * Throws:    <eslEMEM> on allocation error.
251  */
252 int
esl_ssi_FindNumber(ESL_SSI * ssi,int64_t nkey,uint16_t * opt_fh,off_t * opt_roff,off_t * opt_doff,int64_t * opt_L,char ** opt_pkey)253 esl_ssi_FindNumber(ESL_SSI *ssi, int64_t nkey, uint16_t *opt_fh, off_t *opt_roff, off_t *opt_doff, int64_t *opt_L, char **opt_pkey)
254 {
255   int      status;
256   uint16_t fh;
257   off_t    doff, roff;
258   uint64_t L;
259   char    *pkey = NULL;
260 
261   if (nkey >= ssi->nprimary) { status = eslENOTFOUND; goto ERROR; }
262   ESL_ALLOC(pkey, sizeof(char) * ssi->plen);
263 
264   status = eslEFORMAT;
265   if (fseeko(ssi->fp, ssi->poffset+ssi->precsize*nkey, SEEK_SET)!= 0) goto ERROR;
266   if (fread(pkey, sizeof(char), ssi->plen, ssi->fp)   != ssi->plen)   goto ERROR;
267   if (esl_fread_u16(ssi->fp, &fh)                     != eslOK)       goto ERROR;
268   if (esl_fread_offset(ssi->fp, ssi->offsz, &roff)    != eslOK)       goto ERROR;
269   if (esl_fread_offset(ssi->fp, ssi->offsz, &doff)    != eslOK)       goto ERROR;
270   if (esl_fread_u64   (ssi->fp, &L)                   != eslOK)       goto ERROR;
271 
272   if (opt_fh   != NULL) *opt_fh   = fh;
273   if (opt_roff != NULL) *opt_roff = roff;
274   if (opt_doff != NULL) *opt_doff = doff;
275   if (opt_L    != NULL) *opt_L    = L;
276   if (opt_pkey != NULL) *opt_pkey = pkey; else free(pkey);
277   return eslOK;
278 
279  ERROR:
280   if (pkey     != NULL) free(pkey);
281   if (opt_fh   != NULL) *opt_fh   = 0;
282   if (opt_roff != NULL) *opt_roff = 0;
283   if (opt_doff != NULL) *opt_doff = 0;
284   if (opt_L    != NULL) *opt_L    = 0;
285   if (opt_pkey != NULL) *opt_pkey = NULL;
286   return status;
287 }
288 
289 
290 /* Function: esl_ssi_FindSubseq()
291  * Synopsis: Look up a specific subsequence's start.
292  * Date:     SRE, Mon Jan  1 19:49:31 2001 [St. Louis]
293  *
294  * Purpose:  Fast subsequence retrieval: look up a primary or secondary
295  *           <key> in the open index <ssi>, and ask for the nearest data
296  *           offset to a subsequence starting at residue
297  *           <requested_start> in the sequence (numbering the sequence
298  *           <1..L>).  If <key> is found, on return, <ret_fh> contains
299  *           a unique handle on the file that contains <key>;
300  *           <ret_roff> contains the disk offset to the start of the
301  *           sequence record; <ret_doff> contains the disk offset
302  *           (see below); and <ret_actual_start) contains the coordinate
303  *           (1..L) of the first valid residue at or after
304  *           <data_offset>. <ret_actual_start> is $\leq$
305  *           <requested_start>.
306  *
307  *           Depending on the file's characteristics, there are four
308  *           possible outcomes.
309  *
310  *           If the file has the <eslSSI_FASTSUBSEQ> flag set, a data
311  *           offset was indexed for this key, and the data can be
312  *           indexed at single residue resolution (because the file's
313  *           lines contain only residues, no spaces), then <ret_doff>
314  *           is exactly the position of residue <requested_start> on
315  *           disk, and <ret_actual_start> is <requested_start>.
316  *
317  *           If the file has the <eslSSI_FASTSUBSEQ> flag set, a data
318  *           offset was indexed for this key, but the data can only be
319  *           indexed at line resolution (because at least some of the
320  *           file's lines contain spaces), then <ret_doff> is the
321  *           position of the start of the line that <requested_start>
322  *           is on, and <ret_actual_start> is the coord <1..L> of the
323  *           first residue on that line.
324  *
325  *           If the file does not have the <eslSSI_FASTSUBSEQ> flag
326  *           set (because lines contain a variable number of residues
327  *           and/or bytes), but a data offset was indexed for this
328  *           key, then we can still at least return that data offset,
329  *           but the caller is going to have to start from the
330  *           beginning of the data and read residues until it reaches
331  *           the desired <requested_start>. Now <ret_doff> is the
332  *           offset to the start of the first line of the sequence
333  *           data, and <ret_actual_start> is 1.
334  *
335  *           If the key does not have a data offset indexed at all,
336  *           then regardless of the file's <eslSSI_FASTSUBSEQ>
337  *           setting, we can't calculate even the position of the
338  *           first line. In this case, <ret_doff> is 0 (for
339  *           unset/unknown), and <ret_actual_start> is <1>.
340  *
341  *           A caller that's going to position the disk and read a
342  *           subseq must check for all four possible outcomes (pardon
343  *           redundancy with the above, but just to be clear, from the
344  *           caller's perspective now):
345  *
346  *           If <ret_doff> is 0, no data offset information can be
347  *           calculated; the caller can still use <ret_roff> to
348  *           position the disk to the start of <key>'s record, but it
349  *           will need to parse the header to find the start of the
350  *           sequence data; then it will need to parse the sequence
351  *           data, skipping to residue <requested start>.
352  *
353  *           If <ret_doff> is valid ($>0$), and <ret_actual_start> is
354  *           1, then caller may use <ret_doff> to position the disk to
355  *           the start of the first sequence data line, but will still
356  *           need to parse all the sequence data, counting and
357  *           skipping to residue <requested start>. This is equivalent
358  *           to (and in practice, not much more efficient than)
359  *           positioning to the record start and parsing the header to
360  *           locate the sequence data start.
361  *
362  *           If <ret_doff> is valid ($>0$), and <ret_actual_start> is
363  *           $>1$ but $<$ <requested_start>, then <ret_doff> is the
364  *           offset to the first byte of a line on which the
365  *           subsequence begins. The caller can position the disk
366  *           there, then start parsing, skipping <requested_start -
367  *           *ret_actual_start> residues to reach the
368  *           <requested_start>. (In the case where the subsequence
369  *           begins on the first line, then <ret_actual_start> will be
370  *           1, and the caller will have to handle this as the case
371  *           above.)
372  *
373  *           If <<ret_doff> is valid ($>0$), and <ret_actual_start> is
374  *           $=$ <requested_start>, then <ret_doff> is the offset to a
375  *           byte in the file, such that the requested subsequence
376  *           starts at the next valid residue at or after that
377  *           position.  (The <ret_doff> would usually be exactly the
378  *           first residue of the subsequence, because we used single
379  *           residue resolution arithmetic to find it, but there's a
380  *           case where <requested_start> happens to be the first
381  *           residue of a line and we calculated <ret_doff> using
382  *           line-resolution arithmetic; in this latter case,
383  *           <ret_doff> could be pointing at a space before the first
384  *           subseq residue.) The caller may position the disk there
385  *           and start parsing immediately; the first valid residue
386  *           will be the start of the subsequence.
387  *
388  * Args:     <ssi>             - open index file
389  *           <key>             - primary or secondary key to find
390  *           <requested_start> - residue we'd like to start at (1..L)
391  *           <ret_fh>          - RETURN: handle for file the key is in
392  *           <ret_roff>        - RETURN: offset to start of sequence record
393  *           <ret_doff>        - RETURN: offset to closest start of subseq data, or 0.
394  *           <ret_L>           - RETURN: length of <key> in residues (may be 0 if unset)
395  *           <ret_actual_start>- RETURN: coord (1..L) of residue at <ret_doff>
396  *
397  * Returns:  <eslOK>         on any of the four successful outcomes.
398  *           <eslENOTFOUND>  if no such key is found in the index;
399  *           <eslEFORMAT> on a read or seek failure, presumably meaning that
400  *                        the file is misformatted somehow;
401  *           <eslERANGE>  if <requested_start> isn't somewhere in the range
402  *                        <1..len> for the target sequence.
403  *
404  * Throws:   <eslEMEM> on allocation error.
405  */
406 int
esl_ssi_FindSubseq(ESL_SSI * ssi,const char * key,int64_t requested_start,uint16_t * ret_fh,off_t * ret_roff,off_t * ret_doff,int64_t * ret_L,int64_t * ret_actual_start)407 esl_ssi_FindSubseq(ESL_SSI *ssi, const char *key, int64_t requested_start,
408 		   uint16_t *ret_fh, off_t *ret_roff, off_t *ret_doff, int64_t *ret_L, int64_t *ret_actual_start)
409 {
410   int      status;
411   uint64_t r, b, i, l;	/* tmp variables for "clarity", to match docs */
412 
413   /* Look up the key by name.
414    */
415   if ((status = esl_ssi_FindName(ssi, key, ret_fh, ret_roff, ret_doff, ret_L)) != eslOK) goto ERROR;
416   if (requested_start < 0 || requested_start > *ret_L) { status = eslERANGE; goto ERROR; }
417 
418   /* Do we have a data offset for this key? If not, we're case 4.    */
419   /* Can we do fast subseq lookup on this file? If no, we're case 3. */
420   if (*ret_doff == 0 || ! (ssi->fileflags[*ret_fh] & eslSSI_FASTSUBSEQ))
421     {
422       *ret_actual_start = 1;
423       return eslOK;
424     }
425 
426   /* Set up tmp variables for clarity of equations below,
427    * and to make them match tex documentation
428    */
429   r = ssi->rpl[*ret_fh];         /* residues per line */
430   b = ssi->bpl[*ret_fh];         /* bytes per line    */
431   i = requested_start;	         /* start position 1..L */
432   l = (i-1)/r;		         /* data line # (0..) that the residue is on */
433   if (r == 0 || b == 0) { status = eslEINVAL; goto ERROR; }
434 
435   /* When b = r+1, there's nothing but sequence on each data line (and the \0).
436    * In this case, we know we can find each residue precisely: outcome #1.
437    */
438   if (b == r+1)
439     {
440       *ret_doff        += l*b + (i-1)%r;
441       *ret_actual_start = requested_start;
442     }
443   /* else, there's other stuff on seq lines - probably spaces - so the best
444    * we can do (without figuring out the spacing pattern and checking that
445    * it's consistent everywhere) is to position at start of relevant line.
446    */
447   else
448     {
449       *ret_doff         += l*b;
450       *ret_actual_start = 1 + l*r;
451     }
452   return eslOK;
453 
454  ERROR:
455   *ret_fh           = 0;
456   *ret_roff         = 0;
457   *ret_doff         = 0;
458   *ret_L            = 0;
459   *ret_actual_start = 0;
460   return status;
461 }
462 
463 
464 /* Function: esl_ssi_FileInfo()
465  * Synopsis: Retrieve a file name and format code.
466  * Date:     SRE, Tue Jan  2 10:31:01 2001 [St. Louis]
467  *
468  * Purpose:  Given a file number <fh> in an open index file
469  *           <ssi>, retrieve file name <ret_filename> and
470  *           the file format <ret_format>.
471  *
472  *           <ret_filename> is a pointer to a string maintained
473  *           internally by <ssi>. It should not be free'd;
474  *           <esl_ssi_Close(ssi)> will take care of it.
475  *
476  * Args:     <ssi>          - open index file
477  *           <fh>           - handle on file to look up
478  *           <ret_filename> - RETURN: name of file n
479  *           <ret_format>   - RETURN: format code for file n
480  *
481  * Returns:  <eslOK> on success.
482  *
483  * Throws:   <eslEINVAL> if there is no such file number <fh>.
484  */
485 int
esl_ssi_FileInfo(ESL_SSI * ssi,uint16_t fh,char ** ret_filename,int * ret_format)486 esl_ssi_FileInfo(ESL_SSI *ssi, uint16_t fh, char **ret_filename, int *ret_format)
487 {
488   int status;
489 
490   if (fh >= ssi->nfiles) ESL_XEXCEPTION(eslEINVAL, "no such file number");
491   *ret_filename = ssi->filename[fh];
492   *ret_format   = ssi->fileformat[fh];
493   return eslOK;
494 
495  ERROR:
496   *ret_filename = NULL;
497   *ret_format   = 0;
498   return status;
499 }
500 
501 
502 /* Function:  esl_ssi_Close()
503  * Synopsis:  Close an SSI index.
504  *
505  * Purpose:   Close an open SSI index <ssi>.
506  *
507  * Args:      <ssi>   - an open SSI index file.
508  */
509 void
esl_ssi_Close(ESL_SSI * ssi)510 esl_ssi_Close(ESL_SSI *ssi)
511 {
512   int i;
513 
514   if (ssi == NULL) return;
515 
516   if (ssi->fp != NULL) fclose(ssi->fp);
517   if (ssi->filename != NULL) {
518     for (i = 0; i < ssi->nfiles; i++)
519       if (ssi->filename[i] != NULL) free(ssi->filename[i]);
520     free(ssi->filename);
521   }
522   if (ssi->fileformat != NULL) free(ssi->fileformat);
523   if (ssi->fileflags  != NULL) free(ssi->fileflags);
524   if (ssi->bpl        != NULL) free(ssi->bpl);
525   if (ssi->rpl        != NULL) free(ssi->rpl);
526   free(ssi);
527 }
528 
529 
530 /* binary_search()
531  * Date:     SRE, Sun Dec 31 16:05:03 2000 [St. Louis]
532  *
533  * Purpose:  Find <key> in an SSI index, by a binary search
534  *           in an alphabetically sorted list of keys. If successful,
535  *           return <eslOK>, and the index file is positioned to read
536  *           the rest of the data for that key. If unsuccessful,
537  *           return <eslFAIL>, and the positioning of the index file
538  *           is left in an undefined state.
539  *
540  * Args:     <ssi>     - an open ESL_SSI
541  *           <key>     - key to find
542  *           <klen>    - key length to allocate (plen or slen from ssi)
543  *           <base>    - base offset (poffset or soffset)
544  *           <recsize> - size of each key record in bytes (precsize or srecsize)
545  *           <maxidx>  - # of keys (nprimary or nsecondary)
546  *
547  * Returns:  <eslOK> on success, and leaves file positioned for reading remaining
548  *           data for the key.
549  *
550  *           <eslENOTFOUND> if <key> is not found.
551  *           <eslEFORMAT>   if an fread() or fseeko() fails, probably indicating
552  *                          some kind of misformatting of the index file.
553  *
554  * Throws:   <eslEMEM> on allocation failure.
555  *
556  */
557 static int
binary_search(ESL_SSI * ssi,const char * key,uint32_t klen,off_t base,uint32_t recsize,uint64_t maxidx)558 binary_search(ESL_SSI *ssi, const char *key, uint32_t klen, off_t base,
559 	      uint32_t recsize, uint64_t maxidx)
560 {
561   char        *name;
562   uint64_t     left, right, mid;
563   int          cmp;
564   int          status;
565 
566   if (maxidx == 0) return eslENOTFOUND; /* special case: empty index */
567 
568   ESL_ALLOC(name, (sizeof(char)*klen));
569 
570   left  = 0;
571   right = maxidx-1;
572   while (1) {			/* A binary search: */
573     mid   = (left+right) / 2;	/* careful here. left+right potentially overflows if
574 				   we didn't limit unsigned vars to signed ranges. */
575     status = eslEFORMAT;
576     if (fseeko(ssi->fp, base + recsize*mid, SEEK_SET) != 0)    goto ERROR;
577     if (fread(name, sizeof(char), klen, ssi->fp)      != klen) goto ERROR;
578 
579     status = eslENOTFOUND;
580     cmp = strcmp(name, key);
581     if      (cmp == 0) break;	             /* found it!               */
582     else if (left >= right) goto ERROR;      /* no such key             */
583     else if (cmp < 0)       left  = mid+1;   /* it's still right of mid */
584     else if (cmp > 0) {
585       if (mid == 0) goto ERROR;              /* beware left edge case   */
586       else right = mid-1;                    /* it's left of mid        */
587     }
588   }
589 
590   if (name != NULL) free(name);
591   return eslOK;  /* and ssi->fp is positioned to read the record. */
592 
593  ERROR:
594   if (name != NULL) free(name);
595   return status;
596 }
597 
598 
599 /*****************************************************************
600  *# 2. Creating (writing) new SSI files.
601  *****************************************************************/
602 static int current_newssi_size(const ESL_NEWSSI *ns);
603 static int activate_external_sort(ESL_NEWSSI *ns);
604 static int parse_pkey(char *buf, ESL_PKEY *pkey);
605 static int parse_skey(char *buf, ESL_SKEY *skey);
606 static int pkeysort(const void *k1, const void *k2);
607 static int skeysort(const void *k1, const void *k2);
608 
609 /* Function:  esl_newssi_Open()
610  * Synopsis:  Create a new <ESL_NEWSSI>.
611  *
612  * Purpose:   Creates and returns a <ESL_NEWSSI>, in order to create a
613  *            new SSI index file.
614  *
615  * Returns:   <eslOK> on success, and <*ret_newssi> is a pointer to a
616  *            new <ESL_NEWSSI> structure.
617  *
618  *            Returns <eslENOTFOUND> if <ssifile> can't be opened.
619  *
620  *            Returns <eslEOVERWRITE> if <allow_overwrite> is <FALSE>
621  *            and <ssifile> (or any necessary tmp files) already
622  *            exist, to block overwriting of an existing SSI file.
623  *
624  * Throws:    <eslEMEM> on allocation error.
625  */
626 int
esl_newssi_Open(const char * ssifile,int allow_overwrite,ESL_NEWSSI ** ret_newssi)627 esl_newssi_Open(const char *ssifile, int allow_overwrite, ESL_NEWSSI **ret_newssi)
628 {
629   ESL_NEWSSI *ns = NULL;
630   int i;
631   int status;
632 
633   ESL_ALLOC(ns, sizeof(ESL_NEWSSI));
634   ns->ssifile    = NULL;
635   ns->ssifp      = NULL;
636   ns->external   = FALSE;	    /* we'll switch to external sort if...       */
637   ns->max_ram    = eslSSI_MAXRAM;   /* ... if we exceed this memory limit in MB. */
638   ns->filenames  = NULL;
639   ns->fileformat = NULL;
640   ns->bpl        = NULL;
641   ns->rpl        = NULL;
642   ns->flen       = 0;
643   ns->nfiles     = 0;
644   ns->pkeys      = NULL;
645   ns->plen       = 0;
646   ns->nprimary   = 0;
647   ns->ptmpfile   = NULL;
648   ns->ptmp       = NULL;
649   ns->skeys      = NULL;
650   ns->slen       = 0;
651   ns->nsecondary = 0;
652   ns->stmpfile   = NULL;
653   ns->stmp       = NULL;
654   ns->errbuf[0]  = '\0';
655 
656   if ((status = esl_strdup(ssifile, -1, &(ns->ssifile)))    != eslOK) goto ERROR;
657   if ((status = esl_strdup(ssifile, -1, &(ns->ptmpfile)))   != eslOK) goto ERROR;
658   if ((status = esl_strdup(ssifile, -1, &(ns->stmpfile)))   != eslOK) goto ERROR;
659   if ((status = esl_strcat(&ns->ptmpfile, -1, ".1", 2))     != eslOK) goto ERROR;
660   if ((status = esl_strcat(&ns->stmpfile, -1, ".2", 2))     != eslOK) goto ERROR;
661 
662   if (! allow_overwrite)
663     {
664       if (esl_FileExists(ssifile)      ||
665 	  esl_FileExists(ns->ptmpfile) ||
666 	  esl_FileExists(ns->stmpfile))
667 	{ status = eslEOVERWRITE; goto ERROR; }
668     }
669 
670   if ((ns->ssifp = fopen(ssifile, "w")) == NULL)  { status = eslENOTFOUND; goto ERROR; }
671 
672   ESL_ALLOC(ns->filenames,  sizeof(char *)   * eslSSI_FCHUNK);
673   for (i = 0; i < eslSSI_FCHUNK; i++)
674     ns->filenames[i] = NULL;
675   ESL_ALLOC(ns->fileformat, sizeof(uint32_t) * eslSSI_FCHUNK);
676   ESL_ALLOC(ns->bpl,        sizeof(uint32_t) * eslSSI_FCHUNK);
677   ESL_ALLOC(ns->rpl,        sizeof(uint32_t) * eslSSI_FCHUNK);
678   ESL_ALLOC(ns->pkeys,      sizeof(ESL_PKEY) * eslSSI_KCHUNK);
679   for (i = 0; i < eslSSI_KCHUNK; i++)
680     ns->pkeys[i].key = NULL;
681   ESL_ALLOC(ns->skeys,      sizeof(ESL_SKEY) * eslSSI_KCHUNK);
682   for (i = 0; i < eslSSI_KCHUNK; i++) {
683     ns->skeys[i].key  = NULL;
684     ns->skeys[i].pkey = NULL;
685   }
686   *ret_newssi = ns;
687   return eslOK;
688 
689  ERROR:
690   esl_newssi_Close(ns);	/* free the damaged structure */
691   return status;
692 }
693 
694 
695 /* Function:  esl_newssi_AddFile()
696  * Synopsis:  Add a filename to a growing index.
697  *
698  * Purpose:   Registers the file <filename> into the new index <ns>,
699  *            along with its format code <fmt>. The index assigns it
700  *            a unique handle, which it returns in <ret_fh>. This
701  *            handle is needed when registering primary keys.
702  *
703  *            Caller should make sure that the same file isn't registered
704  *            twice; this function doesn't check.
705  *
706  * Args:      <ns>         - new ssi index under construction.
707  *            <filename>   - filename to add to the index.
708  *            <fmt>        - format code to associate with <filename> (or 0)
709  *            <ret_fh>     - RETURN: filehandle associated with <filename>
710  *
711  * Returns:   <eslOK> on success;
712  *            <eslERANGE> if registering this file would exceed the
713  *                        maximum number of indexed files.
714  *
715  * Throws:    <eslEMEM> on allocation or reallocation error.
716  */
717 int
esl_newssi_AddFile(ESL_NEWSSI * ns,const char * filename,int fmt,uint16_t * ret_fh)718 esl_newssi_AddFile(ESL_NEWSSI *ns, const char *filename, int fmt, uint16_t *ret_fh)
719 {
720   int      status;
721   uint16_t fh;
722   int      i;
723   int      n;
724 
725   if (ns->nfiles >= eslSSI_MAXFILES) ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded the maximum number of files an SSI index can store");
726 
727   n = strlen(filename);
728   if ((n+1) > ns->flen) ns->flen = n+1;
729 
730   if ((status = esl_FileTail(filename, FALSE, &(ns->filenames[ns->nfiles]))) != eslOK) goto ERROR;
731 
732   ns->fileformat[ns->nfiles] = fmt;
733   ns->bpl[ns->nfiles]        = 0;
734   ns->rpl[ns->nfiles]        = 0;
735   fh                         = ns->nfiles;   /* handle is simply = file number */
736   ns->nfiles++;
737 
738   if (ns->nfiles % eslSSI_FCHUNK == 0) {
739     ESL_REALLOC(ns->filenames,  sizeof(char *)   * (ns->nfiles+eslSSI_FCHUNK));
740     for (i = ns->nfiles; i < ns->nfiles+eslSSI_FCHUNK; i++) ns->filenames[i] = NULL;
741     ESL_REALLOC(ns->fileformat, sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
742     ESL_REALLOC(ns->bpl,        sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
743     ESL_REALLOC(ns->rpl,        sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
744   }
745 
746   *ret_fh = fh;
747   return eslOK;
748 
749  ERROR:
750   *ret_fh = 0;
751   return status;
752 }
753 
754 
755 
756 /* Function:  esl_newssi_SetSubseq()
757  * Synopsis:  Declare that file is suitable for fast subseq lookup.
758  *
759  * Purpose:   Declare that the file associated with handle <fh> is
760  *            suitable for fast subsequence lookup, because it has
761  *            a constant number of residues and bytes per (nonterminal)
762  *            data line, <rpl> and <bpl>, respectively.
763  *
764  *            Caller is responsible for this being true: <rpl> and
765  *            <bpl> must be constant for every nonterminal line of
766  *            every sequence in this file.
767  *
768  * Args:      <ns>   - ssi index under construction
769  *            <fh>   - handle on file to set fast subseq lookup on
770  *            <bpl>  - constant bytes per nonterminal line in <fh>
771  *            <rpl>  - constant residues per nonterminal line in <fh>
772  *
773  * Returns:   <eslOK> on success.
774  *
775  * Throws:    <eslEINVAL> on invalid argument(s).
776  */
777 int
esl_newssi_SetSubseq(ESL_NEWSSI * ns,uint16_t fh,uint32_t bpl,uint32_t rpl)778 esl_newssi_SetSubseq(ESL_NEWSSI *ns, uint16_t fh, uint32_t bpl, uint32_t rpl)
779 {
780   int status;
781 
782   if (fh >= ns->nfiles)      ESL_XEXCEPTION(eslEINVAL, "invalid file number");
783   if (bpl <= 0 || rpl <= 0)  ESL_XEXCEPTION(eslEINVAL, "invalid bpl or rpl");
784   ns->bpl[fh] = bpl;
785   ns->rpl[fh] = rpl;
786   return eslOK;
787 
788  ERROR:
789   return status;
790 }
791 
792 
793 /* Function: esl_newssi_AddKey()
794  * Synopsis: Add a primary key to a growing index.
795  * Date:     SRE, Tue Jan  2 11:50:54 2001 [St. Louis]
796  *
797  * Purpose:  Register primary key <key> in new index <ns>, while telling
798  *           the index that this primary key is in the file associated
799  *           with filehandle <fh> (the handle returned by a previous call
800  *           to <esl_newssi_AddFile()>); that its record starts at
801  *           offset <r_off> in the file; that its data (usually
802  *           sequence data) starts at offset <d_off> in the file (i.e.
803  *           after any record header); and that the record's data is
804  *           of length <L> (usually, the record is a sequence, and <L>
805  *           is its length in residues).
806  *
807  *           The data length <L> is technically optional as far as SSI
808  *           is concerned; <L> may be passed as 0 to leave it
809  *           unset. However, functions in the <sqio> module that use
810  *           SSI indices will assume that <L> is available.
811  *
812  *           <d_off> is also optional; it may be passed as <0> to
813  *           leave it unset. If provided, <d_off> gives an offset to
814  *           the data portion of the record. The interpretation of
815  *           this data offset may be implementation-defined and may
816  *           depend on the format of the datafile; for example, in how
817  *           <sqio> uses SSI indices, <d_off> is the offset to the
818  *           start of the first sequence line.
819  *
820  *           Both <d_off> and <L> must be provided, and additionally
821  *           <eslSSI_FASTSUBSEQ> must be set for this file, for fast
822  *           subsequence lookup to work.
823  *
824  * Args:     <ns>     - active index
825  *           <key>    - primary key to add
826  *           <fh>     - handle on file that this key's in
827  *           <r_off>  - offset to start of record
828  *           <d_off>  - offset to start of sequence data, or 0
829  *           <L>      - length of sequence, or 0
830  *
831  * Returns:  <eslOK>        on success;
832  *           <eslERANGE>    if registering this key would exceed the maximum
833  *                          number of primary keys;
834  *           <eslENOTFOUND> if we needed to open external tmp files, but
835  *                          the attempt to open them failed.
836  *
837  * Throws:   <eslEINVAL> on an invalid argument;
838  *           <eslEMEM>   on allocation failure;
839  *           <eslEWRITE> on any system error writing to tmp file, such
840  *                       as filling the filesystem.
841  */
842 int
esl_newssi_AddKey(ESL_NEWSSI * ns,const char * key,uint16_t fh,off_t r_off,off_t d_off,int64_t L)843 esl_newssi_AddKey(ESL_NEWSSI *ns, const char *key, uint16_t fh,
844 		  off_t r_off, off_t d_off, int64_t L)
845 {
846   int status;
847   int i;
848   int n;			/* a string length */
849 
850   if (fh >= eslSSI_MAXFILES)           ESL_XEXCEPTION(eslEINVAL, "invalid fh");
851   if (ns->nprimary >= eslSSI_MAXKEYS)  ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded maximum number of primary keys allowed");
852 
853   /* Before adding the key: check how big our index is.
854    * If it's getting too large, switch to external mode.
855    */
856   if (!ns->external && current_newssi_size(ns) >= ns->max_ram)
857     if ((status = activate_external_sort(ns)) != eslOK) goto ERROR;
858 
859   /* Update maximum pkey length, if needed. (Inclusive of '\0').
860    */
861   n = strlen(key)+1;
862   if (n > ns->plen) ns->plen = n;
863 
864   /* External mode? Simply append to disk...
865    */
866   if (ns->external)
867     {
868       if (sizeof(off_t) == 4) {
869 	if (fprintf(ns->ptmp, "%s\t%d\t%" PRIu32 "\t%" PRIu32 "\t%" PRIi64 "\n",
870 		    key, fh, (uint32_t) r_off, (uint32_t) d_off, L) <= 0)
871 	  ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
872       } else {
873 	if (fprintf(ns->ptmp, "%s\t%d\t%" PRIu64 "\t%" PRIu64 "\t%" PRIi64 "\n",
874 		    key, fh, (uint64_t) r_off, (uint64_t) d_off, L) <= 0)
875 	  ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
876       }
877       ns->nprimary++;
878     }
879   else
880     {
881       /* Else: internal mode, keep keys in memory...
882        */
883       if ((status = esl_strdup(key, n, &(ns->pkeys[ns->nprimary].key))) != eslOK) goto ERROR;
884       ns->pkeys[ns->nprimary].fnum  = fh;
885       ns->pkeys[ns->nprimary].r_off = r_off;
886       ns->pkeys[ns->nprimary].d_off = d_off;
887       ns->pkeys[ns->nprimary].len   = L;
888       ns->nprimary++;
889 
890       /* Reallocate as needed. */
891       if (ns->nprimary % eslSSI_KCHUNK == 0) {
892 	ESL_REALLOC(ns->pkeys, sizeof(ESL_PKEY) * (ns->nprimary+eslSSI_KCHUNK));
893 	for (i = ns->nprimary; i < ns->nprimary + eslSSI_KCHUNK; i++)
894 	  ns->pkeys[i].key = NULL;
895       }
896     }
897   return eslOK;
898 
899  ERROR:
900   return status;
901 }
902 
903 /* Function:  esl_newssi_AddAlias()
904  * Synopsis:  Add a secondary key (alias) to a growing index.
905  *
906  * Purpose:   Registers secondary key <alias> in index <ns>, and
907  *            map it to the primary key <key>. <key> must already
908  *            have been registered. That is, when someone looks up <alias>,
909  *            we'll retrieve record <key>.
910  *
911  * Args:      <ns>    - ssi index being constructed
912  *            <alias> - secondary key to register
913  *            <key>   - primary key to associate with <skey>.
914  *
915  * Returns:   <eslOK>        on success;
916  *            <eslERANGE>    if registering this key would exceed the maximum
917  *                           number of secondary keys that can be stored;
918  *            <eslENOTFOUND> if we needed to open external tmp files, but
919  *                           the attempt to open them failed.
920  *
921  * Throws:    <eslEWRITE>   on any system error writing to tmp file, such
922  *                          as running out of space on the device.
923  */
924 int
esl_newssi_AddAlias(ESL_NEWSSI * ns,const char * alias,const char * key)925 esl_newssi_AddAlias(ESL_NEWSSI *ns, const char *alias, const char *key)
926 {
927   int status;
928   int i;
929   int n;			/* a string length */
930 
931   if (ns->nsecondary >= eslSSI_MAXKEYS) ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded maximum number of secondary keys allowed");
932 
933   /* Before adding the key: check how big our index is.
934    * If it's getting too large, switch to external mode.
935    */
936   if (!ns->external && current_newssi_size(ns) >= ns->max_ram)
937     if ((status = activate_external_sort(ns)) != eslOK) goto ERROR;
938 
939   /* Update maximum secondary key length, if necessary. */
940   n = strlen(alias)+1;
941   if (n > ns->slen) ns->slen = n;
942 
943   /* if external mode: write info to disk. */
944   if (ns->external)
945     {
946       if (fprintf(ns->stmp, "%s\t%s\n", alias, key) <= 0) ESL_XEXCEPTION_SYS(eslEWRITE, "ssi alias tmp file write failed");
947       ns->nsecondary++;
948     }
949   else
950     { /* else, internal mode... store info in memory. */
951       if ((status = esl_strdup(alias, n, &(ns->skeys[ns->nsecondary].key))) != eslOK) goto ERROR;
952       if ((status = esl_strdup(key, -1, &(ns->skeys[ns->nsecondary].pkey))) != eslOK) goto ERROR;
953       ns->nsecondary++;
954 
955       if (ns->nsecondary % eslSSI_KCHUNK == 0) {
956 	ESL_REALLOC(ns->skeys, sizeof(ESL_SKEY) * (ns->nsecondary+eslSSI_KCHUNK));
957 	for (i = ns->nsecondary; i < ns->nsecondary+eslSSI_KCHUNK; i++) {
958 	  ns->skeys[i].key  = NULL;
959 	  ns->skeys[i].pkey = NULL;
960 	}
961       }
962     }
963   return eslOK;
964 
965  ERROR:
966   return status;
967 }
968 
969 
970 /* Function:  esl_newssi_Write()
971  * Synopsis:  Save a new index to an SSI file.
972  *
973  * Purpose:   Writes the complete index <ns> in SSI format to its file,
974  *            and closes the file.
975  *
976  *            Handles all necessary overhead of sorting the primary and
977  *            secondary keys, including any externally sorted tmpfiles that
978  *            may have been needed for large indices.
979  *
980  *            You only <_Write()> once. The open SSI file is closed.
981  *            After calling <_Write()>, you should <_Close()> the
982  *            <ESL_NEWSSI>.
983  *
984  *            Verifies that all primary and secondary keys are unique.
985  *            If not, returns <eslEDUP>.
986  *
987  *            On any error, the SSI file <ns->ssifile> is deleted.
988  *
989  * Args:      <ns>  - new SSI index to write
990  *
991  * Returns:   <eslOK>       on success;
992  *            <eslEDUP>     if primary or secondary keys aren't all unique
993  *            <eslERANGE>   if index size exceeds system's maximum file size;
994  *            <eslESYS>     if any of the steps of an external sort fail.
995  *
996  * Throws:    <eslEINVAL> on invalid argument, including too-long tmpfile names,
997  *                        or trying to _Write() the <ESL_NEWSSI> more than once;
998  *            <eslEMEM>   on buffer allocation failure;
999  *            <eslEWRITE> on any system write failure, including filled disk.
1000  *
1001  * Note:      It's O(1) memory to check for key duplications
1002  *            here, after keys are sorted, compared to O(N) in
1003  *            <esl_newssi_AddKey()>, where we would have to maintain a
1004  *            hash of all previous N keys in memory.
1005  */
1006 int
esl_newssi_Write(ESL_NEWSSI * ns)1007 esl_newssi_Write(ESL_NEWSSI *ns)
1008 {
1009   int      status, 		/* convention                               */
1010            i;			/* counter over files, keys                 */
1011   uint32_t header_flags,	/* bitflags in the header                   */
1012            file_flags,		/* bitflags for a file record               */
1013            frecsize, 		/* size of a file record (bytes)            */
1014            precsize, 		/* size of a primary key record (bytes)     */
1015            srecsize;		/* size of a secondary key record (bytes)   */
1016   off_t    foffset, 		/* offset to file section                   */
1017            poffset, 		/* offset to primary key section            */
1018            soffset;		/* offset to secondary key section          */
1019   char    *fk       = NULL,     /* fixed-width (flen) file name             */
1020           *pk       = NULL, 	/* fixed-width (plen) primary key string    */
1021           *sk       = NULL,	/* fixed-width (slen) secondary key string  */
1022           *buf      = NULL;	/* esl_fgets() growable buffer              */
1023   int      n        = 0;	/* esl_fgets() buffer size                  */
1024   ESL_PKEY pkey;		/* primary key info from external tmpfile   */
1025   ESL_SKEY skey;		/* secondary key info from external tmpfile */
1026 
1027   if (ns->nsecondary > 0 && ns->slen == 0)
1028     ESL_EXCEPTION(eslEINVAL, "zero secondary key length: shouldn't happen");
1029   if (ns->ssifp == NULL)
1030     ESL_EXCEPTION(eslEINVAL, "SSI data were already written.");
1031 
1032   /* We need fixed-width buffers to get our keys fwrite()'ten in their
1033    * full binary lengths; pkey->key (for instance) is not guaranteed
1034    * to be allocated for the final maximum plen. We use strncpy(), not
1035    * strcpy(), to fill these buffers, because strncpy() pads unused
1036    * bytes as NUL's, and valgrind will flag you if you attempt to
1037    * write uninitialized bytes from these buffers.
1038    */
1039   ESL_ALLOC(fk,   sizeof(char) * ns->flen);
1040   ESL_ALLOC(pk,   ESL_MAX(1, sizeof(char) * ns->plen));
1041   if (ns->nsecondary > 0) ESL_ALLOC(sk, sizeof(char) * ns->slen);
1042 
1043   /* How big is the index? If it's going to be > 2GB, we better have
1044    * 64-bit offsets. (2047 (instead of 2048) gives us
1045    * some slop room.) If not, abort here.
1046    *
1047    * aborting here is pretty brutal - we've processed hundreds of
1048    * millions of keys for nothing. Ah well.
1049    */
1050   if (current_newssi_size(ns) >= 2047 && sizeof(off_t) != 8)
1051     ESL_XFAIL(eslERANGE, ns->errbuf, "SSI index file file would be > 2G; your filesystem isn't capable of handling it");
1052 
1053   /* Magic-looking numbers come from adding up sizes
1054    * of things in bytes: they match current_newssi_size().
1055    */
1056   frecsize     = 4*sizeof(uint32_t) + ns->flen;
1057   precsize     = 2*sizeof(off_t) + sizeof(uint16_t) + sizeof(uint64_t) + ns->plen;
1058   srecsize     = ns->slen + ns->plen;
1059   header_flags = 0;
1060 
1061   /* Magic-looking numbers again come from adding up sizes
1062    * of things in bytes: matches current_newssi_size()
1063    */
1064   foffset = 9*sizeof(uint32_t)+2*sizeof(uint64_t)+sizeof(uint16_t)+3*sizeof(off_t);
1065   poffset = foffset + frecsize*ns->nfiles;
1066   soffset = poffset + precsize*ns->nprimary;
1067 
1068   /* Sort the keys.
1069    * If external mode, make system calls to UNIX/POSIX "sort" in place, then
1070    * open new sorted files for reading thru ptmp and stmp handles.
1071    * If internal mode, call qsort.
1072    *
1073    * Note that you'd better force a POSIX locale for the sort; else,
1074    * some silly distro (e.g. Mandrake Linux >=8.1) may have specified
1075    * LC_COLLATE=en_US, and this'll give a sort "bug" in which it doesn't
1076    * sort by byte order.
1077    */
1078   if (ns->external)
1079     {
1080       char cmd[1024];
1081 
1082       /* A last minute security check: make sure we won't overflow
1083        * sprintf() with the tmpfile names. They're hardcoded now, so
1084        * we know they don't overflow, but they might be configurable
1085        * in the future, and we wouldn't want a security hole to open
1086        * up.
1087        */
1088       if (strlen(ns->ptmpfile) > 256 || strlen(ns->ptmpfile) > 256)
1089 	ESL_XEXCEPTION(eslEINVAL, "tmpfile name too long");
1090 
1091       fclose(ns->ptmp);
1092       ns->ptmp = NULL;
1093       sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", ns->ptmpfile, ns->ptmpfile);
1094       if (system(cmd) != 0)                              ESL_XFAIL(eslESYS, ns->errbuf, "external sort of primary keys failed");
1095       if ((ns->ptmp = fopen(ns->ptmpfile, "r")) == NULL) ESL_XFAIL(eslESYS, ns->errbuf, "failed to reopen primary key tmp file after sort");
1096 
1097       fclose(ns->stmp);
1098       ns->stmp = NULL;
1099       sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", ns->stmpfile, ns->stmpfile);
1100       if (system(cmd) != 0)                              ESL_XFAIL(eslESYS, ns->errbuf, "external sort of secondary keys failed");
1101       if ((ns->stmp = fopen(ns->stmpfile, "r")) == NULL) ESL_XFAIL(eslESYS, ns->errbuf, "failed to reopen secondary key tmp file after sort");
1102     }
1103   else
1104     {
1105       qsort((void *) ns->pkeys, ns->nprimary,   sizeof(ESL_PKEY), pkeysort);
1106       qsort((void *) ns->skeys, ns->nsecondary, sizeof(ESL_SKEY), skeysort);
1107     }
1108 
1109   /* Write the header
1110    */
1111   if (esl_fwrite_u32(ns->ssifp, v30magic)      != eslOK ||
1112       esl_fwrite_u32(ns->ssifp, header_flags)  != eslOK ||
1113       esl_fwrite_u32(ns->ssifp, sizeof(off_t)) != eslOK ||
1114       esl_fwrite_u16(ns->ssifp, ns->nfiles)    != eslOK ||
1115       esl_fwrite_u64(ns->ssifp, ns->nprimary)  != eslOK ||
1116       esl_fwrite_u64(ns->ssifp, ns->nsecondary)!= eslOK ||
1117       esl_fwrite_u32(ns->ssifp, ns->flen)      != eslOK ||
1118       esl_fwrite_u32(ns->ssifp, ns->plen)      != eslOK ||
1119       esl_fwrite_u32(ns->ssifp, ns->slen)      != eslOK ||
1120       esl_fwrite_u32(ns->ssifp, frecsize)      != eslOK ||
1121       esl_fwrite_u32(ns->ssifp, precsize)      != eslOK ||
1122       esl_fwrite_u32(ns->ssifp, srecsize)      != eslOK ||
1123       esl_fwrite_offset(ns->ssifp, foffset)    != eslOK ||
1124       esl_fwrite_offset(ns->ssifp, poffset)    != eslOK ||
1125       esl_fwrite_offset(ns->ssifp, soffset)    != eslOK)
1126     ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1127 
1128   /* Write the file section
1129    */
1130   for (i = 0; i < ns->nfiles; i++)
1131     {
1132       file_flags = 0;
1133       if (ns->bpl[i] > 0 && ns->rpl[i] > 0) file_flags |= eslSSI_FASTSUBSEQ;
1134       strncpy(fk, ns->filenames[i], ns->flen);
1135 
1136       if (fwrite(fk, sizeof(char), ns->flen, ns->ssifp) != ns->flen ||
1137 	  esl_fwrite_u32(ns->ssifp, ns->fileformat[i])  != eslOK    ||
1138 	  esl_fwrite_u32(ns->ssifp, file_flags)         != eslOK    ||
1139 	  esl_fwrite_u32(ns->ssifp, ns->bpl[i])         != eslOK    ||
1140 	  esl_fwrite_u32(ns->ssifp, ns->rpl[i])         != eslOK)
1141 	ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1142     }
1143 
1144   /* Write the primary key section
1145    */
1146   if (ns->external)
1147     {
1148       if (ns->nprimary) strncpy(pk, "", ns->plen);
1149       for (i = 0; i < ns->nprimary; i++)
1150 	{
1151 	  if (esl_fgets(&buf, &n, ns->ptmp)  != eslOK)    ESL_XFAIL(eslESYS, ns->errbuf, "read from sorted primary key tmpfile failed");
1152 	  if (parse_pkey(buf, &pkey)         != eslOK)    ESL_XFAIL(eslESYS, ns->errbuf, "parse failed for a line of sorted primary key tmpfile failed");
1153           if (strcmp(pk, pkey.key)           == 0)        ESL_XFAIL(eslEDUP, ns->errbuf, "primary keys not unique: '%s' occurs more than once", pkey.key);
1154 	  strncpy(pk, pkey.key, ns->plen);   // strncpy() pads w/ nulls, and we count on that behavior.
1155 
1156 	  if (fwrite(pk,sizeof(char),ns->plen,ns->ssifp) != ns->plen ||
1157 	      esl_fwrite_u16(   ns->ssifp, pkey.fnum)    != eslOK    ||
1158 	      esl_fwrite_offset(ns->ssifp, pkey.r_off)   != eslOK    ||
1159 	      esl_fwrite_offset(ns->ssifp, pkey.d_off)   != eslOK    ||
1160 	      esl_fwrite_i64(   ns->ssifp, pkey.len)     != eslOK)
1161 	    ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1162 	}
1163     }
1164   else
1165     {
1166       if (ns->nprimary) strncpy(pk, "", ns->plen);
1167       for (i = 0; i < ns->nprimary; i++)
1168 	{
1169           if (strcmp(pk, ns->pkeys[i].key)  == 0)  ESL_XFAIL(eslEDUP, ns->errbuf, "primary keys not unique: '%s' occurs more than once", ns->pkeys[i].key);
1170 	  strncpy(pk, ns->pkeys[i].key, ns->plen);
1171 
1172 	  if (fwrite(pk,sizeof(char),ns->plen,ns->ssifp)       != ns->plen ||
1173 	      esl_fwrite_u16(   ns->ssifp, ns->pkeys[i].fnum)  != eslOK    ||
1174 	      esl_fwrite_offset(ns->ssifp, ns->pkeys[i].r_off) != eslOK    ||
1175 	      esl_fwrite_offset(ns->ssifp, ns->pkeys[i].d_off) != eslOK    ||
1176 	      esl_fwrite_i64(   ns->ssifp, ns->pkeys[i].len)   != eslOK)
1177 	    ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1178 	}
1179     }
1180 
1181 
1182   /* Write the secondary key section
1183    */
1184   if (ns->external)
1185     {
1186       if (ns->nsecondary) strncpy(sk, "", ns->slen);
1187       for (i = 0; i < ns->nsecondary; i++)
1188 	{
1189 	  if (esl_fgets(&buf, &n, ns->stmp) != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "read from sorted secondary key tmpfile failed");
1190 	  if (parse_skey(buf, &skey)        != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "parse failed for a line of sorted secondary key tmpfile failed");
1191           if (strcmp(sk, skey.key)          == 0)     ESL_XFAIL(eslEDUP, ns->errbuf, "secondary keys not unique: '%s' occurs more than once", skey.key);
1192 	  strncpy(sk, skey.key,  ns->slen);  // slen > 0 if there are any secondary keys.
1193 	  strncpy(pk, skey.pkey, ns->plen);
1194 
1195 	  if (fwrite(sk, sizeof(char), ns->slen, ns->ssifp) != ns->slen ||
1196 	      fwrite(pk, sizeof(char), ns->plen, ns->ssifp) != ns->plen)
1197 	    ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1198 	}
1199     }
1200   else
1201     {
1202       /* if ns->nsecondary=0, ns->slen=0 and sk=NULL */
1203       if (ns->nsecondary) strncpy(sk, "", ns->slen);
1204       for (i = 0; i < ns->nsecondary; i++)
1205 	{
1206           if (strcmp(sk, ns->skeys[i].key) == 0) ESL_XFAIL(eslEDUP, ns->errbuf, "secondary keys not unique: '%s' occurs more than once", ns->skeys[i].key);
1207 	  strncpy(sk, ns->skeys[i].key,  ns->slen);
1208 	  strncpy(pk, ns->skeys[i].pkey, ns->plen);
1209 
1210 	  if (fwrite(sk, sizeof(char), ns->slen, ns->ssifp) != ns->slen ||
1211 	      fwrite(pk, sizeof(char), ns->plen, ns->ssifp) != ns->plen)
1212 	    ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1213 	}
1214     }
1215 
1216   fclose(ns->ssifp);                // Closing <ssifp> makes it so we can only _Write() once.
1217   ns->ssifp = NULL;
1218   if (fk)       free(fk);
1219   if (pk)       free(pk);
1220   if (sk)       free(sk);
1221   if (buf)      free(buf);
1222   if (ns->ptmp) { fclose(ns->ptmp); ns->ptmp = NULL; }
1223   if (ns->stmp) { fclose(ns->stmp); ns->stmp = NULL; }
1224   return eslOK;
1225 
1226  ERROR:
1227   remove(ns->ssifile);               // Cleanup: delete failed <ssifile> on any error.
1228   if (ns->ssifp) { fclose(ns->ssifp); ns->ssifp = NULL; }
1229   if (fk)        free(fk);
1230   if (pk)        free(pk);
1231   if (sk)        free(sk);
1232   if (buf)       free(buf);
1233   if (ns->ptmp)  { fclose(ns->ptmp); ns->ptmp = NULL; }
1234   if (ns->stmp)  { fclose(ns->stmp); ns->stmp = NULL; }
1235   return status;
1236 }
1237 
1238 /* Function:  esl_newssi_Close()
1239  * Synopsis:  Free an <ESL_NEWSSI>.
1240  *
1241  * Purpose:   Frees a <ESL_NEWSSI>.
1242  */
1243 void
esl_newssi_Close(ESL_NEWSSI * ns)1244 esl_newssi_Close(ESL_NEWSSI *ns)
1245 {
1246   int i;
1247   if (ns == NULL) return;
1248 
1249   if (ns->external == FALSE)
1250     {
1251       if (ns->pkeys != NULL)
1252 	{
1253 	  for (i = 0; i < ns->nprimary; i++)
1254 	    if (ns->pkeys[i].key != NULL) free(ns->pkeys[i].key);
1255 	  free(ns->pkeys);
1256 	}
1257       if (ns->skeys != NULL)
1258 	{
1259 	  for (i = 0; i < ns->nsecondary; i++)
1260 	    {
1261 	      if (ns->skeys[i].key  != NULL) free(ns->skeys[i].key);
1262 	      if (ns->skeys[i].pkey != NULL) free(ns->skeys[i].pkey);
1263 	    }
1264 	  free(ns->skeys);
1265 	}
1266     }
1267   else
1268     {
1269       remove(ns->ptmpfile);
1270       remove(ns->stmpfile);
1271     }
1272 
1273   if (ns->filenames   != NULL)
1274     {
1275       for (i = 0; i < ns->nfiles; i++)
1276 	if (ns->filenames[i] != NULL) free(ns->filenames[i]);
1277       free(ns->filenames);
1278     }
1279 
1280   if (ns->stmp)       fclose(ns->stmp);
1281   if (ns->stmpfile)   free(ns->stmpfile);
1282   if (ns->ptmp)       fclose(ns->ptmp);
1283   if (ns->ptmpfile)   free(ns->ptmpfile);
1284   if (ns->fileformat) free(ns->fileformat);
1285   if (ns->bpl)        free(ns->bpl);
1286   if (ns->rpl)        free(ns->rpl);
1287   if (ns->ssifile)    free(ns->ssifile);
1288   if (ns->ssifp)      fclose(ns->ssifp);
1289   free(ns);
1290 }
1291 
1292 
1293 
1294 
1295 /* current_newssi_size()
1296  *
1297  * Calculates the size of the current index, in megabytes, in
1298  * its disk version (which is essentially the same as the
1299  * RAM it takes, modulo some small overhead for the structures
1300  * and ptrs).
1301  *
1302  * The header costs 10 uint32, 1 uint16, and 3 off_t's: 42 + (12 | 24).
1303  * Each file record costs 4 uint32 and flen chars;
1304  * each primary key costs us 2 off_t, 1 uint16, 1 uint32, and plen chars;
1305  * each sec key costs us  plen+slen chars.
1306  */
1307 static int
current_newssi_size(const ESL_NEWSSI * ns)1308 current_newssi_size(const ESL_NEWSSI *ns)
1309 {
1310   uint64_t frecsize, precsize, srecsize;
1311   uint64_t total;
1312 
1313   /* Magic-looking numbers come from adding up sizes
1314    * of things in bytes
1315    */
1316   frecsize = 4*sizeof(uint32_t) + ns->flen;
1317   precsize = 2*sizeof(off_t) + sizeof(uint16_t) + sizeof(uint64_t) + ns->plen;
1318   srecsize = ns->slen + ns->plen;
1319   total = (9*sizeof(uint32_t)+2*sizeof(uint64_t)+sizeof(uint16_t)+3*sizeof(off_t)+
1320 	   frecsize * ns->nfiles +      /* file section size                   */
1321 	   precsize * ns->nprimary +    /* primary key section size            */
1322 	   srecsize * ns->nsecondary) / /* secondary key section size          */
1323           1048576L;
1324   return (int) total;
1325 }
1326 
1327 /* activate_external_sort()
1328  *
1329  * Switch to external sort mode.
1330  * Open file handles for external index files (ptmp, stmp).
1331  * Flush current index information to these files.
1332  * Free current memory, turn over control to the tmpfiles.
1333  *
1334  * Return <eslOK>        on success;
1335  *        <eslENOTFOUND> if we can't open a tmpfile for writing.
1336  *
1337  * Throw  <eslEWRITE>    if a write fails.
1338  */
1339 static int
activate_external_sort(ESL_NEWSSI * ns)1340 activate_external_sort(ESL_NEWSSI *ns)
1341 {
1342   int status;
1343   int i;
1344 
1345   if (ns->external)                   return eslOK; /* we already are external, fool */
1346 
1347   if ((ns->ptmp = fopen(ns->ptmpfile, "w")) == NULL) ESL_XFAIL(eslENOTFOUND, ns->errbuf, "Failed to open primary key tmpfile for external sort");
1348   if ((ns->stmp = fopen(ns->stmpfile, "w")) == NULL) ESL_XFAIL(eslENOTFOUND, ns->errbuf, "Failed to open secondary key tmpfile for external sort");
1349 
1350   /* Flush the current indices. */
1351   for (i = 0; i < ns->nprimary; i++) {
1352     if (sizeof(off_t) == 4) {
1353       if (fprintf(ns->ptmp, "%s\t%u\t%lu\t%lu\t%lu\n",
1354 		  ns->pkeys[i].key,
1355 		  (unsigned int)  ns->pkeys[i].fnum,
1356 		  (unsigned long) ns->pkeys[i].r_off,
1357 		  (unsigned long) ns->pkeys[i].d_off,
1358 		  (unsigned long) ns->pkeys[i].len) <= 0)
1359 	ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
1360     } else {
1361       if (fprintf(ns->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n",
1362 		  ns->pkeys[i].key,
1363 		  (unsigned int)       ns->pkeys[i].fnum,
1364 		  (unsigned long long) ns->pkeys[i].r_off,
1365 		  (unsigned long long) ns->pkeys[i].d_off,
1366 		  (unsigned long)      ns->pkeys[i].len) <= 0)
1367 	ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
1368     }
1369   }
1370   for (i = 0; i < ns->nsecondary; i++)
1371     if (fprintf(ns->stmp, "%s\t%s\n", ns->skeys[i].key, ns->skeys[i].pkey) <= 0)
1372       ESL_XEXCEPTION_SYS(eslEWRITE, "ssi alias tmp file write failed");
1373 
1374   /* Free the memory now that we've flushed our lists to disk
1375    */
1376   for (i = 0; i < ns->nprimary;   i++) free(ns->pkeys[i].key);
1377   for (i = 0; i < ns->nsecondary; i++) free(ns->skeys[i].key);
1378   for (i = 0; i < ns->nsecondary; i++) free(ns->skeys[i].pkey);
1379   if (ns->pkeys != NULL) free(ns->pkeys);
1380   if (ns->skeys != NULL) free(ns->skeys);
1381   ns->pkeys    = NULL;
1382   ns->skeys    = NULL;
1383   ns->external = TRUE;
1384   return eslOK;
1385 
1386  ERROR:
1387   if (ns->ptmp != NULL) { fclose(ns->ptmp); ns->ptmp = NULL; }
1388   if (ns->stmp != NULL) { fclose(ns->stmp); ns->stmp = NULL; }
1389   return status;
1390 }
1391 
1392 /* parse_pkey(), parse_skey()
1393  *
1394  * Given a <buf> containing a line read from the external
1395  * primary-key or secondary-key tmpfile; parse it, and fill in the fields of
1396  * <pkey> or <skey>
1397  *
1398  * <?key> is a ptr to a structure on the stack. It is assumed
1399  * to be in use only transiently.
1400  * <?key>'s strings become ptrs into <buf>'s space, so we don't have to
1401  * allocate new space for them. This means that the transient <?key> structure
1402  * is only usable until <buf> is modified or free'd.
1403  *
1404  * Returns <eslOK> on success.
1405  *
1406  * Throws  <eslEFORMAT>        on parse error (shouldn't happen; we created it!)
1407  *         <eslEINCONCEIVABLE> if we can't deal with off_t's size.
1408  */
1409 static int
parse_pkey(char * buf,ESL_PKEY * pkey)1410 parse_pkey(char *buf, ESL_PKEY *pkey)
1411 {
1412   int   status;
1413   char *s, *tok;
1414 
1415   s = buf;
1416   if (esl_strtok(&s, "\t\n", &(pkey->key)) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1417   if (esl_strtok(&s, "\t\n", &tok)         != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1418 
1419   pkey->fnum = (uint16_t) atoi(tok);
1420   if (esl_strtok(&s, "\t\n", &tok)         != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1421   if      (sizeof(off_t) == 4) pkey->r_off  = (off_t) strtoul (tok, NULL, 10);
1422   else if (sizeof(off_t) == 8) pkey->r_off  = (off_t) strtoull(tok, NULL, 10);
1423   else                         ESL_XEXCEPTION(eslEINCONCEIVABLE, "whoa - weird off_t");
1424 
1425   if (esl_strtok(&s, "\t\n", &tok)         != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1426   if      (sizeof(off_t) == 4) pkey->d_off  = (off_t) strtoul (tok, NULL, 10);
1427   else if (sizeof(off_t) == 8) pkey->d_off  = (off_t) strtoull(tok, NULL, 10);
1428   else                         ESL_XEXCEPTION(eslEINCONCEIVABLE, "whoa - weird off_t");
1429 
1430   if (esl_strtok(&s, "\t\n", &tok)         != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1431   pkey->len = (uint64_t) strtoull(tok, NULL, 10);
1432   return eslOK;
1433 
1434  ERROR:
1435   return status;
1436 }
1437 static int
parse_skey(char * buf,ESL_SKEY * skey)1438 parse_skey(char *buf, ESL_SKEY *skey)
1439 {
1440   int   status;
1441   char *s;
1442 
1443   s = buf;
1444   if (esl_strtok(&s, "\t\n", &(skey->key))  != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1445   if (esl_strtok(&s, "\t\n", &(skey->pkey)) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1446   return eslOK;
1447 
1448  ERROR:
1449   return status;
1450 }
1451 
1452 /* ordering functions needed for qsort() */
1453 static int
pkeysort(const void * k1,const void * k2)1454 pkeysort(const void *k1, const void *k2)
1455 {
1456   ESL_PKEY *key1;
1457   ESL_PKEY *key2;
1458   key1 = (ESL_PKEY *) k1;
1459   key2 = (ESL_PKEY *) k2;
1460   return strcmp(key1->key, key2->key);
1461 }
1462 static int
skeysort(const void * k1,const void * k2)1463 skeysort(const void *k1, const void *k2)
1464 {
1465   ESL_SKEY *key1;
1466   ESL_SKEY *key2;
1467   key1 = (ESL_SKEY *) k1;
1468   key2 = (ESL_SKEY *) k2;
1469   return strcmp(key1->key, key2->key);
1470 }
1471 
1472 
1473 /*****************************************************************
1474  *# 3. Portable binary i/o
1475  *****************************************************************/
1476 
1477 /* Function:  esl_byteswap()
1478  * Synopsis:  Swap between big-endian and little-endian, in place.
1479  *
1480  * Purpose:   Swap between big-endian and little-endian, in place.
1481  */
1482 void
esl_byteswap(char * swap,int nbytes)1483 esl_byteswap(char *swap, int nbytes)
1484 {
1485   int  x;
1486   char byte;
1487 
1488   for (x = 0; x < nbytes / 2; x++)
1489     {
1490       byte = swap[nbytes - x - 1];
1491       swap[nbytes - x - 1] = swap[x];
1492       swap[x] = byte;
1493     }
1494 }
1495 
1496 /* Function:  esl_ntoh16()
1497  * Synopsis:  Convert 2-byte integer from network-order to host-order.
1498  *
1499  * Purpose:   Convert a 2-byte integer from network-order to host-order,
1500  *            and return it.
1501  *
1502  *            <esl_ntoh32()> and <esl_ntoh64()> do the same, but for 4-byte
1503  *            and 8-byte integers, respectively.
1504  */
1505 uint16_t
esl_ntoh16(uint16_t netshort)1506 esl_ntoh16(uint16_t netshort)
1507 {
1508 #ifdef WORDS_BIGENDIAN
1509   return netshort;
1510 #else
1511   esl_byteswap((char *) &netshort, 2);
1512   return netshort;
1513 #endif
1514 }
1515 uint32_t
esl_ntoh32(uint32_t netlong)1516 esl_ntoh32(uint32_t netlong)
1517 {
1518 #ifdef WORDS_BIGENDIAN
1519   return netlong;
1520 #else
1521   esl_byteswap((char *) &netlong, 4);
1522   return netlong;
1523 #endif
1524 }
1525 uint64_t
esl_ntoh64(uint64_t net_int64)1526 esl_ntoh64(uint64_t net_int64)
1527 {
1528 #ifdef WORDS_BIGENDIAN
1529   return net_int64;
1530 #else
1531   esl_byteswap((char *) &net_int64, 8);
1532   return net_int64;
1533 #endif
1534 }
1535 
1536 /* Function:  esl_hton16()
1537  * Synopsis:  Convert 2-byte integer from host-order to network-order.
1538  *
1539  * Purpose:   Convert a 2-byte integer from host-order to network-order, and
1540  *            return it.
1541  *
1542  *            <esl_hton32()> and <esl_hton64()> do the same, but for 4-byte
1543  *            and 8-byte integers, respectively.
1544  */
1545 uint16_t
esl_hton16(uint16_t hostshort)1546 esl_hton16(uint16_t hostshort)
1547 {
1548 #ifdef WORDS_BIGENDIAN
1549   return hostshort;
1550 #else
1551   esl_byteswap((char *) &hostshort, 2);
1552   return hostshort;
1553 #endif
1554 }
1555 uint32_t
esl_hton32(uint32_t hostlong)1556 esl_hton32(uint32_t hostlong)
1557 {
1558 #ifdef WORDS_BIGENDIAN
1559   return hostlong;
1560 #else
1561   esl_byteswap((char *) &hostlong, 4);
1562   return hostlong;
1563 #endif
1564 }
1565 uint64_t
esl_hton64(uint64_t host_int64)1566 esl_hton64(uint64_t host_int64)
1567 {
1568 #ifdef WORDS_BIGENDIAN
1569   return host_int64;
1570 #else
1571   esl_byteswap((char *) &host_int64, 8);
1572   return host_int64;
1573 #endif
1574 }
1575 
1576 
1577 /* Function:  esl_fread_u16()
1578  * Synopsis:  Read network-order integer from a stream.
1579  *
1580  * Purpose:   Read a 2-byte network-order integer from <fp>, convert to
1581  *            host order, leave it in <ret_result>.
1582  *
1583  *            <esl_fread_u32()> and <esl_fread_u64()> do the same, but
1584  *            for 4-byte and 8-byte integers, respectively.
1585  *
1586  * Returns:   <eslOK> on success, and <eslFAIL> on <fread()> failure.
1587  */
1588 int
esl_fread_u16(FILE * fp,uint16_t * ret_result)1589 esl_fread_u16(FILE *fp, uint16_t *ret_result)
1590 {
1591   uint16_t result;
1592   if (fread(&result, sizeof(uint16_t), 1, fp) != 1) return eslFAIL;
1593   *ret_result = esl_ntoh16(result);
1594   return eslOK;
1595 }
1596 int
esl_fread_u32(FILE * fp,uint32_t * ret_result)1597 esl_fread_u32(FILE *fp, uint32_t *ret_result)
1598 {
1599   uint32_t result;
1600   if (fread(&result, sizeof(uint32_t), 1, fp) != 1) return eslFAIL;
1601   *ret_result = esl_ntoh32(result);
1602   return eslOK;
1603 }
1604 int
esl_fread_u64(FILE * fp,uint64_t * ret_result)1605 esl_fread_u64(FILE *fp, uint64_t *ret_result)
1606 {
1607   uint64_t result;
1608   if (fread(&result, sizeof(uint64_t), 1, fp) != 1) return eslFAIL;
1609   *ret_result = esl_ntoh64(result);
1610   return eslOK;
1611 }
1612 int
esl_fread_i16(FILE * fp,int16_t * ret_result)1613 esl_fread_i16(FILE *fp, int16_t *ret_result)
1614 {
1615   int16_t result;
1616   if (fread(&result, sizeof(int16_t), 1, fp) != 1) return eslFAIL;
1617   *ret_result = (int16_t) esl_ntoh16((uint16_t) result);
1618   return eslOK;
1619 }
1620 int
esl_fread_i32(FILE * fp,int32_t * ret_result)1621 esl_fread_i32(FILE *fp, int32_t *ret_result)
1622 {
1623   int32_t result;
1624   if (fread(&result, sizeof(int32_t), 1, fp) != 1) return eslFAIL;
1625   *ret_result = (int32_t) esl_ntoh32((uint32_t) result);
1626   return eslOK;
1627 }
1628 int
esl_fread_i64(FILE * fp,int64_t * ret_result)1629 esl_fread_i64(FILE *fp, int64_t *ret_result)
1630 {
1631   int64_t result;
1632   if (fread(&result, sizeof(int64_t), 1, fp) != 1) return eslFAIL;
1633   *ret_result = (int64_t) esl_ntoh64((uint64_t) result);
1634   return eslOK;
1635 }
1636 
1637 
1638 /* Function:  esl_fwrite_u16()
1639  * Synopsis:  Write an integer to a stream in network-order.
1640  *
1641  * Purpose:   Write a 2-byte host-order integer <n> to stream <fp>
1642  *            in network order.
1643  *
1644  *            <esl_fwrite_u32()> and <esl_fwrite_u64()> do the same, but
1645  *            for 4-byte and 8-byte integers, respectively.
1646  *
1647  * Returns:   <eslOK> on success, and <eslFAIL> on <fwrite()> failure.
1648  */
1649 int
esl_fwrite_u16(FILE * fp,uint16_t n)1650 esl_fwrite_u16(FILE *fp, uint16_t n)
1651 {
1652   n = esl_hton16(n);
1653   if (fwrite(&n, sizeof(uint16_t), 1, fp) != 1) return eslFAIL;
1654   return eslOK;
1655 }
1656 int
esl_fwrite_u32(FILE * fp,uint32_t n)1657 esl_fwrite_u32(FILE *fp, uint32_t n)
1658 {
1659   n = esl_hton32(n);
1660   if (fwrite(&n, sizeof(uint32_t), 1, fp) != 1) return eslFAIL;
1661   return eslOK;
1662 }
1663 int
esl_fwrite_u64(FILE * fp,uint64_t n)1664 esl_fwrite_u64(FILE *fp, uint64_t n)
1665 {
1666   n = esl_hton64(n);
1667   if (fwrite(&n, sizeof(uint64_t), 1, fp) != 1) return eslFAIL;
1668   return eslOK;
1669 }
1670 int
esl_fwrite_i16(FILE * fp,int16_t n)1671 esl_fwrite_i16(FILE *fp, int16_t n)
1672 {
1673   n = (int16_t) esl_hton16((uint16_t) n);
1674   if (fwrite(&n, sizeof(int16_t), 1, fp) != 1) return eslFAIL;
1675   return eslOK;
1676 }
1677 int
esl_fwrite_i32(FILE * fp,int32_t n)1678 esl_fwrite_i32(FILE *fp, int32_t n)
1679 {
1680   n = (int32_t) esl_hton32((uint32_t) n);
1681   if (fwrite(&n, sizeof(int32_t), 1, fp) != 1) return eslFAIL;
1682   return eslOK;
1683 }
1684 int
esl_fwrite_i64(FILE * fp,int64_t n)1685 esl_fwrite_i64(FILE *fp, int64_t n)
1686 {
1687   n = (int64_t) esl_hton64((uint64_t) n);
1688   if (fwrite(&n, sizeof(int64_t), 1, fp) != 1) return eslFAIL;
1689   return eslOK;
1690 }
1691 
1692 
1693 /* Function:  esl_fread_offset()
1694  * Synopsis:  Read an offset portably.
1695  *
1696  * Purpose:   Read a file offset from the stream <fp> (which would usually
1697  *            be a save file), and store it in <ret_offset>.
1698  *
1699  *            Offsets may have been saved by a different machine
1700  *            than the machine that reads them. The writer and the reader
1701  *            may differ in byte order and in width (<sizeof(off_t)>).
1702  *
1703  *            Byte order is dealt with by saving offsets in
1704  *            network byte order, and converting them to host byte order
1705  *            when they are read (if necessary).
1706  *
1707  *            Width is dealt with by the <sz> argument, which must be
1708  *            either 4 or 8, specifying that the saved offset is a
1709  *            32-bit versus 64-bit <off_t>. If the reading host
1710  *            <off_t> width matches the <sz> of the writer, no
1711  *            problem. If <sz> is 4 but the reading host has 64-bit
1712  *            <off_t>'s, this is also no problem; the conversion
1713  *            always works. If <sz> is 64 but the reading host has
1714  *            only 32-bit <off_t>, we cannot guarantee that we have
1715  *            sufficient dynamic range to represent the offset; if
1716  *            the stored offset is too large to represent in a 32-bit
1717  *            offset, we throw a fatal <eslEINCOMPAT> error.
1718  *
1719  * Returns:   <eslOK> on success; <eslFAIL> on a read failure.
1720  *
1721  * Throws:    <eslEINVAL> if <sz> is something other than 4 or 8;
1722  *            <eslEINCOMPAT> if the stored offset is too large for
1723  *            the reader to represent (the machine that wrote the
1724  *            SSI file used 64 bit offsets, the reader uses 32
1725  *            bit offsets, and this offset is too large to represent
1726  *            in a 32 bit offset).
1727  */
1728 int
esl_fread_offset(FILE * fp,int sz,off_t * ret_offset)1729 esl_fread_offset(FILE *fp, int sz, off_t *ret_offset)
1730 {
1731   int       status;
1732   uint32_t  x32;
1733   uint64_t  x64;
1734 
1735   if      (sz == 8)
1736     {
1737       if (esl_fread_u64(fp, &x64) != eslOK) { status = eslFAIL; goto ERROR; }
1738       if (sizeof(off_t) == 4 && x64 > INT32_MAX)
1739 	ESL_XEXCEPTION(eslEINCOMPAT, "can't read 64-bit off_t on this 32-bit host");
1740       *ret_offset = (off_t) x64;
1741     }
1742   else if (sz == 4)
1743     {
1744       if (esl_fread_u32(fp, &x32) != eslOK) { status = eslFAIL; goto ERROR; }
1745       *ret_offset = (off_t) x32;
1746     }
1747   else ESL_XEXCEPTION(eslEINVAL, "offsets must be 32 or 64 bits");
1748   return eslOK;
1749 
1750  ERROR:
1751   *ret_offset = 0;
1752   return status;
1753 }
1754 
1755 /* Function:  esl_fwrite_offset()
1756  * Synopsis:  Write an offset portably.
1757  *
1758  * Purpose:   Portably write (save) <offset> to the stream <fp>, in network
1759  *            byte order.
1760  *
1761  * Returns:   <eslOK> on success; <eslFAIL> on write failure.
1762  *
1763  * Throws:    <eslEINVAL> if <off_t> is something other than a 32-bit or
1764  *            64-bit integer on this machine, in which case we don't know
1765  *            how to deal with it portably.
1766  */
1767 int
esl_fwrite_offset(FILE * fp,off_t offset)1768 esl_fwrite_offset(FILE *fp, off_t offset)
1769 {
1770   if      (sizeof(off_t) == 4) return esl_fwrite_u32(fp, offset);
1771   else if (sizeof(off_t) == 8) return esl_fwrite_u64(fp, offset);
1772   else ESL_EXCEPTION(eslEINVAL, "off_t is neither 32-bit nor 64-bit");
1773   /*UNREACHED*/
1774   return eslEINCONCEIVABLE;
1775 }
1776 
1777 
1778 /*****************************************************************
1779  * 5. Unit tests
1780  *****************************************************************/
1781 #ifdef eslSSI_TESTDRIVE
1782 
1783 #include "esl_arr2.h"
1784 #include "esl_getopts.h"
1785 #include "esl_sq.h"
1786 #include "esl_sqio.h"
1787 #include "esl_random.h"
1788 #include "esl_randomseq.h"
1789 
1790 struct ssi_testdata {
1791   int    nfiles;        // generate this many files...
1792   int    nseq;          //  ... with this many sequences per file ...
1793   int    maxL;          //  ... with sequence lengths 1..maxL.
1794 
1795   char **sqfile;        // seq file names,   [0..nfiles-1]
1796   char **seqname;       // seq names,        [0..nfiles*nseq-1] = "seq%d-file%d"
1797   char **seqdesc;       // seq descriptions, [0..nfiles*nseq-1] = "desc%d-file%d", used as secondary keys
1798   int   *seqlen;        // seq lengths,      [0..nfiles*nseq-1]
1799   char **seq;
1800 };
1801 
1802 static struct ssi_testdata *
ssi_testdata_create(ESL_RANDOMNESS * rng,int max_nfiles,int max_nseq,int maxL,int do_dupname)1803 ssi_testdata_create(ESL_RANDOMNESS *rng, int max_nfiles, int max_nseq, int maxL, int do_dupname)
1804 {
1805   char   msg[] = "esl_ssi test data creation failed";
1806   struct ssi_testdata *td;
1807   double  p[4] = { 0.25, 0.25, 0.25, 0.25 };   // composition of random generated DNA seqs
1808   FILE   *fp;
1809   ESL_SQ *sq;
1810   int     i,j;
1811 
1812   if ( (td = malloc(sizeof(struct ssi_testdata))) == NULL) esl_fatal(msg);
1813   td->nfiles        = 1 + esl_rnd_Roll(rng, max_nfiles); // 1..max_nfiles
1814   td->nseq          = 2 + esl_rnd_Roll(rng, max_nseq-1); // 2..max_nseq   Need at least 2 for dup test to always be valid
1815   td->maxL          = maxL;
1816 
1817    /* Create <td->nfiles> sequence tmpfile names. */
1818   if ( (td->sqfile = malloc(sizeof(char *) * td->nfiles)) == NULL) esl_fatal(msg);
1819   for (j = 0; j < td->nfiles; j++)
1820     if ( esl_sprintf(&(td->sqfile[j]), "esltmpXXXXXX"  ) != eslOK) esl_fatal(msg);
1821 
1822   /* Create <td->nfiles*td->nseq> sequences with random lengths up to <maxL> */
1823   if ( (td->seq     = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1824   if ( (td->seqname = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1825   if ( (td->seqlen  = malloc(sizeof(int)    * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1826   if ( (td->seqdesc = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1827   for (i = 0; i < td->nseq*td->nfiles; i++)
1828     {
1829       td->seqlen[i] = 1 + esl_rnd_Roll(rng, td->maxL); /* 1..maxL */
1830       if ( (td->seq[i] = malloc(sizeof(char) * (td->seqlen[i]+1)))        == NULL)  esl_fatal(msg);
1831       if ( esl_rsq_IID(rng, "ACGT", p, 4, td->seqlen[i], td->seq[i])      != eslOK) esl_fatal(msg);
1832       if ( esl_sprintf(&(td->seqname[i]), "seq%d-file%d",  i, i/td->nseq) != eslOK) esl_fatal(msg);
1833       if ( esl_sprintf(&(td->seqdesc[i]), "desc%d-file%d", i, i/td->nseq) != eslOK) esl_fatal(msg);
1834     }
1835 
1836   /* If we were asked to poison with a duplicate key, do it (randomly duplicating a primary or secondary key) */
1837   if (do_dupname)
1838     {
1839       i = esl_rnd_Roll(rng, td->nseq * td->nfiles - 1);             // 0..n-2
1840       j = i + 1 + esl_rnd_Roll(rng, td->nseq * td->nfiles - i - 1); // i+1..n-1
1841       if (esl_rnd_Roll(rng, 2) == 0)  {
1842         strcpy(td->seqname[i], "DUP");  // Allocated space is guaranteed to be enough,
1843         strcpy(td->seqname[j], "DUP");  //   because the original name was "seq%d-file%d"
1844       } else {
1845         strcpy(td->seqdesc[i], "DUP");
1846         strcpy(td->seqdesc[j], "DUP");
1847       }
1848     }
1849 
1850   /* Save them to files. */
1851   for (j = 0; j < td->nfiles; j++)
1852     {
1853       if (esl_tmpfile_named(td->sqfile[j], &fp) != eslOK) esl_fatal(msg);
1854       for (i = j*td->nseq; i < (j+1)*td->nseq; i++)
1855 	{
1856 	  if ( (sq = esl_sq_CreateFrom(td->seqname[i], td->seq[i], td->seqdesc[i], NULL, NULL)) == NULL)  esl_fatal(msg);
1857 	  if ( esl_sqio_Write(fp, sq, eslSQFILE_FASTA, FALSE) != eslOK) esl_fatal(msg);
1858 	  esl_sq_Destroy(sq);
1859 	}
1860       fclose(fp);
1861     }
1862 
1863   return td;
1864 }
1865 
1866 static void
ssi_testdata_destroy(struct ssi_testdata * td)1867 ssi_testdata_destroy(struct ssi_testdata *td)
1868 {
1869   int j;
1870   for (j = 0; j < td->nfiles; j++) remove(td->sqfile[j]);
1871   esl_arr2_Destroy((void **) td->sqfile,  td->nfiles);
1872   esl_arr2_Destroy((void **) td->seqname, td->nseq*td->nfiles);
1873   esl_arr2_Destroy((void **) td->seqdesc, td->nseq*td->nfiles);
1874   esl_arr2_Destroy((void **) td->seq,     td->nseq*td->nfiles);
1875   free(td->seqlen);
1876   free(td);
1877 }
1878 
1879 static void
utest_enchilada(ESL_GETOPTS * go,ESL_RANDOMNESS * rng,int do_external,int do_dupkeys)1880 utest_enchilada(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, int do_external, int do_dupkeys)
1881 {
1882   char         msg[]      = "esl_ssi whole enchilada test failed";
1883   struct ssi_testdata *td = NULL;
1884   int          nq         = 10;      // number of SSI-based retrievals to test
1885   char        *ssifile    = NULL;    // Index creation: ssifile to create.
1886   ESL_NEWSSI  *ns         = NULL;    //   new SSI index data
1887   ESL_SQFILE  *sqfp       = NULL;    //   open FASTA file that we're indexing
1888   ESL_SQ      *sq         = NULL;    //   a sequence from <sqfp>
1889   uint16_t     fh;                   //   handle on an indexed fasta file, from _AddFile, for _AddKey
1890   ESL_SSI     *ssi        = NULL;    // Retrieval testing: open SSI index to use
1891   char         query[32];            //   name of sequence to retrieve
1892   char        *qfile;                //   retrieved name of file it's in
1893   int          qfmt;                 //   retrieved format of that file (fasta)
1894   off_t        roff;                 //   retrieved record offset of it
1895   int          i,j;
1896   int          status;
1897 
1898   td = ssi_testdata_create(rng,
1899                            esl_opt_GetInteger(go, "-F"),  // max # of seq files
1900                            esl_opt_GetInteger(go, "-N"),  // max # of seqs per file
1901                            esl_opt_GetInteger(go, "-L"),  // max seq length
1902                            do_dupkeys);                   // if you poison w/ dup keys, _Write should fail.
1903 
1904   /* Create an ssi index of all the FASTA files. */
1905   if (esl_strdup(td->sqfile[0], -1, &ssifile) != eslOK) esl_fatal(msg);
1906   if (esl_strcat(&ssifile,  -1, ".ssi", 4)    != eslOK) esl_fatal(msg);
1907   if (esl_newssi_Open(ssifile, TRUE, &ns)     != eslOK) esl_fatal(msg);
1908   if ((sq = esl_sq_Create())                  == NULL)  esl_fatal(msg);
1909   if (do_external)
1910     if (activate_external_sort(ns)            != eslOK) esl_fatal(msg);
1911   for (j = 0; j < td->nfiles; j++)
1912     {
1913       if (esl_sqfile_Open(td->sqfile[j], eslSQFILE_UNKNOWN, NULL, &sqfp) != eslOK) esl_fatal(msg);
1914       if (esl_newssi_AddFile(ns, td->sqfile[j], sqfp->format, &fh)       != eslOK) esl_fatal(msg);
1915       while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
1916 	{
1917 	  if (esl_newssi_AddKey  (ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK) esl_fatal(msg);
1918 	  if (esl_newssi_AddAlias(ns, sq->desc, sq->name)                      != eslOK) esl_fatal(msg);
1919 	  esl_sq_Reuse(sq);
1920 	}
1921       if (status != eslEOF) esl_fatal(msg);
1922       esl_sqfile_Close(sqfp);
1923     }
1924   esl_sq_Destroy(sq);
1925 
1926   /* Save the SSI index to a file. */
1927   status = esl_newssi_Write(ns);
1928   if (  do_dupkeys && status != eslEDUP) esl_fatal(msg);
1929   if (! do_dupkeys && status != eslOK)   esl_fatal(msg);
1930   esl_newssi_Close(ns);
1931 
1932   /* Open the SSI index - now we'll use it to retrieve <nq> random sequences. */
1933   if (! do_dupkeys)
1934     {
1935       if (esl_ssi_Open(ssifile, &ssi) != eslOK) esl_fatal(msg);
1936       sq = esl_sq_Create();
1937       while (nq--)
1938         {
1939           /* Choose a seq and file */
1940           i = esl_rnd_Roll(rng, td->nseq*td->nfiles);
1941           j = i/td->nseq;
1942           if (esl_rnd_Roll(rng, 2) == 0) sprintf(query, "seq%d-file%d",  i, j);  // by primary key
1943           else                           sprintf(query, "desc%d-file%d", i, j);  // by secondary key
1944 
1945           /* Retrieve it */
1946           if ( esl_ssi_FindName(ssi, query, &fh, &roff, NULL, NULL) != eslOK) esl_fatal(msg);
1947           if ( esl_ssi_FileInfo(ssi, fh, &qfile, &qfmt)             != eslOK) esl_fatal(msg);
1948           if ( esl_sqfile_Open(qfile, qfmt, NULL, &sqfp)            != eslOK) esl_fatal(msg);
1949           if ( esl_sqfile_Position(sqfp, roff)                      != eslOK) esl_fatal(msg);
1950           if ( esl_sqio_Read(sqfp, sq)                              != eslOK) esl_fatal(msg);
1951 
1952           /* Check that it's the right one */
1953           if (strcmp(sq->name, query) != 0 && strcmp(sq->desc, query) != 0) esl_fatal(msg);
1954           if (sq->n != td->seqlen[i])            esl_fatal(msg);
1955           if (strcmp(sq->seq, td->seq[i])  != 0) esl_fatal(msg);
1956           if (strcmp(qfile, td->sqfile[j]) != 0) esl_fatal(msg);
1957 
1958           esl_sq_Reuse(sq);
1959           esl_sqfile_Close(sqfp);
1960         }
1961       remove(ssifile);  // in the dup keys test, ssifile is removed by _Write().
1962       esl_sq_Destroy(sq);
1963       esl_ssi_Close(ssi);
1964     }
1965 
1966   ssi_testdata_destroy(td);
1967   free(ssifile);
1968 }
1969 #endif /*eslSSI_TESTDRIVE*/
1970 
1971 
1972 /*****************************************************************
1973  * 5. Test driver
1974  *****************************************************************/
1975 
1976 #ifdef eslSSI_TESTDRIVE
1977 #include "esl_config.h"
1978 
1979 #include <stdlib.h>
1980 #include <stdio.h>
1981 #include <string.h>
1982 
1983 #include "easel.h"
1984 #include "esl_getopts.h"
1985 #include "esl_sq.h"
1986 #include "esl_sqio.h"
1987 #include "esl_ssi.h"
1988 #include "esl_random.h"
1989 #include "esl_randomseq.h"
1990 
1991 static ESL_OPTIONS options[] = {
1992   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1993   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",             0 },
1994   { "-F",        eslARG_INT,      "3",  NULL, NULL,  NULL,  NULL, NULL, "max number of test files",                         0 },
1995   { "-L",        eslARG_INT,   "1000",  NULL, NULL,  NULL,  NULL, NULL, "max length of test sequences",                     0 },
1996   { "-N",        eslARG_INT,     "10",  NULL, NULL,  NULL,  NULL, NULL, "max number of test sequences per file",            0 },
1997   { "-s",        eslARG_INT,      "0",  NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                    0 },
1998   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1999 };
2000 static char usage[]  = "[-options]";
2001 static char banner[] = "test driver for ssi module";
2002 
2003 int
main(int argc,char ** argv)2004 main(int argc, char **argv)
2005 {
2006   ESL_GETOPTS    *go  = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2007   ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
2008 
2009   fprintf(stderr, "## %s\n", argv[0]);
2010   fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
2011 
2012   /*                       do_external  do_dupkeys */
2013   utest_enchilada(go, rng, FALSE,       FALSE);
2014   utest_enchilada(go, rng, TRUE,        FALSE);
2015   utest_enchilada(go, rng, FALSE,       TRUE);
2016   utest_enchilada(go, rng, TRUE,        TRUE);
2017 
2018   esl_randomness_Destroy(rng);
2019   esl_getopts_Destroy(go);
2020 
2021   fprintf(stderr, "#  status = ok\n");
2022   return eslOK;
2023 }
2024 #endif /*eslSSI_TESTDRIVE*/
2025 
2026 
2027 
2028 /*****************************************************************
2029  * 5. Example code.
2030  ****************************************************************/
2031 #ifdef eslSSI_EXAMPLE
2032 /* gcc -o example -g -Wall -DeslSSI_EXAMPLE esl_ssi.c easel.c
2033  * esl-shuffle -o foo.fa -N 1000 -G --amino -L 400
2034  * ./example foo.fa
2035  */
2036 /*::cexcerpt::ssi_example::begin::*/
2037 #include <stdio.h>
2038 #include "easel.h"
2039 #include "esl_ssi.h"
2040 
2041 int
main(int argc,char ** argv)2042 main(int argc, char **argv)
2043 {
2044   ESL_NEWSSI *ns;
2045   char    *fafile;              /* name of FASTA file                   */
2046   FILE    *fp;                  /* opened FASTA file for reading        */
2047   char    *ssifile;             /* name of SSI file                     */
2048   uint16_t fh;                  /* file handle SSI associates w/ fafile */
2049   char    *buf = NULL;          /* growable buffer for esl_fgets()      */
2050   int      n   = 0;             /* length of buf                        */
2051   char    *s, *seqname;
2052   off_t    seq_offset;
2053   int      status;
2054 
2055   /* Open a new SSI index named <fafile>.ssi */
2056   fafile = argv[1];
2057   esl_strdup(fafile,   -1, &ssifile);
2058   esl_strcat(&ssifile, -1, ".ssi", 4);
2059   status = esl_newssi_Open(ssifile, FALSE, &ns);
2060   if      (status == eslENOTFOUND)   esl_fatal("failed to open SSI index %s", ssifile);
2061   else if (status == eslEOVERWRITE)  esl_fatal("SSI index %s already exists; delete or rename it", ssifile);
2062   else if (status != eslOK)          esl_fatal("failed to create a new SSI index");
2063 
2064   /* Collect the sequence names from a FASTA file into an index */
2065   if ((fp = fopen(fafile, "r"))              == NULL)  esl_fatal("failed to open %s", fafile);
2066   if (esl_newssi_AddFile(ns, fafile, 1, &fh) != eslOK) esl_fatal("failed to add %s to index: %s", fafile, ns->errbuf);
2067   seq_offset = ftello(fp);
2068   while (esl_fgets(&buf, &n, fp) == eslOK)
2069     {
2070       if (*buf == '>') {
2071 	s = buf+1;                           /* skip past >                */
2072 	esl_strtok(&s, " \t\n", &seqname);   /* name = 1st token on > line */
2073 	if (esl_newssi_AddKey(ns, seqname, fh, seq_offset, 0, 0) != eslOK)
2074 	  esl_fatal("failed to add key %s to index: %s", seqname, ns->errbuf);
2075       }
2076       seq_offset = ftello(fp);
2077     }
2078   free(buf);
2079   fclose(fp);
2080 
2081   /* Save the index to disk */
2082   status = esl_newssi_Write(ns);
2083   if      (status == eslEDUP)     esl_fatal("SSI index construction failed:\n  %s", ns->errbuf);
2084   else if (status == eslERANGE)   esl_fatal("SSI index file size exceeds maximum allowed by your filesystem");
2085   else if (status == eslESYS)     esl_fatal("SSI index sort failed:\n  %s", ns->errbuf);
2086   else if (status != eslOK)       esl_fatal("SSI indexing failed:\n  %s", ns->errbuf);
2087   esl_newssi_Close(ns);
2088   free(ssifile);
2089   return 0;
2090 }
2091 /*::cexcerpt::ssi_example::end::*/
2092 #endif /*eslSSI_EXAMPLE*/
2093 
2094 
2095 #ifdef eslSSI_EXAMPLE2
2096 /* gcc -o example2 -g -Wall -DeslSSI_EXAMPLE2 esl_ssi.c easel.c
2097  * ./example2 random77 foo.fa.ssi
2098  */
2099 /*::cexcerpt::ssi_example2::begin::*/
2100 #include <stdio.h>
2101 #include "easel.h"
2102 #include "esl_ssi.h"
2103 
main(int argc,char ** argv)2104 int main(int argc, char **argv)
2105 {
2106   ESL_SSI *ssi;
2107   char    *seqname;             /* name of sequence to retrieve         */
2108   char    *ssifile;             /* name of SSI file                     */
2109   uint16_t fh;                  /* file handle SSI associates w/ fafile */
2110   char    *fafile;              /* name of FASTA file                   */
2111   int      fmt;                 /* format code (1, in this example)     */
2112   off_t    offset;              /* disk offset of seqname in fafile     */
2113   FILE    *fp;                  /* opened FASTA file for reading        */
2114   char    *buf = NULL;          /* growable buffer for esl_fgets()      */
2115   int      n = 0;               /* size of buffer                       */
2116 
2117   seqname = argv[1];
2118   ssifile = argv[2];
2119 
2120   if (esl_ssi_Open(ssifile, &ssi)                              != eslOK) esl_fatal("open failed");
2121   if (esl_ssi_FindName(ssi, seqname, &fh, &offset, NULL, NULL) != eslOK) esl_fatal("find failed");
2122   if (esl_ssi_FileInfo(ssi, fh, &fafile, &fmt)                 != eslOK) esl_fatal("info failed");
2123   /* you can't close the ssi file yet - fafile is pointing into it! */
2124 
2125   if ((fp = fopen(fafile, "r"))     == NULL)  esl_fatal("failed to open %s", fafile);
2126   if (fseeko(fp, offset, SEEK_SET)  != 0)     esl_fatal("failed to position %s", fafile);
2127   if (esl_fgets(&buf, &n, fp)       != eslOK) esl_fatal("failed to get name/desc line");
2128   do {
2129     printf("%s", buf);
2130   } while (esl_fgets(&buf, &n, fp) == eslOK && *buf != '>');
2131 
2132   esl_ssi_Close(ssi);
2133   fclose(fp);
2134   free(buf);
2135   return 0;
2136 }
2137 /*::cexcerpt::ssi_example2::end::*/
2138 #endif /*eslSSI_EXAMPLE2*/
2139 
2140