1 /* Sequence/subsequence indices: fast lookup in large sequence files by keyword.
2 *
3 * Contents:
4 * 1. Using (reading) an SSI index.
5 * 2. Creating (writing) new SSI files.
6 * 3. Portable binary i/o.
7 * 4. Unit tests.
8 * 5. Test driver.
9 * 6. Example code.
10 */
11 #include "esl_config.h"
12
13 #include <stdio.h>
14 #include <string.h>
15
16 #include "easel.h"
17 #include "esl_ssi.h"
18
19 static uint32_t v30magic = 0xd3d3c9b3; /* SSI 3.0: "ssi3" + 0x80808080 */
20 static uint32_t v30swap = 0xb3c9d3d3; /* byteswapped */
21
22
23 /*****************************************************************
24 *# 1. Using (reading) an SSI index.
25 *****************************************************************/
26
27 static int binary_search(ESL_SSI *ssi, const char *key, uint32_t klen, off_t base,
28 uint32_t recsize, uint64_t maxidx);
29
30 /* Function: esl_ssi_Open()
31 * Synopsis: Open an SSI index as an <ESL_SSI>.
32 *
33 * Purpose: Open the SSI index file <filename>, and returns a pointer
34 * to the new <ESL_SSI> object in <ret_ssi>.
35 *
36 * Caller is responsible for closing the SSI file with
37 * <esl_ssi_Close()>.
38 *
39 * Args: <filename> - name of SSI index file to open.
40 * <ret_ssi> - RETURN: the new <ESL_SSI>.
41 *
42 * Returns: <eslOK> on success;
43 * <eslENOTFOUND> if <filename> cannot be opened for reading;
44 * <eslEFORMAT> if it's not in correct SSI file format;
45 * <eslERANGE> if it uses 64-bit file offsets, and we're on a system
46 * that doesn't support 64-bit file offsets.
47 *
48 * Throws: <eslEMEM> on allocation error.
49 */
50 int
esl_ssi_Open(const char * filename,ESL_SSI ** ret_ssi)51 esl_ssi_Open(const char *filename, ESL_SSI **ret_ssi)
52 {
53 ESL_SSI *ssi = NULL;
54 int status;
55 uint32_t magic; /* magic number that starts the SSI file */
56 uint16_t i; /* counter over files */
57
58 /* Initialize the SSI structure, null'ing so we can autocleanup.
59 */
60 ESL_ALLOC(ssi, sizeof(ESL_SSI));
61 ssi->fp = NULL;
62 ssi->filename = NULL;
63 ssi->fileformat = NULL;
64 ssi->fileflags = NULL;
65 ssi->bpl = NULL;
66 ssi->rpl = NULL;
67 ssi->nfiles = 0;
68
69 /* Open the file.
70 */
71 status = eslENOTFOUND;
72 if ((ssi->fp = fopen(filename, "rb")) == NULL) goto ERROR;
73
74 /* Read the magic number: make sure it's an SSI file, and determine
75 * whether it's byteswapped.
76 */
77 status = eslEFORMAT;
78 if (esl_fread_u32(ssi->fp, &magic) != eslOK) goto ERROR;
79 if (magic != v30magic && magic != v30swap) goto ERROR;
80 if (esl_fread_u32(ssi->fp, &(ssi->flags)) != eslOK) goto ERROR;
81 if (esl_fread_u32(ssi->fp, &(ssi->offsz)) != eslOK) goto ERROR;
82
83 status = eslERANGE;
84 if (ssi->offsz != 4 && ssi->offsz != 8) goto ERROR;
85 if (ssi->offsz > sizeof(off_t)) goto ERROR;
86
87 /* The header data. */
88 status = eslEFORMAT;
89 if (esl_fread_u16(ssi->fp, &(ssi->nfiles)) != eslOK) goto ERROR;
90 if (esl_fread_u64(ssi->fp, &(ssi->nprimary)) != eslOK) goto ERROR;
91 if (esl_fread_u64(ssi->fp, &(ssi->nsecondary)) != eslOK) goto ERROR;
92 if (esl_fread_u32(ssi->fp, &(ssi->flen)) != eslOK) goto ERROR;
93 if (esl_fread_u32(ssi->fp, &(ssi->plen)) != eslOK) goto ERROR;
94 if (esl_fread_u32(ssi->fp, &(ssi->slen)) != eslOK) goto ERROR;
95 if (esl_fread_u32(ssi->fp, &(ssi->frecsize)) != eslOK) goto ERROR;
96 if (esl_fread_u32(ssi->fp, &(ssi->precsize)) != eslOK) goto ERROR;
97 if (esl_fread_u32(ssi->fp, &(ssi->srecsize)) != eslOK) goto ERROR;
98
99 if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->foffset)) != eslOK) goto ERROR;
100 if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->poffset)) != eslOK) goto ERROR;
101 if (esl_fread_offset(ssi->fp, ssi->offsz, &(ssi->soffset)) != eslOK) goto ERROR;
102
103 /* The file information.
104 * We expect the number of files to be small, so reading it once
105 * should be advantageous overall. If SSI ever had to deal with
106 * large numbers of files, you'd probably want to read file
107 * information on demand.
108 */
109 status = eslEFORMAT;
110 if (ssi->nfiles == 0) goto ERROR;
111
112 ESL_ALLOC(ssi->filename, sizeof(char *) * ssi->nfiles);
113 for (i = 0; i < ssi->nfiles; i++) ssi->filename[i] = NULL;
114 ESL_ALLOC(ssi->fileformat, sizeof(uint32_t) * ssi->nfiles);
115 ESL_ALLOC(ssi->fileflags, sizeof(uint32_t) * ssi->nfiles);
116 ESL_ALLOC(ssi->bpl, sizeof(uint32_t) * ssi->nfiles);
117 ESL_ALLOC(ssi->rpl, sizeof(uint32_t) * ssi->nfiles);
118
119 /* (most) allocations done, now we read. */
120 for (i = 0; i < ssi->nfiles; i++)
121 {
122 ESL_ALLOC(ssi->filename[i], sizeof(char)* ssi->flen);
123 /* We do have to explicitly position, because header and file
124 * records may expand in the future; frecsize and foffset
125 * give us forwards compatibility.
126 */
127 status = eslEFORMAT;
128 if (fseeko(ssi->fp, ssi->foffset + (i * ssi->frecsize), SEEK_SET) != 0) goto ERROR;
129 if (fread(ssi->filename[i],sizeof(char),ssi->flen, ssi->fp)!=ssi->flen) goto ERROR;
130 if (esl_fread_u32(ssi->fp, &(ssi->fileformat[i]))) goto ERROR;
131 if (esl_fread_u32(ssi->fp, &(ssi->fileflags[i]))) goto ERROR;
132 if (esl_fread_u32(ssi->fp, &(ssi->bpl[i]))) goto ERROR;
133 if (esl_fread_u32(ssi->fp, &(ssi->rpl[i]))) goto ERROR;
134 }
135 *ret_ssi = ssi;
136 return eslOK;
137
138 ERROR:
139 if (ssi != NULL) esl_ssi_Close(ssi);
140 *ret_ssi = NULL;
141 return status;
142 }
143
144
145 /* Function: esl_ssi_FindName()
146 * Synopsis: Look up a primary or secondary key.
147 *
148 * Purpose: Looks up the string <key> in index <ssi>.
149 * <key> can be either a primary or secondary key. If <key>
150 * is found, <ret_fh> contains a unique handle on
151 * the file that contains <key> (suitable for an <esl_ssi_FileInfo()>
152 * call, or for comparison to the handle of the last file
153 * that was opened for retrieval), and <ret_offset> contains
154 * the offset of the sequence record in that file.
155 *
156 * Args: <ssi> - open index file
157 * <key> - name to search for
158 * <ret_fh> - RETURN: handle on file that key is in
159 * <ret_roff> - RETURN: offset of the start of that key's record
160 * <opt_doff> - optRETURN: data offset (may be 0 if unset)
161 * <opt_L> - optRETURN: length of data record (may be 0 if unset)
162 *
163 * Returns: <eslOK> on success;
164 * <eslENOTFOUND> if no such key is in the index;
165 * <eslEFORMAT> if an fread() or fseeko() fails, which almost
166 * certainly reflects some kind of misformatting of
167 * the index.
168 *
169 * Throws: <eslEMEM> on allocation error.
170 */
171 int
esl_ssi_FindName(ESL_SSI * ssi,const char * key,uint16_t * ret_fh,off_t * ret_roff,off_t * opt_doff,int64_t * opt_L)172 esl_ssi_FindName(ESL_SSI *ssi, const char *key, uint16_t *ret_fh, off_t *ret_roff, off_t *opt_doff, int64_t *opt_L)
173 {
174 int status;
175 off_t doff;
176 int64_t L;
177 char *pkey = NULL;
178
179 /* Look in the primary keys.
180 */
181 status = binary_search(ssi, key, ssi->plen, ssi->poffset, ssi->precsize,
182 ssi->nprimary);
183
184 if (status == eslOK)
185 { /* We found it as a primary key; get our data & return. */
186 status = eslEFORMAT;
187 if (esl_fread_u16(ssi->fp, ret_fh) != eslOK) goto ERROR;
188 if (esl_fread_offset(ssi->fp, ssi->offsz, ret_roff) != eslOK) goto ERROR;
189 if (esl_fread_offset(ssi->fp, ssi->offsz, &doff) != eslOK) goto ERROR;
190 if (esl_fread_i64 (ssi->fp, &L) != eslOK) goto ERROR;
191 }
192 else if (status == eslENOTFOUND)
193 { /* Not in the primary keys? OK, try the secondary keys. */
194 if (ssi->nsecondary > 0) {
195 if ((status = binary_search(ssi, key, ssi->slen, ssi->soffset, ssi->srecsize, ssi->nsecondary)) != eslOK) goto ERROR;
196
197 /* We have the secondary key; flip to its primary key, then look that up. */
198 ESL_ALLOC(pkey, sizeof(char) * ssi->plen);
199 status = eslEFORMAT;
200 if (fread(pkey, sizeof(char), ssi->plen, ssi->fp) != ssi->plen) goto ERROR;
201 if ((status = esl_ssi_FindName(ssi, pkey, ret_fh, ret_roff, &doff, &L)) != eslOK) goto ERROR;
202 } else goto ERROR; /* no secondary keys? pass along the ENOTFOUND error. */
203 } else goto ERROR; /* status from binary search was an error code. */
204
205 if (pkey != NULL) free(pkey);
206 if (opt_doff != NULL) *opt_doff = doff;
207 if (opt_L != NULL) *opt_L = L;
208 return eslOK;
209
210 ERROR:
211 if (pkey != NULL) free(pkey);
212 *ret_fh = 0;
213 *ret_roff = 0;
214 if (opt_doff != NULL) *opt_doff = 0;
215 if (opt_L != NULL) *opt_L = 0;
216 return status;
217 }
218
219
220
221 /* Function: esl_ssi_FindNumber()
222 * Synopsis: Look up the n'th primary key.
223 *
224 * Purpose: Looks up primary key number <nkey> in the open index
225 * <ssi>. <nkey> ranges from <0..ssi->nprimary-1>. When
226 * key <nkey> is found, any/all of several optional
227 * arguments point to results. <*opt_fh> contains a unique
228 * handle on the file that contains that key (suitable for
229 * an <esl_ssi_FileInfo()> call, or for comparison to the
230 * handle of the last file that was opened for retrieval).
231 * <*opt_roff> contains the record offset; <*opt_doff>
232 * contains the data offset; <*opt_L> contains the record
233 * length; and <*opt_pkey> points to the primary key name
234 * (a string, allocated here, that the caller becomes
235 * responsible for free'ing).
236 *
237 * Args: <ssi> - open index file
238 * <nkey> - primary key number to retrieve (0..nprimary-1)
239 * <opt_fh> - optRETURN: handle on file that key is in
240 * <opt_roff> - optRETURN: offset of the start of that key's record
241 * <opt_doff> - optRETURN: data offset (may be 0 if unset)
242 * <opt_L> - optRETURN: length of data record (may be 0 if unset)
243 * <opt_pkey> - optRETURN: primary key name (allocated here; caller must free)
244 *
245 * Returns: <eslOK> on success;
246 * <eslENOTFOUND> if there is no sequence record <nkey>;
247 * <eslEFORMAT> if a read or a seek fails, probably indicating
248 * some kind of file misformatting.
249 *
250 * Throws: <eslEMEM> on allocation error.
251 */
252 int
esl_ssi_FindNumber(ESL_SSI * ssi,int64_t nkey,uint16_t * opt_fh,off_t * opt_roff,off_t * opt_doff,int64_t * opt_L,char ** opt_pkey)253 esl_ssi_FindNumber(ESL_SSI *ssi, int64_t nkey, uint16_t *opt_fh, off_t *opt_roff, off_t *opt_doff, int64_t *opt_L, char **opt_pkey)
254 {
255 int status;
256 uint16_t fh;
257 off_t doff, roff;
258 uint64_t L;
259 char *pkey = NULL;
260
261 if (nkey >= ssi->nprimary) { status = eslENOTFOUND; goto ERROR; }
262 ESL_ALLOC(pkey, sizeof(char) * ssi->plen);
263
264 status = eslEFORMAT;
265 if (fseeko(ssi->fp, ssi->poffset+ssi->precsize*nkey, SEEK_SET)!= 0) goto ERROR;
266 if (fread(pkey, sizeof(char), ssi->plen, ssi->fp) != ssi->plen) goto ERROR;
267 if (esl_fread_u16(ssi->fp, &fh) != eslOK) goto ERROR;
268 if (esl_fread_offset(ssi->fp, ssi->offsz, &roff) != eslOK) goto ERROR;
269 if (esl_fread_offset(ssi->fp, ssi->offsz, &doff) != eslOK) goto ERROR;
270 if (esl_fread_u64 (ssi->fp, &L) != eslOK) goto ERROR;
271
272 if (opt_fh != NULL) *opt_fh = fh;
273 if (opt_roff != NULL) *opt_roff = roff;
274 if (opt_doff != NULL) *opt_doff = doff;
275 if (opt_L != NULL) *opt_L = L;
276 if (opt_pkey != NULL) *opt_pkey = pkey; else free(pkey);
277 return eslOK;
278
279 ERROR:
280 if (pkey != NULL) free(pkey);
281 if (opt_fh != NULL) *opt_fh = 0;
282 if (opt_roff != NULL) *opt_roff = 0;
283 if (opt_doff != NULL) *opt_doff = 0;
284 if (opt_L != NULL) *opt_L = 0;
285 if (opt_pkey != NULL) *opt_pkey = NULL;
286 return status;
287 }
288
289
290 /* Function: esl_ssi_FindSubseq()
291 * Synopsis: Look up a specific subsequence's start.
292 * Date: SRE, Mon Jan 1 19:49:31 2001 [St. Louis]
293 *
294 * Purpose: Fast subsequence retrieval: look up a primary or secondary
295 * <key> in the open index <ssi>, and ask for the nearest data
296 * offset to a subsequence starting at residue
297 * <requested_start> in the sequence (numbering the sequence
298 * <1..L>). If <key> is found, on return, <ret_fh> contains
299 * a unique handle on the file that contains <key>;
300 * <ret_roff> contains the disk offset to the start of the
301 * sequence record; <ret_doff> contains the disk offset
302 * (see below); and <ret_actual_start) contains the coordinate
303 * (1..L) of the first valid residue at or after
304 * <data_offset>. <ret_actual_start> is $\leq$
305 * <requested_start>.
306 *
307 * Depending on the file's characteristics, there are four
308 * possible outcomes.
309 *
310 * If the file has the <eslSSI_FASTSUBSEQ> flag set, a data
311 * offset was indexed for this key, and the data can be
312 * indexed at single residue resolution (because the file's
313 * lines contain only residues, no spaces), then <ret_doff>
314 * is exactly the position of residue <requested_start> on
315 * disk, and <ret_actual_start> is <requested_start>.
316 *
317 * If the file has the <eslSSI_FASTSUBSEQ> flag set, a data
318 * offset was indexed for this key, but the data can only be
319 * indexed at line resolution (because at least some of the
320 * file's lines contain spaces), then <ret_doff> is the
321 * position of the start of the line that <requested_start>
322 * is on, and <ret_actual_start> is the coord <1..L> of the
323 * first residue on that line.
324 *
325 * If the file does not have the <eslSSI_FASTSUBSEQ> flag
326 * set (because lines contain a variable number of residues
327 * and/or bytes), but a data offset was indexed for this
328 * key, then we can still at least return that data offset,
329 * but the caller is going to have to start from the
330 * beginning of the data and read residues until it reaches
331 * the desired <requested_start>. Now <ret_doff> is the
332 * offset to the start of the first line of the sequence
333 * data, and <ret_actual_start> is 1.
334 *
335 * If the key does not have a data offset indexed at all,
336 * then regardless of the file's <eslSSI_FASTSUBSEQ>
337 * setting, we can't calculate even the position of the
338 * first line. In this case, <ret_doff> is 0 (for
339 * unset/unknown), and <ret_actual_start> is <1>.
340 *
341 * A caller that's going to position the disk and read a
342 * subseq must check for all four possible outcomes (pardon
343 * redundancy with the above, but just to be clear, from the
344 * caller's perspective now):
345 *
346 * If <ret_doff> is 0, no data offset information can be
347 * calculated; the caller can still use <ret_roff> to
348 * position the disk to the start of <key>'s record, but it
349 * will need to parse the header to find the start of the
350 * sequence data; then it will need to parse the sequence
351 * data, skipping to residue <requested start>.
352 *
353 * If <ret_doff> is valid ($>0$), and <ret_actual_start> is
354 * 1, then caller may use <ret_doff> to position the disk to
355 * the start of the first sequence data line, but will still
356 * need to parse all the sequence data, counting and
357 * skipping to residue <requested start>. This is equivalent
358 * to (and in practice, not much more efficient than)
359 * positioning to the record start and parsing the header to
360 * locate the sequence data start.
361 *
362 * If <ret_doff> is valid ($>0$), and <ret_actual_start> is
363 * $>1$ but $<$ <requested_start>, then <ret_doff> is the
364 * offset to the first byte of a line on which the
365 * subsequence begins. The caller can position the disk
366 * there, then start parsing, skipping <requested_start -
367 * *ret_actual_start> residues to reach the
368 * <requested_start>. (In the case where the subsequence
369 * begins on the first line, then <ret_actual_start> will be
370 * 1, and the caller will have to handle this as the case
371 * above.)
372 *
373 * If <<ret_doff> is valid ($>0$), and <ret_actual_start> is
374 * $=$ <requested_start>, then <ret_doff> is the offset to a
375 * byte in the file, such that the requested subsequence
376 * starts at the next valid residue at or after that
377 * position. (The <ret_doff> would usually be exactly the
378 * first residue of the subsequence, because we used single
379 * residue resolution arithmetic to find it, but there's a
380 * case where <requested_start> happens to be the first
381 * residue of a line and we calculated <ret_doff> using
382 * line-resolution arithmetic; in this latter case,
383 * <ret_doff> could be pointing at a space before the first
384 * subseq residue.) The caller may position the disk there
385 * and start parsing immediately; the first valid residue
386 * will be the start of the subsequence.
387 *
388 * Args: <ssi> - open index file
389 * <key> - primary or secondary key to find
390 * <requested_start> - residue we'd like to start at (1..L)
391 * <ret_fh> - RETURN: handle for file the key is in
392 * <ret_roff> - RETURN: offset to start of sequence record
393 * <ret_doff> - RETURN: offset to closest start of subseq data, or 0.
394 * <ret_L> - RETURN: length of <key> in residues (may be 0 if unset)
395 * <ret_actual_start>- RETURN: coord (1..L) of residue at <ret_doff>
396 *
397 * Returns: <eslOK> on any of the four successful outcomes.
398 * <eslENOTFOUND> if no such key is found in the index;
399 * <eslEFORMAT> on a read or seek failure, presumably meaning that
400 * the file is misformatted somehow;
401 * <eslERANGE> if <requested_start> isn't somewhere in the range
402 * <1..len> for the target sequence.
403 *
404 * Throws: <eslEMEM> on allocation error.
405 */
406 int
esl_ssi_FindSubseq(ESL_SSI * ssi,const char * key,int64_t requested_start,uint16_t * ret_fh,off_t * ret_roff,off_t * ret_doff,int64_t * ret_L,int64_t * ret_actual_start)407 esl_ssi_FindSubseq(ESL_SSI *ssi, const char *key, int64_t requested_start,
408 uint16_t *ret_fh, off_t *ret_roff, off_t *ret_doff, int64_t *ret_L, int64_t *ret_actual_start)
409 {
410 int status;
411 uint64_t r, b, i, l; /* tmp variables for "clarity", to match docs */
412
413 /* Look up the key by name.
414 */
415 if ((status = esl_ssi_FindName(ssi, key, ret_fh, ret_roff, ret_doff, ret_L)) != eslOK) goto ERROR;
416 if (requested_start < 0 || requested_start > *ret_L) { status = eslERANGE; goto ERROR; }
417
418 /* Do we have a data offset for this key? If not, we're case 4. */
419 /* Can we do fast subseq lookup on this file? If no, we're case 3. */
420 if (*ret_doff == 0 || ! (ssi->fileflags[*ret_fh] & eslSSI_FASTSUBSEQ))
421 {
422 *ret_actual_start = 1;
423 return eslOK;
424 }
425
426 /* Set up tmp variables for clarity of equations below,
427 * and to make them match tex documentation
428 */
429 r = ssi->rpl[*ret_fh]; /* residues per line */
430 b = ssi->bpl[*ret_fh]; /* bytes per line */
431 i = requested_start; /* start position 1..L */
432 l = (i-1)/r; /* data line # (0..) that the residue is on */
433 if (r == 0 || b == 0) { status = eslEINVAL; goto ERROR; }
434
435 /* When b = r+1, there's nothing but sequence on each data line (and the \0).
436 * In this case, we know we can find each residue precisely: outcome #1.
437 */
438 if (b == r+1)
439 {
440 *ret_doff += l*b + (i-1)%r;
441 *ret_actual_start = requested_start;
442 }
443 /* else, there's other stuff on seq lines - probably spaces - so the best
444 * we can do (without figuring out the spacing pattern and checking that
445 * it's consistent everywhere) is to position at start of relevant line.
446 */
447 else
448 {
449 *ret_doff += l*b;
450 *ret_actual_start = 1 + l*r;
451 }
452 return eslOK;
453
454 ERROR:
455 *ret_fh = 0;
456 *ret_roff = 0;
457 *ret_doff = 0;
458 *ret_L = 0;
459 *ret_actual_start = 0;
460 return status;
461 }
462
463
464 /* Function: esl_ssi_FileInfo()
465 * Synopsis: Retrieve a file name and format code.
466 * Date: SRE, Tue Jan 2 10:31:01 2001 [St. Louis]
467 *
468 * Purpose: Given a file number <fh> in an open index file
469 * <ssi>, retrieve file name <ret_filename> and
470 * the file format <ret_format>.
471 *
472 * <ret_filename> is a pointer to a string maintained
473 * internally by <ssi>. It should not be free'd;
474 * <esl_ssi_Close(ssi)> will take care of it.
475 *
476 * Args: <ssi> - open index file
477 * <fh> - handle on file to look up
478 * <ret_filename> - RETURN: name of file n
479 * <ret_format> - RETURN: format code for file n
480 *
481 * Returns: <eslOK> on success.
482 *
483 * Throws: <eslEINVAL> if there is no such file number <fh>.
484 */
485 int
esl_ssi_FileInfo(ESL_SSI * ssi,uint16_t fh,char ** ret_filename,int * ret_format)486 esl_ssi_FileInfo(ESL_SSI *ssi, uint16_t fh, char **ret_filename, int *ret_format)
487 {
488 int status;
489
490 if (fh >= ssi->nfiles) ESL_XEXCEPTION(eslEINVAL, "no such file number");
491 *ret_filename = ssi->filename[fh];
492 *ret_format = ssi->fileformat[fh];
493 return eslOK;
494
495 ERROR:
496 *ret_filename = NULL;
497 *ret_format = 0;
498 return status;
499 }
500
501
502 /* Function: esl_ssi_Close()
503 * Synopsis: Close an SSI index.
504 *
505 * Purpose: Close an open SSI index <ssi>.
506 *
507 * Args: <ssi> - an open SSI index file.
508 */
509 void
esl_ssi_Close(ESL_SSI * ssi)510 esl_ssi_Close(ESL_SSI *ssi)
511 {
512 int i;
513
514 if (ssi == NULL) return;
515
516 if (ssi->fp != NULL) fclose(ssi->fp);
517 if (ssi->filename != NULL) {
518 for (i = 0; i < ssi->nfiles; i++)
519 if (ssi->filename[i] != NULL) free(ssi->filename[i]);
520 free(ssi->filename);
521 }
522 if (ssi->fileformat != NULL) free(ssi->fileformat);
523 if (ssi->fileflags != NULL) free(ssi->fileflags);
524 if (ssi->bpl != NULL) free(ssi->bpl);
525 if (ssi->rpl != NULL) free(ssi->rpl);
526 free(ssi);
527 }
528
529
530 /* binary_search()
531 * Date: SRE, Sun Dec 31 16:05:03 2000 [St. Louis]
532 *
533 * Purpose: Find <key> in an SSI index, by a binary search
534 * in an alphabetically sorted list of keys. If successful,
535 * return <eslOK>, and the index file is positioned to read
536 * the rest of the data for that key. If unsuccessful,
537 * return <eslFAIL>, and the positioning of the index file
538 * is left in an undefined state.
539 *
540 * Args: <ssi> - an open ESL_SSI
541 * <key> - key to find
542 * <klen> - key length to allocate (plen or slen from ssi)
543 * <base> - base offset (poffset or soffset)
544 * <recsize> - size of each key record in bytes (precsize or srecsize)
545 * <maxidx> - # of keys (nprimary or nsecondary)
546 *
547 * Returns: <eslOK> on success, and leaves file positioned for reading remaining
548 * data for the key.
549 *
550 * <eslENOTFOUND> if <key> is not found.
551 * <eslEFORMAT> if an fread() or fseeko() fails, probably indicating
552 * some kind of misformatting of the index file.
553 *
554 * Throws: <eslEMEM> on allocation failure.
555 *
556 */
557 static int
binary_search(ESL_SSI * ssi,const char * key,uint32_t klen,off_t base,uint32_t recsize,uint64_t maxidx)558 binary_search(ESL_SSI *ssi, const char *key, uint32_t klen, off_t base,
559 uint32_t recsize, uint64_t maxidx)
560 {
561 char *name;
562 uint64_t left, right, mid;
563 int cmp;
564 int status;
565
566 if (maxidx == 0) return eslENOTFOUND; /* special case: empty index */
567
568 ESL_ALLOC(name, (sizeof(char)*klen));
569
570 left = 0;
571 right = maxidx-1;
572 while (1) { /* A binary search: */
573 mid = (left+right) / 2; /* careful here. left+right potentially overflows if
574 we didn't limit unsigned vars to signed ranges. */
575 status = eslEFORMAT;
576 if (fseeko(ssi->fp, base + recsize*mid, SEEK_SET) != 0) goto ERROR;
577 if (fread(name, sizeof(char), klen, ssi->fp) != klen) goto ERROR;
578
579 status = eslENOTFOUND;
580 cmp = strcmp(name, key);
581 if (cmp == 0) break; /* found it! */
582 else if (left >= right) goto ERROR; /* no such key */
583 else if (cmp < 0) left = mid+1; /* it's still right of mid */
584 else if (cmp > 0) {
585 if (mid == 0) goto ERROR; /* beware left edge case */
586 else right = mid-1; /* it's left of mid */
587 }
588 }
589
590 if (name != NULL) free(name);
591 return eslOK; /* and ssi->fp is positioned to read the record. */
592
593 ERROR:
594 if (name != NULL) free(name);
595 return status;
596 }
597
598
599 /*****************************************************************
600 *# 2. Creating (writing) new SSI files.
601 *****************************************************************/
602 static int current_newssi_size(const ESL_NEWSSI *ns);
603 static int activate_external_sort(ESL_NEWSSI *ns);
604 static int parse_pkey(char *buf, ESL_PKEY *pkey);
605 static int parse_skey(char *buf, ESL_SKEY *skey);
606 static int pkeysort(const void *k1, const void *k2);
607 static int skeysort(const void *k1, const void *k2);
608
609 /* Function: esl_newssi_Open()
610 * Synopsis: Create a new <ESL_NEWSSI>.
611 *
612 * Purpose: Creates and returns a <ESL_NEWSSI>, in order to create a
613 * new SSI index file.
614 *
615 * Returns: <eslOK> on success, and <*ret_newssi> is a pointer to a
616 * new <ESL_NEWSSI> structure.
617 *
618 * Returns <eslENOTFOUND> if <ssifile> can't be opened.
619 *
620 * Returns <eslEOVERWRITE> if <allow_overwrite> is <FALSE>
621 * and <ssifile> (or any necessary tmp files) already
622 * exist, to block overwriting of an existing SSI file.
623 *
624 * Throws: <eslEMEM> on allocation error.
625 */
626 int
esl_newssi_Open(const char * ssifile,int allow_overwrite,ESL_NEWSSI ** ret_newssi)627 esl_newssi_Open(const char *ssifile, int allow_overwrite, ESL_NEWSSI **ret_newssi)
628 {
629 ESL_NEWSSI *ns = NULL;
630 int i;
631 int status;
632
633 ESL_ALLOC(ns, sizeof(ESL_NEWSSI));
634 ns->ssifile = NULL;
635 ns->ssifp = NULL;
636 ns->external = FALSE; /* we'll switch to external sort if... */
637 ns->max_ram = eslSSI_MAXRAM; /* ... if we exceed this memory limit in MB. */
638 ns->filenames = NULL;
639 ns->fileformat = NULL;
640 ns->bpl = NULL;
641 ns->rpl = NULL;
642 ns->flen = 0;
643 ns->nfiles = 0;
644 ns->pkeys = NULL;
645 ns->plen = 0;
646 ns->nprimary = 0;
647 ns->ptmpfile = NULL;
648 ns->ptmp = NULL;
649 ns->skeys = NULL;
650 ns->slen = 0;
651 ns->nsecondary = 0;
652 ns->stmpfile = NULL;
653 ns->stmp = NULL;
654 ns->errbuf[0] = '\0';
655
656 if ((status = esl_strdup(ssifile, -1, &(ns->ssifile))) != eslOK) goto ERROR;
657 if ((status = esl_strdup(ssifile, -1, &(ns->ptmpfile))) != eslOK) goto ERROR;
658 if ((status = esl_strdup(ssifile, -1, &(ns->stmpfile))) != eslOK) goto ERROR;
659 if ((status = esl_strcat(&ns->ptmpfile, -1, ".1", 2)) != eslOK) goto ERROR;
660 if ((status = esl_strcat(&ns->stmpfile, -1, ".2", 2)) != eslOK) goto ERROR;
661
662 if (! allow_overwrite)
663 {
664 if (esl_FileExists(ssifile) ||
665 esl_FileExists(ns->ptmpfile) ||
666 esl_FileExists(ns->stmpfile))
667 { status = eslEOVERWRITE; goto ERROR; }
668 }
669
670 if ((ns->ssifp = fopen(ssifile, "w")) == NULL) { status = eslENOTFOUND; goto ERROR; }
671
672 ESL_ALLOC(ns->filenames, sizeof(char *) * eslSSI_FCHUNK);
673 for (i = 0; i < eslSSI_FCHUNK; i++)
674 ns->filenames[i] = NULL;
675 ESL_ALLOC(ns->fileformat, sizeof(uint32_t) * eslSSI_FCHUNK);
676 ESL_ALLOC(ns->bpl, sizeof(uint32_t) * eslSSI_FCHUNK);
677 ESL_ALLOC(ns->rpl, sizeof(uint32_t) * eslSSI_FCHUNK);
678 ESL_ALLOC(ns->pkeys, sizeof(ESL_PKEY) * eslSSI_KCHUNK);
679 for (i = 0; i < eslSSI_KCHUNK; i++)
680 ns->pkeys[i].key = NULL;
681 ESL_ALLOC(ns->skeys, sizeof(ESL_SKEY) * eslSSI_KCHUNK);
682 for (i = 0; i < eslSSI_KCHUNK; i++) {
683 ns->skeys[i].key = NULL;
684 ns->skeys[i].pkey = NULL;
685 }
686 *ret_newssi = ns;
687 return eslOK;
688
689 ERROR:
690 esl_newssi_Close(ns); /* free the damaged structure */
691 return status;
692 }
693
694
695 /* Function: esl_newssi_AddFile()
696 * Synopsis: Add a filename to a growing index.
697 *
698 * Purpose: Registers the file <filename> into the new index <ns>,
699 * along with its format code <fmt>. The index assigns it
700 * a unique handle, which it returns in <ret_fh>. This
701 * handle is needed when registering primary keys.
702 *
703 * Caller should make sure that the same file isn't registered
704 * twice; this function doesn't check.
705 *
706 * Args: <ns> - new ssi index under construction.
707 * <filename> - filename to add to the index.
708 * <fmt> - format code to associate with <filename> (or 0)
709 * <ret_fh> - RETURN: filehandle associated with <filename>
710 *
711 * Returns: <eslOK> on success;
712 * <eslERANGE> if registering this file would exceed the
713 * maximum number of indexed files.
714 *
715 * Throws: <eslEMEM> on allocation or reallocation error.
716 */
717 int
esl_newssi_AddFile(ESL_NEWSSI * ns,const char * filename,int fmt,uint16_t * ret_fh)718 esl_newssi_AddFile(ESL_NEWSSI *ns, const char *filename, int fmt, uint16_t *ret_fh)
719 {
720 int status;
721 uint16_t fh;
722 int i;
723 int n;
724
725 if (ns->nfiles >= eslSSI_MAXFILES) ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded the maximum number of files an SSI index can store");
726
727 n = strlen(filename);
728 if ((n+1) > ns->flen) ns->flen = n+1;
729
730 if ((status = esl_FileTail(filename, FALSE, &(ns->filenames[ns->nfiles]))) != eslOK) goto ERROR;
731
732 ns->fileformat[ns->nfiles] = fmt;
733 ns->bpl[ns->nfiles] = 0;
734 ns->rpl[ns->nfiles] = 0;
735 fh = ns->nfiles; /* handle is simply = file number */
736 ns->nfiles++;
737
738 if (ns->nfiles % eslSSI_FCHUNK == 0) {
739 ESL_REALLOC(ns->filenames, sizeof(char *) * (ns->nfiles+eslSSI_FCHUNK));
740 for (i = ns->nfiles; i < ns->nfiles+eslSSI_FCHUNK; i++) ns->filenames[i] = NULL;
741 ESL_REALLOC(ns->fileformat, sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
742 ESL_REALLOC(ns->bpl, sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
743 ESL_REALLOC(ns->rpl, sizeof(uint32_t) * (ns->nfiles+eslSSI_FCHUNK));
744 }
745
746 *ret_fh = fh;
747 return eslOK;
748
749 ERROR:
750 *ret_fh = 0;
751 return status;
752 }
753
754
755
756 /* Function: esl_newssi_SetSubseq()
757 * Synopsis: Declare that file is suitable for fast subseq lookup.
758 *
759 * Purpose: Declare that the file associated with handle <fh> is
760 * suitable for fast subsequence lookup, because it has
761 * a constant number of residues and bytes per (nonterminal)
762 * data line, <rpl> and <bpl>, respectively.
763 *
764 * Caller is responsible for this being true: <rpl> and
765 * <bpl> must be constant for every nonterminal line of
766 * every sequence in this file.
767 *
768 * Args: <ns> - ssi index under construction
769 * <fh> - handle on file to set fast subseq lookup on
770 * <bpl> - constant bytes per nonterminal line in <fh>
771 * <rpl> - constant residues per nonterminal line in <fh>
772 *
773 * Returns: <eslOK> on success.
774 *
775 * Throws: <eslEINVAL> on invalid argument(s).
776 */
777 int
esl_newssi_SetSubseq(ESL_NEWSSI * ns,uint16_t fh,uint32_t bpl,uint32_t rpl)778 esl_newssi_SetSubseq(ESL_NEWSSI *ns, uint16_t fh, uint32_t bpl, uint32_t rpl)
779 {
780 int status;
781
782 if (fh >= ns->nfiles) ESL_XEXCEPTION(eslEINVAL, "invalid file number");
783 if (bpl <= 0 || rpl <= 0) ESL_XEXCEPTION(eslEINVAL, "invalid bpl or rpl");
784 ns->bpl[fh] = bpl;
785 ns->rpl[fh] = rpl;
786 return eslOK;
787
788 ERROR:
789 return status;
790 }
791
792
793 /* Function: esl_newssi_AddKey()
794 * Synopsis: Add a primary key to a growing index.
795 * Date: SRE, Tue Jan 2 11:50:54 2001 [St. Louis]
796 *
797 * Purpose: Register primary key <key> in new index <ns>, while telling
798 * the index that this primary key is in the file associated
799 * with filehandle <fh> (the handle returned by a previous call
800 * to <esl_newssi_AddFile()>); that its record starts at
801 * offset <r_off> in the file; that its data (usually
802 * sequence data) starts at offset <d_off> in the file (i.e.
803 * after any record header); and that the record's data is
804 * of length <L> (usually, the record is a sequence, and <L>
805 * is its length in residues).
806 *
807 * The data length <L> is technically optional as far as SSI
808 * is concerned; <L> may be passed as 0 to leave it
809 * unset. However, functions in the <sqio> module that use
810 * SSI indices will assume that <L> is available.
811 *
812 * <d_off> is also optional; it may be passed as <0> to
813 * leave it unset. If provided, <d_off> gives an offset to
814 * the data portion of the record. The interpretation of
815 * this data offset may be implementation-defined and may
816 * depend on the format of the datafile; for example, in how
817 * <sqio> uses SSI indices, <d_off> is the offset to the
818 * start of the first sequence line.
819 *
820 * Both <d_off> and <L> must be provided, and additionally
821 * <eslSSI_FASTSUBSEQ> must be set for this file, for fast
822 * subsequence lookup to work.
823 *
824 * Args: <ns> - active index
825 * <key> - primary key to add
826 * <fh> - handle on file that this key's in
827 * <r_off> - offset to start of record
828 * <d_off> - offset to start of sequence data, or 0
829 * <L> - length of sequence, or 0
830 *
831 * Returns: <eslOK> on success;
832 * <eslERANGE> if registering this key would exceed the maximum
833 * number of primary keys;
834 * <eslENOTFOUND> if we needed to open external tmp files, but
835 * the attempt to open them failed.
836 *
837 * Throws: <eslEINVAL> on an invalid argument;
838 * <eslEMEM> on allocation failure;
839 * <eslEWRITE> on any system error writing to tmp file, such
840 * as filling the filesystem.
841 */
842 int
esl_newssi_AddKey(ESL_NEWSSI * ns,const char * key,uint16_t fh,off_t r_off,off_t d_off,int64_t L)843 esl_newssi_AddKey(ESL_NEWSSI *ns, const char *key, uint16_t fh,
844 off_t r_off, off_t d_off, int64_t L)
845 {
846 int status;
847 int i;
848 int n; /* a string length */
849
850 if (fh >= eslSSI_MAXFILES) ESL_XEXCEPTION(eslEINVAL, "invalid fh");
851 if (ns->nprimary >= eslSSI_MAXKEYS) ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded maximum number of primary keys allowed");
852
853 /* Before adding the key: check how big our index is.
854 * If it's getting too large, switch to external mode.
855 */
856 if (!ns->external && current_newssi_size(ns) >= ns->max_ram)
857 if ((status = activate_external_sort(ns)) != eslOK) goto ERROR;
858
859 /* Update maximum pkey length, if needed. (Inclusive of '\0').
860 */
861 n = strlen(key)+1;
862 if (n > ns->plen) ns->plen = n;
863
864 /* External mode? Simply append to disk...
865 */
866 if (ns->external)
867 {
868 if (sizeof(off_t) == 4) {
869 if (fprintf(ns->ptmp, "%s\t%d\t%" PRIu32 "\t%" PRIu32 "\t%" PRIi64 "\n",
870 key, fh, (uint32_t) r_off, (uint32_t) d_off, L) <= 0)
871 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
872 } else {
873 if (fprintf(ns->ptmp, "%s\t%d\t%" PRIu64 "\t%" PRIu64 "\t%" PRIi64 "\n",
874 key, fh, (uint64_t) r_off, (uint64_t) d_off, L) <= 0)
875 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
876 }
877 ns->nprimary++;
878 }
879 else
880 {
881 /* Else: internal mode, keep keys in memory...
882 */
883 if ((status = esl_strdup(key, n, &(ns->pkeys[ns->nprimary].key))) != eslOK) goto ERROR;
884 ns->pkeys[ns->nprimary].fnum = fh;
885 ns->pkeys[ns->nprimary].r_off = r_off;
886 ns->pkeys[ns->nprimary].d_off = d_off;
887 ns->pkeys[ns->nprimary].len = L;
888 ns->nprimary++;
889
890 /* Reallocate as needed. */
891 if (ns->nprimary % eslSSI_KCHUNK == 0) {
892 ESL_REALLOC(ns->pkeys, sizeof(ESL_PKEY) * (ns->nprimary+eslSSI_KCHUNK));
893 for (i = ns->nprimary; i < ns->nprimary + eslSSI_KCHUNK; i++)
894 ns->pkeys[i].key = NULL;
895 }
896 }
897 return eslOK;
898
899 ERROR:
900 return status;
901 }
902
903 /* Function: esl_newssi_AddAlias()
904 * Synopsis: Add a secondary key (alias) to a growing index.
905 *
906 * Purpose: Registers secondary key <alias> in index <ns>, and
907 * map it to the primary key <key>. <key> must already
908 * have been registered. That is, when someone looks up <alias>,
909 * we'll retrieve record <key>.
910 *
911 * Args: <ns> - ssi index being constructed
912 * <alias> - secondary key to register
913 * <key> - primary key to associate with <skey>.
914 *
915 * Returns: <eslOK> on success;
916 * <eslERANGE> if registering this key would exceed the maximum
917 * number of secondary keys that can be stored;
918 * <eslENOTFOUND> if we needed to open external tmp files, but
919 * the attempt to open them failed.
920 *
921 * Throws: <eslEWRITE> on any system error writing to tmp file, such
922 * as running out of space on the device.
923 */
924 int
esl_newssi_AddAlias(ESL_NEWSSI * ns,const char * alias,const char * key)925 esl_newssi_AddAlias(ESL_NEWSSI *ns, const char *alias, const char *key)
926 {
927 int status;
928 int i;
929 int n; /* a string length */
930
931 if (ns->nsecondary >= eslSSI_MAXKEYS) ESL_XFAIL(eslERANGE, ns->errbuf, "exceeded maximum number of secondary keys allowed");
932
933 /* Before adding the key: check how big our index is.
934 * If it's getting too large, switch to external mode.
935 */
936 if (!ns->external && current_newssi_size(ns) >= ns->max_ram)
937 if ((status = activate_external_sort(ns)) != eslOK) goto ERROR;
938
939 /* Update maximum secondary key length, if necessary. */
940 n = strlen(alias)+1;
941 if (n > ns->slen) ns->slen = n;
942
943 /* if external mode: write info to disk. */
944 if (ns->external)
945 {
946 if (fprintf(ns->stmp, "%s\t%s\n", alias, key) <= 0) ESL_XEXCEPTION_SYS(eslEWRITE, "ssi alias tmp file write failed");
947 ns->nsecondary++;
948 }
949 else
950 { /* else, internal mode... store info in memory. */
951 if ((status = esl_strdup(alias, n, &(ns->skeys[ns->nsecondary].key))) != eslOK) goto ERROR;
952 if ((status = esl_strdup(key, -1, &(ns->skeys[ns->nsecondary].pkey))) != eslOK) goto ERROR;
953 ns->nsecondary++;
954
955 if (ns->nsecondary % eslSSI_KCHUNK == 0) {
956 ESL_REALLOC(ns->skeys, sizeof(ESL_SKEY) * (ns->nsecondary+eslSSI_KCHUNK));
957 for (i = ns->nsecondary; i < ns->nsecondary+eslSSI_KCHUNK; i++) {
958 ns->skeys[i].key = NULL;
959 ns->skeys[i].pkey = NULL;
960 }
961 }
962 }
963 return eslOK;
964
965 ERROR:
966 return status;
967 }
968
969
970 /* Function: esl_newssi_Write()
971 * Synopsis: Save a new index to an SSI file.
972 *
973 * Purpose: Writes the complete index <ns> in SSI format to its file,
974 * and closes the file.
975 *
976 * Handles all necessary overhead of sorting the primary and
977 * secondary keys, including any externally sorted tmpfiles that
978 * may have been needed for large indices.
979 *
980 * You only <_Write()> once. The open SSI file is closed.
981 * After calling <_Write()>, you should <_Close()> the
982 * <ESL_NEWSSI>.
983 *
984 * Verifies that all primary and secondary keys are unique.
985 * If not, returns <eslEDUP>.
986 *
987 * On any error, the SSI file <ns->ssifile> is deleted.
988 *
989 * Args: <ns> - new SSI index to write
990 *
991 * Returns: <eslOK> on success;
992 * <eslEDUP> if primary or secondary keys aren't all unique
993 * <eslERANGE> if index size exceeds system's maximum file size;
994 * <eslESYS> if any of the steps of an external sort fail.
995 *
996 * Throws: <eslEINVAL> on invalid argument, including too-long tmpfile names,
997 * or trying to _Write() the <ESL_NEWSSI> more than once;
998 * <eslEMEM> on buffer allocation failure;
999 * <eslEWRITE> on any system write failure, including filled disk.
1000 *
1001 * Note: It's O(1) memory to check for key duplications
1002 * here, after keys are sorted, compared to O(N) in
1003 * <esl_newssi_AddKey()>, where we would have to maintain a
1004 * hash of all previous N keys in memory.
1005 */
1006 int
esl_newssi_Write(ESL_NEWSSI * ns)1007 esl_newssi_Write(ESL_NEWSSI *ns)
1008 {
1009 int status, /* convention */
1010 i; /* counter over files, keys */
1011 uint32_t header_flags, /* bitflags in the header */
1012 file_flags, /* bitflags for a file record */
1013 frecsize, /* size of a file record (bytes) */
1014 precsize, /* size of a primary key record (bytes) */
1015 srecsize; /* size of a secondary key record (bytes) */
1016 off_t foffset, /* offset to file section */
1017 poffset, /* offset to primary key section */
1018 soffset; /* offset to secondary key section */
1019 char *fk = NULL, /* fixed-width (flen) file name */
1020 *pk = NULL, /* fixed-width (plen) primary key string */
1021 *sk = NULL, /* fixed-width (slen) secondary key string */
1022 *buf = NULL; /* esl_fgets() growable buffer */
1023 int n = 0; /* esl_fgets() buffer size */
1024 ESL_PKEY pkey; /* primary key info from external tmpfile */
1025 ESL_SKEY skey; /* secondary key info from external tmpfile */
1026
1027 if (ns->nsecondary > 0 && ns->slen == 0)
1028 ESL_EXCEPTION(eslEINVAL, "zero secondary key length: shouldn't happen");
1029 if (ns->ssifp == NULL)
1030 ESL_EXCEPTION(eslEINVAL, "SSI data were already written.");
1031
1032 /* We need fixed-width buffers to get our keys fwrite()'ten in their
1033 * full binary lengths; pkey->key (for instance) is not guaranteed
1034 * to be allocated for the final maximum plen. We use strncpy(), not
1035 * strcpy(), to fill these buffers, because strncpy() pads unused
1036 * bytes as NUL's, and valgrind will flag you if you attempt to
1037 * write uninitialized bytes from these buffers.
1038 */
1039 ESL_ALLOC(fk, sizeof(char) * ns->flen);
1040 ESL_ALLOC(pk, ESL_MAX(1, sizeof(char) * ns->plen));
1041 if (ns->nsecondary > 0) ESL_ALLOC(sk, sizeof(char) * ns->slen);
1042
1043 /* How big is the index? If it's going to be > 2GB, we better have
1044 * 64-bit offsets. (2047 (instead of 2048) gives us
1045 * some slop room.) If not, abort here.
1046 *
1047 * aborting here is pretty brutal - we've processed hundreds of
1048 * millions of keys for nothing. Ah well.
1049 */
1050 if (current_newssi_size(ns) >= 2047 && sizeof(off_t) != 8)
1051 ESL_XFAIL(eslERANGE, ns->errbuf, "SSI index file file would be > 2G; your filesystem isn't capable of handling it");
1052
1053 /* Magic-looking numbers come from adding up sizes
1054 * of things in bytes: they match current_newssi_size().
1055 */
1056 frecsize = 4*sizeof(uint32_t) + ns->flen;
1057 precsize = 2*sizeof(off_t) + sizeof(uint16_t) + sizeof(uint64_t) + ns->plen;
1058 srecsize = ns->slen + ns->plen;
1059 header_flags = 0;
1060
1061 /* Magic-looking numbers again come from adding up sizes
1062 * of things in bytes: matches current_newssi_size()
1063 */
1064 foffset = 9*sizeof(uint32_t)+2*sizeof(uint64_t)+sizeof(uint16_t)+3*sizeof(off_t);
1065 poffset = foffset + frecsize*ns->nfiles;
1066 soffset = poffset + precsize*ns->nprimary;
1067
1068 /* Sort the keys.
1069 * If external mode, make system calls to UNIX/POSIX "sort" in place, then
1070 * open new sorted files for reading thru ptmp and stmp handles.
1071 * If internal mode, call qsort.
1072 *
1073 * Note that you'd better force a POSIX locale for the sort; else,
1074 * some silly distro (e.g. Mandrake Linux >=8.1) may have specified
1075 * LC_COLLATE=en_US, and this'll give a sort "bug" in which it doesn't
1076 * sort by byte order.
1077 */
1078 if (ns->external)
1079 {
1080 char cmd[1024];
1081
1082 /* A last minute security check: make sure we won't overflow
1083 * sprintf() with the tmpfile names. They're hardcoded now, so
1084 * we know they don't overflow, but they might be configurable
1085 * in the future, and we wouldn't want a security hole to open
1086 * up.
1087 */
1088 if (strlen(ns->ptmpfile) > 256 || strlen(ns->ptmpfile) > 256)
1089 ESL_XEXCEPTION(eslEINVAL, "tmpfile name too long");
1090
1091 fclose(ns->ptmp);
1092 ns->ptmp = NULL;
1093 sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", ns->ptmpfile, ns->ptmpfile);
1094 if (system(cmd) != 0) ESL_XFAIL(eslESYS, ns->errbuf, "external sort of primary keys failed");
1095 if ((ns->ptmp = fopen(ns->ptmpfile, "r")) == NULL) ESL_XFAIL(eslESYS, ns->errbuf, "failed to reopen primary key tmp file after sort");
1096
1097 fclose(ns->stmp);
1098 ns->stmp = NULL;
1099 sprintf(cmd, "env LC_ALL=POSIX sort -o %s %s\n", ns->stmpfile, ns->stmpfile);
1100 if (system(cmd) != 0) ESL_XFAIL(eslESYS, ns->errbuf, "external sort of secondary keys failed");
1101 if ((ns->stmp = fopen(ns->stmpfile, "r")) == NULL) ESL_XFAIL(eslESYS, ns->errbuf, "failed to reopen secondary key tmp file after sort");
1102 }
1103 else
1104 {
1105 qsort((void *) ns->pkeys, ns->nprimary, sizeof(ESL_PKEY), pkeysort);
1106 qsort((void *) ns->skeys, ns->nsecondary, sizeof(ESL_SKEY), skeysort);
1107 }
1108
1109 /* Write the header
1110 */
1111 if (esl_fwrite_u32(ns->ssifp, v30magic) != eslOK ||
1112 esl_fwrite_u32(ns->ssifp, header_flags) != eslOK ||
1113 esl_fwrite_u32(ns->ssifp, sizeof(off_t)) != eslOK ||
1114 esl_fwrite_u16(ns->ssifp, ns->nfiles) != eslOK ||
1115 esl_fwrite_u64(ns->ssifp, ns->nprimary) != eslOK ||
1116 esl_fwrite_u64(ns->ssifp, ns->nsecondary)!= eslOK ||
1117 esl_fwrite_u32(ns->ssifp, ns->flen) != eslOK ||
1118 esl_fwrite_u32(ns->ssifp, ns->plen) != eslOK ||
1119 esl_fwrite_u32(ns->ssifp, ns->slen) != eslOK ||
1120 esl_fwrite_u32(ns->ssifp, frecsize) != eslOK ||
1121 esl_fwrite_u32(ns->ssifp, precsize) != eslOK ||
1122 esl_fwrite_u32(ns->ssifp, srecsize) != eslOK ||
1123 esl_fwrite_offset(ns->ssifp, foffset) != eslOK ||
1124 esl_fwrite_offset(ns->ssifp, poffset) != eslOK ||
1125 esl_fwrite_offset(ns->ssifp, soffset) != eslOK)
1126 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1127
1128 /* Write the file section
1129 */
1130 for (i = 0; i < ns->nfiles; i++)
1131 {
1132 file_flags = 0;
1133 if (ns->bpl[i] > 0 && ns->rpl[i] > 0) file_flags |= eslSSI_FASTSUBSEQ;
1134 strncpy(fk, ns->filenames[i], ns->flen);
1135
1136 if (fwrite(fk, sizeof(char), ns->flen, ns->ssifp) != ns->flen ||
1137 esl_fwrite_u32(ns->ssifp, ns->fileformat[i]) != eslOK ||
1138 esl_fwrite_u32(ns->ssifp, file_flags) != eslOK ||
1139 esl_fwrite_u32(ns->ssifp, ns->bpl[i]) != eslOK ||
1140 esl_fwrite_u32(ns->ssifp, ns->rpl[i]) != eslOK)
1141 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1142 }
1143
1144 /* Write the primary key section
1145 */
1146 if (ns->external)
1147 {
1148 if (ns->nprimary) strncpy(pk, "", ns->plen);
1149 for (i = 0; i < ns->nprimary; i++)
1150 {
1151 if (esl_fgets(&buf, &n, ns->ptmp) != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "read from sorted primary key tmpfile failed");
1152 if (parse_pkey(buf, &pkey) != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "parse failed for a line of sorted primary key tmpfile failed");
1153 if (strcmp(pk, pkey.key) == 0) ESL_XFAIL(eslEDUP, ns->errbuf, "primary keys not unique: '%s' occurs more than once", pkey.key);
1154 strncpy(pk, pkey.key, ns->plen); // strncpy() pads w/ nulls, and we count on that behavior.
1155
1156 if (fwrite(pk,sizeof(char),ns->plen,ns->ssifp) != ns->plen ||
1157 esl_fwrite_u16( ns->ssifp, pkey.fnum) != eslOK ||
1158 esl_fwrite_offset(ns->ssifp, pkey.r_off) != eslOK ||
1159 esl_fwrite_offset(ns->ssifp, pkey.d_off) != eslOK ||
1160 esl_fwrite_i64( ns->ssifp, pkey.len) != eslOK)
1161 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1162 }
1163 }
1164 else
1165 {
1166 if (ns->nprimary) strncpy(pk, "", ns->plen);
1167 for (i = 0; i < ns->nprimary; i++)
1168 {
1169 if (strcmp(pk, ns->pkeys[i].key) == 0) ESL_XFAIL(eslEDUP, ns->errbuf, "primary keys not unique: '%s' occurs more than once", ns->pkeys[i].key);
1170 strncpy(pk, ns->pkeys[i].key, ns->plen);
1171
1172 if (fwrite(pk,sizeof(char),ns->plen,ns->ssifp) != ns->plen ||
1173 esl_fwrite_u16( ns->ssifp, ns->pkeys[i].fnum) != eslOK ||
1174 esl_fwrite_offset(ns->ssifp, ns->pkeys[i].r_off) != eslOK ||
1175 esl_fwrite_offset(ns->ssifp, ns->pkeys[i].d_off) != eslOK ||
1176 esl_fwrite_i64( ns->ssifp, ns->pkeys[i].len) != eslOK)
1177 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1178 }
1179 }
1180
1181
1182 /* Write the secondary key section
1183 */
1184 if (ns->external)
1185 {
1186 if (ns->nsecondary) strncpy(sk, "", ns->slen);
1187 for (i = 0; i < ns->nsecondary; i++)
1188 {
1189 if (esl_fgets(&buf, &n, ns->stmp) != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "read from sorted secondary key tmpfile failed");
1190 if (parse_skey(buf, &skey) != eslOK) ESL_XFAIL(eslESYS, ns->errbuf, "parse failed for a line of sorted secondary key tmpfile failed");
1191 if (strcmp(sk, skey.key) == 0) ESL_XFAIL(eslEDUP, ns->errbuf, "secondary keys not unique: '%s' occurs more than once", skey.key);
1192 strncpy(sk, skey.key, ns->slen); // slen > 0 if there are any secondary keys.
1193 strncpy(pk, skey.pkey, ns->plen);
1194
1195 if (fwrite(sk, sizeof(char), ns->slen, ns->ssifp) != ns->slen ||
1196 fwrite(pk, sizeof(char), ns->plen, ns->ssifp) != ns->plen)
1197 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1198 }
1199 }
1200 else
1201 {
1202 /* if ns->nsecondary=0, ns->slen=0 and sk=NULL */
1203 if (ns->nsecondary) strncpy(sk, "", ns->slen);
1204 for (i = 0; i < ns->nsecondary; i++)
1205 {
1206 if (strcmp(sk, ns->skeys[i].key) == 0) ESL_XFAIL(eslEDUP, ns->errbuf, "secondary keys not unique: '%s' occurs more than once", ns->skeys[i].key);
1207 strncpy(sk, ns->skeys[i].key, ns->slen);
1208 strncpy(pk, ns->skeys[i].pkey, ns->plen);
1209
1210 if (fwrite(sk, sizeof(char), ns->slen, ns->ssifp) != ns->slen ||
1211 fwrite(pk, sizeof(char), ns->plen, ns->ssifp) != ns->plen)
1212 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi write failed");
1213 }
1214 }
1215
1216 fclose(ns->ssifp); // Closing <ssifp> makes it so we can only _Write() once.
1217 ns->ssifp = NULL;
1218 if (fk) free(fk);
1219 if (pk) free(pk);
1220 if (sk) free(sk);
1221 if (buf) free(buf);
1222 if (ns->ptmp) { fclose(ns->ptmp); ns->ptmp = NULL; }
1223 if (ns->stmp) { fclose(ns->stmp); ns->stmp = NULL; }
1224 return eslOK;
1225
1226 ERROR:
1227 remove(ns->ssifile); // Cleanup: delete failed <ssifile> on any error.
1228 if (ns->ssifp) { fclose(ns->ssifp); ns->ssifp = NULL; }
1229 if (fk) free(fk);
1230 if (pk) free(pk);
1231 if (sk) free(sk);
1232 if (buf) free(buf);
1233 if (ns->ptmp) { fclose(ns->ptmp); ns->ptmp = NULL; }
1234 if (ns->stmp) { fclose(ns->stmp); ns->stmp = NULL; }
1235 return status;
1236 }
1237
1238 /* Function: esl_newssi_Close()
1239 * Synopsis: Free an <ESL_NEWSSI>.
1240 *
1241 * Purpose: Frees a <ESL_NEWSSI>.
1242 */
1243 void
esl_newssi_Close(ESL_NEWSSI * ns)1244 esl_newssi_Close(ESL_NEWSSI *ns)
1245 {
1246 int i;
1247 if (ns == NULL) return;
1248
1249 if (ns->external == FALSE)
1250 {
1251 if (ns->pkeys != NULL)
1252 {
1253 for (i = 0; i < ns->nprimary; i++)
1254 if (ns->pkeys[i].key != NULL) free(ns->pkeys[i].key);
1255 free(ns->pkeys);
1256 }
1257 if (ns->skeys != NULL)
1258 {
1259 for (i = 0; i < ns->nsecondary; i++)
1260 {
1261 if (ns->skeys[i].key != NULL) free(ns->skeys[i].key);
1262 if (ns->skeys[i].pkey != NULL) free(ns->skeys[i].pkey);
1263 }
1264 free(ns->skeys);
1265 }
1266 }
1267 else
1268 {
1269 remove(ns->ptmpfile);
1270 remove(ns->stmpfile);
1271 }
1272
1273 if (ns->filenames != NULL)
1274 {
1275 for (i = 0; i < ns->nfiles; i++)
1276 if (ns->filenames[i] != NULL) free(ns->filenames[i]);
1277 free(ns->filenames);
1278 }
1279
1280 if (ns->stmp) fclose(ns->stmp);
1281 if (ns->stmpfile) free(ns->stmpfile);
1282 if (ns->ptmp) fclose(ns->ptmp);
1283 if (ns->ptmpfile) free(ns->ptmpfile);
1284 if (ns->fileformat) free(ns->fileformat);
1285 if (ns->bpl) free(ns->bpl);
1286 if (ns->rpl) free(ns->rpl);
1287 if (ns->ssifile) free(ns->ssifile);
1288 if (ns->ssifp) fclose(ns->ssifp);
1289 free(ns);
1290 }
1291
1292
1293
1294
1295 /* current_newssi_size()
1296 *
1297 * Calculates the size of the current index, in megabytes, in
1298 * its disk version (which is essentially the same as the
1299 * RAM it takes, modulo some small overhead for the structures
1300 * and ptrs).
1301 *
1302 * The header costs 10 uint32, 1 uint16, and 3 off_t's: 42 + (12 | 24).
1303 * Each file record costs 4 uint32 and flen chars;
1304 * each primary key costs us 2 off_t, 1 uint16, 1 uint32, and plen chars;
1305 * each sec key costs us plen+slen chars.
1306 */
1307 static int
current_newssi_size(const ESL_NEWSSI * ns)1308 current_newssi_size(const ESL_NEWSSI *ns)
1309 {
1310 uint64_t frecsize, precsize, srecsize;
1311 uint64_t total;
1312
1313 /* Magic-looking numbers come from adding up sizes
1314 * of things in bytes
1315 */
1316 frecsize = 4*sizeof(uint32_t) + ns->flen;
1317 precsize = 2*sizeof(off_t) + sizeof(uint16_t) + sizeof(uint64_t) + ns->plen;
1318 srecsize = ns->slen + ns->plen;
1319 total = (9*sizeof(uint32_t)+2*sizeof(uint64_t)+sizeof(uint16_t)+3*sizeof(off_t)+
1320 frecsize * ns->nfiles + /* file section size */
1321 precsize * ns->nprimary + /* primary key section size */
1322 srecsize * ns->nsecondary) / /* secondary key section size */
1323 1048576L;
1324 return (int) total;
1325 }
1326
1327 /* activate_external_sort()
1328 *
1329 * Switch to external sort mode.
1330 * Open file handles for external index files (ptmp, stmp).
1331 * Flush current index information to these files.
1332 * Free current memory, turn over control to the tmpfiles.
1333 *
1334 * Return <eslOK> on success;
1335 * <eslENOTFOUND> if we can't open a tmpfile for writing.
1336 *
1337 * Throw <eslEWRITE> if a write fails.
1338 */
1339 static int
activate_external_sort(ESL_NEWSSI * ns)1340 activate_external_sort(ESL_NEWSSI *ns)
1341 {
1342 int status;
1343 int i;
1344
1345 if (ns->external) return eslOK; /* we already are external, fool */
1346
1347 if ((ns->ptmp = fopen(ns->ptmpfile, "w")) == NULL) ESL_XFAIL(eslENOTFOUND, ns->errbuf, "Failed to open primary key tmpfile for external sort");
1348 if ((ns->stmp = fopen(ns->stmpfile, "w")) == NULL) ESL_XFAIL(eslENOTFOUND, ns->errbuf, "Failed to open secondary key tmpfile for external sort");
1349
1350 /* Flush the current indices. */
1351 for (i = 0; i < ns->nprimary; i++) {
1352 if (sizeof(off_t) == 4) {
1353 if (fprintf(ns->ptmp, "%s\t%u\t%lu\t%lu\t%lu\n",
1354 ns->pkeys[i].key,
1355 (unsigned int) ns->pkeys[i].fnum,
1356 (unsigned long) ns->pkeys[i].r_off,
1357 (unsigned long) ns->pkeys[i].d_off,
1358 (unsigned long) ns->pkeys[i].len) <= 0)
1359 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
1360 } else {
1361 if (fprintf(ns->ptmp, "%s\t%u\t%llu\t%llu\t%lu\n",
1362 ns->pkeys[i].key,
1363 (unsigned int) ns->pkeys[i].fnum,
1364 (unsigned long long) ns->pkeys[i].r_off,
1365 (unsigned long long) ns->pkeys[i].d_off,
1366 (unsigned long) ns->pkeys[i].len) <= 0)
1367 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi key tmp file write failed");
1368 }
1369 }
1370 for (i = 0; i < ns->nsecondary; i++)
1371 if (fprintf(ns->stmp, "%s\t%s\n", ns->skeys[i].key, ns->skeys[i].pkey) <= 0)
1372 ESL_XEXCEPTION_SYS(eslEWRITE, "ssi alias tmp file write failed");
1373
1374 /* Free the memory now that we've flushed our lists to disk
1375 */
1376 for (i = 0; i < ns->nprimary; i++) free(ns->pkeys[i].key);
1377 for (i = 0; i < ns->nsecondary; i++) free(ns->skeys[i].key);
1378 for (i = 0; i < ns->nsecondary; i++) free(ns->skeys[i].pkey);
1379 if (ns->pkeys != NULL) free(ns->pkeys);
1380 if (ns->skeys != NULL) free(ns->skeys);
1381 ns->pkeys = NULL;
1382 ns->skeys = NULL;
1383 ns->external = TRUE;
1384 return eslOK;
1385
1386 ERROR:
1387 if (ns->ptmp != NULL) { fclose(ns->ptmp); ns->ptmp = NULL; }
1388 if (ns->stmp != NULL) { fclose(ns->stmp); ns->stmp = NULL; }
1389 return status;
1390 }
1391
1392 /* parse_pkey(), parse_skey()
1393 *
1394 * Given a <buf> containing a line read from the external
1395 * primary-key or secondary-key tmpfile; parse it, and fill in the fields of
1396 * <pkey> or <skey>
1397 *
1398 * <?key> is a ptr to a structure on the stack. It is assumed
1399 * to be in use only transiently.
1400 * <?key>'s strings become ptrs into <buf>'s space, so we don't have to
1401 * allocate new space for them. This means that the transient <?key> structure
1402 * is only usable until <buf> is modified or free'd.
1403 *
1404 * Returns <eslOK> on success.
1405 *
1406 * Throws <eslEFORMAT> on parse error (shouldn't happen; we created it!)
1407 * <eslEINCONCEIVABLE> if we can't deal with off_t's size.
1408 */
1409 static int
parse_pkey(char * buf,ESL_PKEY * pkey)1410 parse_pkey(char *buf, ESL_PKEY *pkey)
1411 {
1412 int status;
1413 char *s, *tok;
1414
1415 s = buf;
1416 if (esl_strtok(&s, "\t\n", &(pkey->key)) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1417 if (esl_strtok(&s, "\t\n", &tok) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1418
1419 pkey->fnum = (uint16_t) atoi(tok);
1420 if (esl_strtok(&s, "\t\n", &tok) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1421 if (sizeof(off_t) == 4) pkey->r_off = (off_t) strtoul (tok, NULL, 10);
1422 else if (sizeof(off_t) == 8) pkey->r_off = (off_t) strtoull(tok, NULL, 10);
1423 else ESL_XEXCEPTION(eslEINCONCEIVABLE, "whoa - weird off_t");
1424
1425 if (esl_strtok(&s, "\t\n", &tok) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1426 if (sizeof(off_t) == 4) pkey->d_off = (off_t) strtoul (tok, NULL, 10);
1427 else if (sizeof(off_t) == 8) pkey->d_off = (off_t) strtoull(tok, NULL, 10);
1428 else ESL_XEXCEPTION(eslEINCONCEIVABLE, "whoa - weird off_t");
1429
1430 if (esl_strtok(&s, "\t\n", &tok) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1431 pkey->len = (uint64_t) strtoull(tok, NULL, 10);
1432 return eslOK;
1433
1434 ERROR:
1435 return status;
1436 }
1437 static int
parse_skey(char * buf,ESL_SKEY * skey)1438 parse_skey(char *buf, ESL_SKEY *skey)
1439 {
1440 int status;
1441 char *s;
1442
1443 s = buf;
1444 if (esl_strtok(&s, "\t\n", &(skey->key)) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1445 if (esl_strtok(&s, "\t\n", &(skey->pkey)) != eslOK) ESL_XEXCEPTION(eslEFORMAT, "parse failed");
1446 return eslOK;
1447
1448 ERROR:
1449 return status;
1450 }
1451
1452 /* ordering functions needed for qsort() */
1453 static int
pkeysort(const void * k1,const void * k2)1454 pkeysort(const void *k1, const void *k2)
1455 {
1456 ESL_PKEY *key1;
1457 ESL_PKEY *key2;
1458 key1 = (ESL_PKEY *) k1;
1459 key2 = (ESL_PKEY *) k2;
1460 return strcmp(key1->key, key2->key);
1461 }
1462 static int
skeysort(const void * k1,const void * k2)1463 skeysort(const void *k1, const void *k2)
1464 {
1465 ESL_SKEY *key1;
1466 ESL_SKEY *key2;
1467 key1 = (ESL_SKEY *) k1;
1468 key2 = (ESL_SKEY *) k2;
1469 return strcmp(key1->key, key2->key);
1470 }
1471
1472
1473 /*****************************************************************
1474 *# 3. Portable binary i/o
1475 *****************************************************************/
1476
1477 /* Function: esl_byteswap()
1478 * Synopsis: Swap between big-endian and little-endian, in place.
1479 *
1480 * Purpose: Swap between big-endian and little-endian, in place.
1481 */
1482 void
esl_byteswap(char * swap,int nbytes)1483 esl_byteswap(char *swap, int nbytes)
1484 {
1485 int x;
1486 char byte;
1487
1488 for (x = 0; x < nbytes / 2; x++)
1489 {
1490 byte = swap[nbytes - x - 1];
1491 swap[nbytes - x - 1] = swap[x];
1492 swap[x] = byte;
1493 }
1494 }
1495
1496 /* Function: esl_ntoh16()
1497 * Synopsis: Convert 2-byte integer from network-order to host-order.
1498 *
1499 * Purpose: Convert a 2-byte integer from network-order to host-order,
1500 * and return it.
1501 *
1502 * <esl_ntoh32()> and <esl_ntoh64()> do the same, but for 4-byte
1503 * and 8-byte integers, respectively.
1504 */
1505 uint16_t
esl_ntoh16(uint16_t netshort)1506 esl_ntoh16(uint16_t netshort)
1507 {
1508 #ifdef WORDS_BIGENDIAN
1509 return netshort;
1510 #else
1511 esl_byteswap((char *) &netshort, 2);
1512 return netshort;
1513 #endif
1514 }
1515 uint32_t
esl_ntoh32(uint32_t netlong)1516 esl_ntoh32(uint32_t netlong)
1517 {
1518 #ifdef WORDS_BIGENDIAN
1519 return netlong;
1520 #else
1521 esl_byteswap((char *) &netlong, 4);
1522 return netlong;
1523 #endif
1524 }
1525 uint64_t
esl_ntoh64(uint64_t net_int64)1526 esl_ntoh64(uint64_t net_int64)
1527 {
1528 #ifdef WORDS_BIGENDIAN
1529 return net_int64;
1530 #else
1531 esl_byteswap((char *) &net_int64, 8);
1532 return net_int64;
1533 #endif
1534 }
1535
1536 /* Function: esl_hton16()
1537 * Synopsis: Convert 2-byte integer from host-order to network-order.
1538 *
1539 * Purpose: Convert a 2-byte integer from host-order to network-order, and
1540 * return it.
1541 *
1542 * <esl_hton32()> and <esl_hton64()> do the same, but for 4-byte
1543 * and 8-byte integers, respectively.
1544 */
1545 uint16_t
esl_hton16(uint16_t hostshort)1546 esl_hton16(uint16_t hostshort)
1547 {
1548 #ifdef WORDS_BIGENDIAN
1549 return hostshort;
1550 #else
1551 esl_byteswap((char *) &hostshort, 2);
1552 return hostshort;
1553 #endif
1554 }
1555 uint32_t
esl_hton32(uint32_t hostlong)1556 esl_hton32(uint32_t hostlong)
1557 {
1558 #ifdef WORDS_BIGENDIAN
1559 return hostlong;
1560 #else
1561 esl_byteswap((char *) &hostlong, 4);
1562 return hostlong;
1563 #endif
1564 }
1565 uint64_t
esl_hton64(uint64_t host_int64)1566 esl_hton64(uint64_t host_int64)
1567 {
1568 #ifdef WORDS_BIGENDIAN
1569 return host_int64;
1570 #else
1571 esl_byteswap((char *) &host_int64, 8);
1572 return host_int64;
1573 #endif
1574 }
1575
1576
1577 /* Function: esl_fread_u16()
1578 * Synopsis: Read network-order integer from a stream.
1579 *
1580 * Purpose: Read a 2-byte network-order integer from <fp>, convert to
1581 * host order, leave it in <ret_result>.
1582 *
1583 * <esl_fread_u32()> and <esl_fread_u64()> do the same, but
1584 * for 4-byte and 8-byte integers, respectively.
1585 *
1586 * Returns: <eslOK> on success, and <eslFAIL> on <fread()> failure.
1587 */
1588 int
esl_fread_u16(FILE * fp,uint16_t * ret_result)1589 esl_fread_u16(FILE *fp, uint16_t *ret_result)
1590 {
1591 uint16_t result;
1592 if (fread(&result, sizeof(uint16_t), 1, fp) != 1) return eslFAIL;
1593 *ret_result = esl_ntoh16(result);
1594 return eslOK;
1595 }
1596 int
esl_fread_u32(FILE * fp,uint32_t * ret_result)1597 esl_fread_u32(FILE *fp, uint32_t *ret_result)
1598 {
1599 uint32_t result;
1600 if (fread(&result, sizeof(uint32_t), 1, fp) != 1) return eslFAIL;
1601 *ret_result = esl_ntoh32(result);
1602 return eslOK;
1603 }
1604 int
esl_fread_u64(FILE * fp,uint64_t * ret_result)1605 esl_fread_u64(FILE *fp, uint64_t *ret_result)
1606 {
1607 uint64_t result;
1608 if (fread(&result, sizeof(uint64_t), 1, fp) != 1) return eslFAIL;
1609 *ret_result = esl_ntoh64(result);
1610 return eslOK;
1611 }
1612 int
esl_fread_i16(FILE * fp,int16_t * ret_result)1613 esl_fread_i16(FILE *fp, int16_t *ret_result)
1614 {
1615 int16_t result;
1616 if (fread(&result, sizeof(int16_t), 1, fp) != 1) return eslFAIL;
1617 *ret_result = (int16_t) esl_ntoh16((uint16_t) result);
1618 return eslOK;
1619 }
1620 int
esl_fread_i32(FILE * fp,int32_t * ret_result)1621 esl_fread_i32(FILE *fp, int32_t *ret_result)
1622 {
1623 int32_t result;
1624 if (fread(&result, sizeof(int32_t), 1, fp) != 1) return eslFAIL;
1625 *ret_result = (int32_t) esl_ntoh32((uint32_t) result);
1626 return eslOK;
1627 }
1628 int
esl_fread_i64(FILE * fp,int64_t * ret_result)1629 esl_fread_i64(FILE *fp, int64_t *ret_result)
1630 {
1631 int64_t result;
1632 if (fread(&result, sizeof(int64_t), 1, fp) != 1) return eslFAIL;
1633 *ret_result = (int64_t) esl_ntoh64((uint64_t) result);
1634 return eslOK;
1635 }
1636
1637
1638 /* Function: esl_fwrite_u16()
1639 * Synopsis: Write an integer to a stream in network-order.
1640 *
1641 * Purpose: Write a 2-byte host-order integer <n> to stream <fp>
1642 * in network order.
1643 *
1644 * <esl_fwrite_u32()> and <esl_fwrite_u64()> do the same, but
1645 * for 4-byte and 8-byte integers, respectively.
1646 *
1647 * Returns: <eslOK> on success, and <eslFAIL> on <fwrite()> failure.
1648 */
1649 int
esl_fwrite_u16(FILE * fp,uint16_t n)1650 esl_fwrite_u16(FILE *fp, uint16_t n)
1651 {
1652 n = esl_hton16(n);
1653 if (fwrite(&n, sizeof(uint16_t), 1, fp) != 1) return eslFAIL;
1654 return eslOK;
1655 }
1656 int
esl_fwrite_u32(FILE * fp,uint32_t n)1657 esl_fwrite_u32(FILE *fp, uint32_t n)
1658 {
1659 n = esl_hton32(n);
1660 if (fwrite(&n, sizeof(uint32_t), 1, fp) != 1) return eslFAIL;
1661 return eslOK;
1662 }
1663 int
esl_fwrite_u64(FILE * fp,uint64_t n)1664 esl_fwrite_u64(FILE *fp, uint64_t n)
1665 {
1666 n = esl_hton64(n);
1667 if (fwrite(&n, sizeof(uint64_t), 1, fp) != 1) return eslFAIL;
1668 return eslOK;
1669 }
1670 int
esl_fwrite_i16(FILE * fp,int16_t n)1671 esl_fwrite_i16(FILE *fp, int16_t n)
1672 {
1673 n = (int16_t) esl_hton16((uint16_t) n);
1674 if (fwrite(&n, sizeof(int16_t), 1, fp) != 1) return eslFAIL;
1675 return eslOK;
1676 }
1677 int
esl_fwrite_i32(FILE * fp,int32_t n)1678 esl_fwrite_i32(FILE *fp, int32_t n)
1679 {
1680 n = (int32_t) esl_hton32((uint32_t) n);
1681 if (fwrite(&n, sizeof(int32_t), 1, fp) != 1) return eslFAIL;
1682 return eslOK;
1683 }
1684 int
esl_fwrite_i64(FILE * fp,int64_t n)1685 esl_fwrite_i64(FILE *fp, int64_t n)
1686 {
1687 n = (int64_t) esl_hton64((uint64_t) n);
1688 if (fwrite(&n, sizeof(int64_t), 1, fp) != 1) return eslFAIL;
1689 return eslOK;
1690 }
1691
1692
1693 /* Function: esl_fread_offset()
1694 * Synopsis: Read an offset portably.
1695 *
1696 * Purpose: Read a file offset from the stream <fp> (which would usually
1697 * be a save file), and store it in <ret_offset>.
1698 *
1699 * Offsets may have been saved by a different machine
1700 * than the machine that reads them. The writer and the reader
1701 * may differ in byte order and in width (<sizeof(off_t)>).
1702 *
1703 * Byte order is dealt with by saving offsets in
1704 * network byte order, and converting them to host byte order
1705 * when they are read (if necessary).
1706 *
1707 * Width is dealt with by the <sz> argument, which must be
1708 * either 4 or 8, specifying that the saved offset is a
1709 * 32-bit versus 64-bit <off_t>. If the reading host
1710 * <off_t> width matches the <sz> of the writer, no
1711 * problem. If <sz> is 4 but the reading host has 64-bit
1712 * <off_t>'s, this is also no problem; the conversion
1713 * always works. If <sz> is 64 but the reading host has
1714 * only 32-bit <off_t>, we cannot guarantee that we have
1715 * sufficient dynamic range to represent the offset; if
1716 * the stored offset is too large to represent in a 32-bit
1717 * offset, we throw a fatal <eslEINCOMPAT> error.
1718 *
1719 * Returns: <eslOK> on success; <eslFAIL> on a read failure.
1720 *
1721 * Throws: <eslEINVAL> if <sz> is something other than 4 or 8;
1722 * <eslEINCOMPAT> if the stored offset is too large for
1723 * the reader to represent (the machine that wrote the
1724 * SSI file used 64 bit offsets, the reader uses 32
1725 * bit offsets, and this offset is too large to represent
1726 * in a 32 bit offset).
1727 */
1728 int
esl_fread_offset(FILE * fp,int sz,off_t * ret_offset)1729 esl_fread_offset(FILE *fp, int sz, off_t *ret_offset)
1730 {
1731 int status;
1732 uint32_t x32;
1733 uint64_t x64;
1734
1735 if (sz == 8)
1736 {
1737 if (esl_fread_u64(fp, &x64) != eslOK) { status = eslFAIL; goto ERROR; }
1738 if (sizeof(off_t) == 4 && x64 > INT32_MAX)
1739 ESL_XEXCEPTION(eslEINCOMPAT, "can't read 64-bit off_t on this 32-bit host");
1740 *ret_offset = (off_t) x64;
1741 }
1742 else if (sz == 4)
1743 {
1744 if (esl_fread_u32(fp, &x32) != eslOK) { status = eslFAIL; goto ERROR; }
1745 *ret_offset = (off_t) x32;
1746 }
1747 else ESL_XEXCEPTION(eslEINVAL, "offsets must be 32 or 64 bits");
1748 return eslOK;
1749
1750 ERROR:
1751 *ret_offset = 0;
1752 return status;
1753 }
1754
1755 /* Function: esl_fwrite_offset()
1756 * Synopsis: Write an offset portably.
1757 *
1758 * Purpose: Portably write (save) <offset> to the stream <fp>, in network
1759 * byte order.
1760 *
1761 * Returns: <eslOK> on success; <eslFAIL> on write failure.
1762 *
1763 * Throws: <eslEINVAL> if <off_t> is something other than a 32-bit or
1764 * 64-bit integer on this machine, in which case we don't know
1765 * how to deal with it portably.
1766 */
1767 int
esl_fwrite_offset(FILE * fp,off_t offset)1768 esl_fwrite_offset(FILE *fp, off_t offset)
1769 {
1770 if (sizeof(off_t) == 4) return esl_fwrite_u32(fp, offset);
1771 else if (sizeof(off_t) == 8) return esl_fwrite_u64(fp, offset);
1772 else ESL_EXCEPTION(eslEINVAL, "off_t is neither 32-bit nor 64-bit");
1773 /*UNREACHED*/
1774 return eslEINCONCEIVABLE;
1775 }
1776
1777
1778 /*****************************************************************
1779 * 5. Unit tests
1780 *****************************************************************/
1781 #ifdef eslSSI_TESTDRIVE
1782
1783 #include "esl_arr2.h"
1784 #include "esl_getopts.h"
1785 #include "esl_sq.h"
1786 #include "esl_sqio.h"
1787 #include "esl_random.h"
1788 #include "esl_randomseq.h"
1789
1790 struct ssi_testdata {
1791 int nfiles; // generate this many files...
1792 int nseq; // ... with this many sequences per file ...
1793 int maxL; // ... with sequence lengths 1..maxL.
1794
1795 char **sqfile; // seq file names, [0..nfiles-1]
1796 char **seqname; // seq names, [0..nfiles*nseq-1] = "seq%d-file%d"
1797 char **seqdesc; // seq descriptions, [0..nfiles*nseq-1] = "desc%d-file%d", used as secondary keys
1798 int *seqlen; // seq lengths, [0..nfiles*nseq-1]
1799 char **seq;
1800 };
1801
1802 static struct ssi_testdata *
ssi_testdata_create(ESL_RANDOMNESS * rng,int max_nfiles,int max_nseq,int maxL,int do_dupname)1803 ssi_testdata_create(ESL_RANDOMNESS *rng, int max_nfiles, int max_nseq, int maxL, int do_dupname)
1804 {
1805 char msg[] = "esl_ssi test data creation failed";
1806 struct ssi_testdata *td;
1807 double p[4] = { 0.25, 0.25, 0.25, 0.25 }; // composition of random generated DNA seqs
1808 FILE *fp;
1809 ESL_SQ *sq;
1810 int i,j;
1811
1812 if ( (td = malloc(sizeof(struct ssi_testdata))) == NULL) esl_fatal(msg);
1813 td->nfiles = 1 + esl_rnd_Roll(rng, max_nfiles); // 1..max_nfiles
1814 td->nseq = 2 + esl_rnd_Roll(rng, max_nseq-1); // 2..max_nseq Need at least 2 for dup test to always be valid
1815 td->maxL = maxL;
1816
1817 /* Create <td->nfiles> sequence tmpfile names. */
1818 if ( (td->sqfile = malloc(sizeof(char *) * td->nfiles)) == NULL) esl_fatal(msg);
1819 for (j = 0; j < td->nfiles; j++)
1820 if ( esl_sprintf(&(td->sqfile[j]), "esltmpXXXXXX" ) != eslOK) esl_fatal(msg);
1821
1822 /* Create <td->nfiles*td->nseq> sequences with random lengths up to <maxL> */
1823 if ( (td->seq = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1824 if ( (td->seqname = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1825 if ( (td->seqlen = malloc(sizeof(int) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1826 if ( (td->seqdesc = malloc(sizeof(char *) * td->nseq * td->nfiles)) == NULL) esl_fatal(msg);
1827 for (i = 0; i < td->nseq*td->nfiles; i++)
1828 {
1829 td->seqlen[i] = 1 + esl_rnd_Roll(rng, td->maxL); /* 1..maxL */
1830 if ( (td->seq[i] = malloc(sizeof(char) * (td->seqlen[i]+1))) == NULL) esl_fatal(msg);
1831 if ( esl_rsq_IID(rng, "ACGT", p, 4, td->seqlen[i], td->seq[i]) != eslOK) esl_fatal(msg);
1832 if ( esl_sprintf(&(td->seqname[i]), "seq%d-file%d", i, i/td->nseq) != eslOK) esl_fatal(msg);
1833 if ( esl_sprintf(&(td->seqdesc[i]), "desc%d-file%d", i, i/td->nseq) != eslOK) esl_fatal(msg);
1834 }
1835
1836 /* If we were asked to poison with a duplicate key, do it (randomly duplicating a primary or secondary key) */
1837 if (do_dupname)
1838 {
1839 i = esl_rnd_Roll(rng, td->nseq * td->nfiles - 1); // 0..n-2
1840 j = i + 1 + esl_rnd_Roll(rng, td->nseq * td->nfiles - i - 1); // i+1..n-1
1841 if (esl_rnd_Roll(rng, 2) == 0) {
1842 strcpy(td->seqname[i], "DUP"); // Allocated space is guaranteed to be enough,
1843 strcpy(td->seqname[j], "DUP"); // because the original name was "seq%d-file%d"
1844 } else {
1845 strcpy(td->seqdesc[i], "DUP");
1846 strcpy(td->seqdesc[j], "DUP");
1847 }
1848 }
1849
1850 /* Save them to files. */
1851 for (j = 0; j < td->nfiles; j++)
1852 {
1853 if (esl_tmpfile_named(td->sqfile[j], &fp) != eslOK) esl_fatal(msg);
1854 for (i = j*td->nseq; i < (j+1)*td->nseq; i++)
1855 {
1856 if ( (sq = esl_sq_CreateFrom(td->seqname[i], td->seq[i], td->seqdesc[i], NULL, NULL)) == NULL) esl_fatal(msg);
1857 if ( esl_sqio_Write(fp, sq, eslSQFILE_FASTA, FALSE) != eslOK) esl_fatal(msg);
1858 esl_sq_Destroy(sq);
1859 }
1860 fclose(fp);
1861 }
1862
1863 return td;
1864 }
1865
1866 static void
ssi_testdata_destroy(struct ssi_testdata * td)1867 ssi_testdata_destroy(struct ssi_testdata *td)
1868 {
1869 int j;
1870 for (j = 0; j < td->nfiles; j++) remove(td->sqfile[j]);
1871 esl_arr2_Destroy((void **) td->sqfile, td->nfiles);
1872 esl_arr2_Destroy((void **) td->seqname, td->nseq*td->nfiles);
1873 esl_arr2_Destroy((void **) td->seqdesc, td->nseq*td->nfiles);
1874 esl_arr2_Destroy((void **) td->seq, td->nseq*td->nfiles);
1875 free(td->seqlen);
1876 free(td);
1877 }
1878
1879 static void
utest_enchilada(ESL_GETOPTS * go,ESL_RANDOMNESS * rng,int do_external,int do_dupkeys)1880 utest_enchilada(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, int do_external, int do_dupkeys)
1881 {
1882 char msg[] = "esl_ssi whole enchilada test failed";
1883 struct ssi_testdata *td = NULL;
1884 int nq = 10; // number of SSI-based retrievals to test
1885 char *ssifile = NULL; // Index creation: ssifile to create.
1886 ESL_NEWSSI *ns = NULL; // new SSI index data
1887 ESL_SQFILE *sqfp = NULL; // open FASTA file that we're indexing
1888 ESL_SQ *sq = NULL; // a sequence from <sqfp>
1889 uint16_t fh; // handle on an indexed fasta file, from _AddFile, for _AddKey
1890 ESL_SSI *ssi = NULL; // Retrieval testing: open SSI index to use
1891 char query[32]; // name of sequence to retrieve
1892 char *qfile; // retrieved name of file it's in
1893 int qfmt; // retrieved format of that file (fasta)
1894 off_t roff; // retrieved record offset of it
1895 int i,j;
1896 int status;
1897
1898 td = ssi_testdata_create(rng,
1899 esl_opt_GetInteger(go, "-F"), // max # of seq files
1900 esl_opt_GetInteger(go, "-N"), // max # of seqs per file
1901 esl_opt_GetInteger(go, "-L"), // max seq length
1902 do_dupkeys); // if you poison w/ dup keys, _Write should fail.
1903
1904 /* Create an ssi index of all the FASTA files. */
1905 if (esl_strdup(td->sqfile[0], -1, &ssifile) != eslOK) esl_fatal(msg);
1906 if (esl_strcat(&ssifile, -1, ".ssi", 4) != eslOK) esl_fatal(msg);
1907 if (esl_newssi_Open(ssifile, TRUE, &ns) != eslOK) esl_fatal(msg);
1908 if ((sq = esl_sq_Create()) == NULL) esl_fatal(msg);
1909 if (do_external)
1910 if (activate_external_sort(ns) != eslOK) esl_fatal(msg);
1911 for (j = 0; j < td->nfiles; j++)
1912 {
1913 if (esl_sqfile_Open(td->sqfile[j], eslSQFILE_UNKNOWN, NULL, &sqfp) != eslOK) esl_fatal(msg);
1914 if (esl_newssi_AddFile(ns, td->sqfile[j], sqfp->format, &fh) != eslOK) esl_fatal(msg);
1915 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
1916 {
1917 if (esl_newssi_AddKey (ns, sq->name, fh, sq->roff, sq->doff, sq->L) != eslOK) esl_fatal(msg);
1918 if (esl_newssi_AddAlias(ns, sq->desc, sq->name) != eslOK) esl_fatal(msg);
1919 esl_sq_Reuse(sq);
1920 }
1921 if (status != eslEOF) esl_fatal(msg);
1922 esl_sqfile_Close(sqfp);
1923 }
1924 esl_sq_Destroy(sq);
1925
1926 /* Save the SSI index to a file. */
1927 status = esl_newssi_Write(ns);
1928 if ( do_dupkeys && status != eslEDUP) esl_fatal(msg);
1929 if (! do_dupkeys && status != eslOK) esl_fatal(msg);
1930 esl_newssi_Close(ns);
1931
1932 /* Open the SSI index - now we'll use it to retrieve <nq> random sequences. */
1933 if (! do_dupkeys)
1934 {
1935 if (esl_ssi_Open(ssifile, &ssi) != eslOK) esl_fatal(msg);
1936 sq = esl_sq_Create();
1937 while (nq--)
1938 {
1939 /* Choose a seq and file */
1940 i = esl_rnd_Roll(rng, td->nseq*td->nfiles);
1941 j = i/td->nseq;
1942 if (esl_rnd_Roll(rng, 2) == 0) sprintf(query, "seq%d-file%d", i, j); // by primary key
1943 else sprintf(query, "desc%d-file%d", i, j); // by secondary key
1944
1945 /* Retrieve it */
1946 if ( esl_ssi_FindName(ssi, query, &fh, &roff, NULL, NULL) != eslOK) esl_fatal(msg);
1947 if ( esl_ssi_FileInfo(ssi, fh, &qfile, &qfmt) != eslOK) esl_fatal(msg);
1948 if ( esl_sqfile_Open(qfile, qfmt, NULL, &sqfp) != eslOK) esl_fatal(msg);
1949 if ( esl_sqfile_Position(sqfp, roff) != eslOK) esl_fatal(msg);
1950 if ( esl_sqio_Read(sqfp, sq) != eslOK) esl_fatal(msg);
1951
1952 /* Check that it's the right one */
1953 if (strcmp(sq->name, query) != 0 && strcmp(sq->desc, query) != 0) esl_fatal(msg);
1954 if (sq->n != td->seqlen[i]) esl_fatal(msg);
1955 if (strcmp(sq->seq, td->seq[i]) != 0) esl_fatal(msg);
1956 if (strcmp(qfile, td->sqfile[j]) != 0) esl_fatal(msg);
1957
1958 esl_sq_Reuse(sq);
1959 esl_sqfile_Close(sqfp);
1960 }
1961 remove(ssifile); // in the dup keys test, ssifile is removed by _Write().
1962 esl_sq_Destroy(sq);
1963 esl_ssi_Close(ssi);
1964 }
1965
1966 ssi_testdata_destroy(td);
1967 free(ssifile);
1968 }
1969 #endif /*eslSSI_TESTDRIVE*/
1970
1971
1972 /*****************************************************************
1973 * 5. Test driver
1974 *****************************************************************/
1975
1976 #ifdef eslSSI_TESTDRIVE
1977 #include "esl_config.h"
1978
1979 #include <stdlib.h>
1980 #include <stdio.h>
1981 #include <string.h>
1982
1983 #include "easel.h"
1984 #include "esl_getopts.h"
1985 #include "esl_sq.h"
1986 #include "esl_sqio.h"
1987 #include "esl_ssi.h"
1988 #include "esl_random.h"
1989 #include "esl_randomseq.h"
1990
1991 static ESL_OPTIONS options[] = {
1992 /* name type default env range toggles reqs incomp help docgroup*/
1993 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1994 { "-F", eslARG_INT, "3", NULL, NULL, NULL, NULL, NULL, "max number of test files", 0 },
1995 { "-L", eslARG_INT, "1000", NULL, NULL, NULL, NULL, NULL, "max length of test sequences", 0 },
1996 { "-N", eslARG_INT, "10", NULL, NULL, NULL, NULL, NULL, "max number of test sequences per file", 0 },
1997 { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
1998 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1999 };
2000 static char usage[] = "[-options]";
2001 static char banner[] = "test driver for ssi module";
2002
2003 int
main(int argc,char ** argv)2004 main(int argc, char **argv)
2005 {
2006 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
2007 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
2008
2009 fprintf(stderr, "## %s\n", argv[0]);
2010 fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
2011
2012 /* do_external do_dupkeys */
2013 utest_enchilada(go, rng, FALSE, FALSE);
2014 utest_enchilada(go, rng, TRUE, FALSE);
2015 utest_enchilada(go, rng, FALSE, TRUE);
2016 utest_enchilada(go, rng, TRUE, TRUE);
2017
2018 esl_randomness_Destroy(rng);
2019 esl_getopts_Destroy(go);
2020
2021 fprintf(stderr, "# status = ok\n");
2022 return eslOK;
2023 }
2024 #endif /*eslSSI_TESTDRIVE*/
2025
2026
2027
2028 /*****************************************************************
2029 * 5. Example code.
2030 ****************************************************************/
2031 #ifdef eslSSI_EXAMPLE
2032 /* gcc -o example -g -Wall -DeslSSI_EXAMPLE esl_ssi.c easel.c
2033 * esl-shuffle -o foo.fa -N 1000 -G --amino -L 400
2034 * ./example foo.fa
2035 */
2036 /*::cexcerpt::ssi_example::begin::*/
2037 #include <stdio.h>
2038 #include "easel.h"
2039 #include "esl_ssi.h"
2040
2041 int
main(int argc,char ** argv)2042 main(int argc, char **argv)
2043 {
2044 ESL_NEWSSI *ns;
2045 char *fafile; /* name of FASTA file */
2046 FILE *fp; /* opened FASTA file for reading */
2047 char *ssifile; /* name of SSI file */
2048 uint16_t fh; /* file handle SSI associates w/ fafile */
2049 char *buf = NULL; /* growable buffer for esl_fgets() */
2050 int n = 0; /* length of buf */
2051 char *s, *seqname;
2052 off_t seq_offset;
2053 int status;
2054
2055 /* Open a new SSI index named <fafile>.ssi */
2056 fafile = argv[1];
2057 esl_strdup(fafile, -1, &ssifile);
2058 esl_strcat(&ssifile, -1, ".ssi", 4);
2059 status = esl_newssi_Open(ssifile, FALSE, &ns);
2060 if (status == eslENOTFOUND) esl_fatal("failed to open SSI index %s", ssifile);
2061 else if (status == eslEOVERWRITE) esl_fatal("SSI index %s already exists; delete or rename it", ssifile);
2062 else if (status != eslOK) esl_fatal("failed to create a new SSI index");
2063
2064 /* Collect the sequence names from a FASTA file into an index */
2065 if ((fp = fopen(fafile, "r")) == NULL) esl_fatal("failed to open %s", fafile);
2066 if (esl_newssi_AddFile(ns, fafile, 1, &fh) != eslOK) esl_fatal("failed to add %s to index: %s", fafile, ns->errbuf);
2067 seq_offset = ftello(fp);
2068 while (esl_fgets(&buf, &n, fp) == eslOK)
2069 {
2070 if (*buf == '>') {
2071 s = buf+1; /* skip past > */
2072 esl_strtok(&s, " \t\n", &seqname); /* name = 1st token on > line */
2073 if (esl_newssi_AddKey(ns, seqname, fh, seq_offset, 0, 0) != eslOK)
2074 esl_fatal("failed to add key %s to index: %s", seqname, ns->errbuf);
2075 }
2076 seq_offset = ftello(fp);
2077 }
2078 free(buf);
2079 fclose(fp);
2080
2081 /* Save the index to disk */
2082 status = esl_newssi_Write(ns);
2083 if (status == eslEDUP) esl_fatal("SSI index construction failed:\n %s", ns->errbuf);
2084 else if (status == eslERANGE) esl_fatal("SSI index file size exceeds maximum allowed by your filesystem");
2085 else if (status == eslESYS) esl_fatal("SSI index sort failed:\n %s", ns->errbuf);
2086 else if (status != eslOK) esl_fatal("SSI indexing failed:\n %s", ns->errbuf);
2087 esl_newssi_Close(ns);
2088 free(ssifile);
2089 return 0;
2090 }
2091 /*::cexcerpt::ssi_example::end::*/
2092 #endif /*eslSSI_EXAMPLE*/
2093
2094
2095 #ifdef eslSSI_EXAMPLE2
2096 /* gcc -o example2 -g -Wall -DeslSSI_EXAMPLE2 esl_ssi.c easel.c
2097 * ./example2 random77 foo.fa.ssi
2098 */
2099 /*::cexcerpt::ssi_example2::begin::*/
2100 #include <stdio.h>
2101 #include "easel.h"
2102 #include "esl_ssi.h"
2103
main(int argc,char ** argv)2104 int main(int argc, char **argv)
2105 {
2106 ESL_SSI *ssi;
2107 char *seqname; /* name of sequence to retrieve */
2108 char *ssifile; /* name of SSI file */
2109 uint16_t fh; /* file handle SSI associates w/ fafile */
2110 char *fafile; /* name of FASTA file */
2111 int fmt; /* format code (1, in this example) */
2112 off_t offset; /* disk offset of seqname in fafile */
2113 FILE *fp; /* opened FASTA file for reading */
2114 char *buf = NULL; /* growable buffer for esl_fgets() */
2115 int n = 0; /* size of buffer */
2116
2117 seqname = argv[1];
2118 ssifile = argv[2];
2119
2120 if (esl_ssi_Open(ssifile, &ssi) != eslOK) esl_fatal("open failed");
2121 if (esl_ssi_FindName(ssi, seqname, &fh, &offset, NULL, NULL) != eslOK) esl_fatal("find failed");
2122 if (esl_ssi_FileInfo(ssi, fh, &fafile, &fmt) != eslOK) esl_fatal("info failed");
2123 /* you can't close the ssi file yet - fafile is pointing into it! */
2124
2125 if ((fp = fopen(fafile, "r")) == NULL) esl_fatal("failed to open %s", fafile);
2126 if (fseeko(fp, offset, SEEK_SET) != 0) esl_fatal("failed to position %s", fafile);
2127 if (esl_fgets(&buf, &n, fp) != eslOK) esl_fatal("failed to get name/desc line");
2128 do {
2129 printf("%s", buf);
2130 } while (esl_fgets(&buf, &n, fp) == eslOK && *buf != '>');
2131
2132 esl_ssi_Close(ssi);
2133 fclose(fp);
2134 free(buf);
2135 return 0;
2136 }
2137 /*::cexcerpt::ssi_example2::end::*/
2138 #endif /*eslSSI_EXAMPLE2*/
2139
2140