1 /* esl_dsqdata : faster sequence input
2 *
3 * Implements a predigitized binary file format for biological
4 * sequences. Sequence data are packed bitwise into 32-bit packets,
5 * where each packet contains either six 5-bit residues or fifteen
6 * 2-bit residues, plus two control bits. Input is asynchronous,
7 * using POSIX threads, with a "loader" thread doing disk reads and
8 * "unpacker" thread(s) preparing chunks of sequences for
9 * analysis. Sequence data and metadata are stored in separate files,
10 * which may allow further input acceleration by deferring
11 * metadata accesses until they're actually needed.
12 *
13 * All thread synchronization is handled internally. A caller does not
14 * need to worry about the internal parallelism; it just calls
15 * <esl_dsqdata_Read()>. Caller can create multiple threads, each
16 * calling <esl_dsqdata_Read()>.
17 *
18 * A DSQDATA database <basename> is stored in four files:
19 * - basename : a human-readable stub
20 * - basename.dsqi : index file, enabling random access & parallel chunking
21 * - basename.dsqm : metadata including names, accessions, descs, taxonomy
22 * - basename.dsqs : sequences, in a packed binary format
23 *
24 * Contents:
25 * 1. ESL_DSQDATA: reading dsqdata format
26 * 2. Creating dsqdata format from a sequence file
27 * 3. ESL_DSQDATA_CHUNK, a chunk of input sequence data
28 * 4. Loader and unpacker, the input threads
29 * 5. Packing sequences and unpacking chunks
30 * 6. Notes and references
31 * 7. Unit tests
32 * 8. Test driver
33 * 9. Examples
34 */
35 #include "esl_config.h"
36
37 #include <stdio.h>
38 #include <string.h>
39 #include <stdint.h>
40 #include <pthread.h>
41
42 #include "easel.h"
43 #include "esl_alphabet.h"
44 #include "esl_random.h"
45 #include "esl_sq.h"
46 #include "esl_sqio.h"
47
48 #include "esl_dsqdata.h"
49
50 static ESL_DSQDATA_CHUNK *dsqdata_chunk_Create (ESL_DSQDATA *dd);
51 static void dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK *chu);
52
53 static void *dsqdata_loader_thread (void *p);
54 static void *dsqdata_unpacker_thread(void *p);
55
56 static int dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK *chu, int do_pack5);
57 static int dsqdata_unpack5(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P);
58 static int dsqdata_unpack2(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P);
59 static int dsqdata_pack5 (ESL_DSQ *dsq, int L, uint32_t *psq, int *ret_P);
60 static int dsqdata_pack2 (ESL_DSQ *dsq, int L, uint32_t *psq, int *ret_P);
61
62
63 /* Embedded magic numbers allow us to validate the correct binary
64 * format, with version (if needed in the future), and to detect
65 * byteswapping.
66 */
67 static uint32_t eslDSQDATA_MAGIC_V1 = 0xc4d3d1b1; // "dsq1" + 0x80808080
68 static uint32_t eslDSQDATA_MAGIC_V1SWAP = 0xb1d1d3c4; // ... as above, but byteswapped.
69
70 /*****************************************************************
71 *# 1. <ESL_DSQDATA>: reading dsqdata format
72 *****************************************************************/
73
74 /* Function: esl_dsqdata_Open()
75 * Synopsis: Open a digital sequence database for reading
76 * Incept: SRE, Wed Jan 20 09:50:00 2016 [Amtrak 2150, NYP-BOS]
77 *
78 * Purpose: Open dsqdata database <basename> for reading. The file
79 * <basename> is a stub describing the database. The bulk
80 * of the data are in three accompanying binary files: the
81 * index file <basename>.dsqi, the metadata file
82 * <basename>.dsqm, and the sequence file <basename>.dsqs.
83 *
84 * <nconsumers> is an upper bound on the number of threads
85 * in which the caller plans to be calling
86 * <esl_dsqdata_Read()> -- or, more precisely, the maximum
87 * number of data chunks that the caller could be working
88 * on at any given instant. This is a hint, not a
89 * commitment. The dsqdata loader uses it to determine the
90 * maximum number of data chunks that can be in play at
91 * once (including chunks it is juggling internally, plus
92 * if all the caller's reader threads are busy on
93 * theirs). If <nconsumers> is set too small, the loader
94 * may stall waiting for chunks to come back for recycling.
95 *
96 * Reading digital sequence data requires a digital
97 * alphabet. You can either provide one (in which case we
98 * validate that it matches the alphabet used by the
99 * dsqdata) or, as a convenience, <esl_dsqdata_Open()> can
100 * create one for you. Either way, you pass a pointer to an
101 * <ESL_ALPHABET> structure <abc>, in <byp_abc>. <byp_abc>
102 * uses a partial Easel "bypass" idiom: if <*byp_abc> is
103 * NULL, we allocate and return a new alphabet; if
104 * <*byp_abc> is a ptr to an existing alphabet, we use it
105 * for validation. That is, you have two choices:
106 *
107 * ```
108 * ESL_ALPHABET *abc = NULL;
109 * esl_dsqdata_Open(&abc, basename...)
110 * // <abc> is now the alphabet of <basename>;
111 * // now you're responsible for Destroy'ing it
112 * ```
113 *
114 * or:
115 *
116 * ```
117 * ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
118 * status = esl_dsqdata_Open(&abc, basename);
119 * // if status == eslEINCOMPAT, alphabet in basename
120 * // doesn't match caller's expectation
121 * ```
122 *
123 * Args: byp_abc : expected or created alphabet; pass &abc, abc=NULL or abc=expected alphabet
124 * basename : data are in files <basename> and <basename.dsq[ism]>
125 * nconsumers : upper bound on number of consumer threads caller is going to Read() with
126 * ret_dd : RETURN : the new ESL_DSQDATA object.
127 *
128 * Returns: <eslOK> on success.
129 *
130 * <eslENOTFOUND> if one or more of the expected datafiles
131 * aren't there or can't be opened.
132 *
133 * <eslEFORMAT> if something looks wrong in parsing file
134 * formats. Includes problems in headers, and also the
135 * case where caller provides a digital alphabet in
136 * <*byp_abc> and it doesn't match the database's alphabet.
137 *
138 * On any normal error, <*ret_dd> is still returned, but in
139 * an error state, and <dd->errbuf> is a user-directed
140 * error message that the caller can relay to the user. Other
141 * than the <errbuf>, the rest of the contents are undefined.
142 *
143 * Caller is responsible for destroying <*byp_abc>.
144 *
145 * Throws: <eslEMEM> on allocation error.
146 * <eslESYS> on system call failure.
147 * <eslEUNIMPLEMENTED> if data are byteswapped
148 * TODO: handle byteswapping
149 *
150 * On any thrown exception, <*ret_dd> is returned NULL.
151 *
152 * On <eslESYS> exceptions, some thread resources may
153 * not be fully freed, leading to some memory leakage.
154 */
155 int
esl_dsqdata_Open(ESL_ALPHABET ** byp_abc,char * basename,int nconsumers,ESL_DSQDATA ** ret_dd)156 esl_dsqdata_Open(ESL_ALPHABET **byp_abc, char *basename, int nconsumers, ESL_DSQDATA **ret_dd)
157 {
158 ESL_DSQDATA *dd = NULL;
159 int bufsize = 4096;
160 uint32_t magic = 0;
161 uint32_t tag = 0;
162 uint32_t alphatype = eslUNKNOWN;
163 char *p; // used for strtok() parsing of fields on a line
164 char buf[4096];
165 int u;
166 int status;
167
168 ESL_DASSERT1(( nconsumers > 0 ));
169 ESL_DASSERT1(( byp_abc != NULL )); // either *byp_abc == NULL or *byp_abc = the caller's expected alphabet.
170
171 ESL_ALLOC(dd, sizeof(ESL_DSQDATA));
172 dd->stubfp = NULL;
173 dd->ifp = NULL;
174 dd->sfp = NULL;
175 dd->mfp = NULL;
176 dd->abc_r = *byp_abc; // This may be NULL; if so, we create it later.
177
178 dd->magic = 0;
179 dd->uniquetag = 0;
180 dd->flags = 0;
181 dd->max_namelen = 0;
182 dd->max_acclen = 0;
183 dd->max_desclen = 0;
184 dd->max_seqlen = 0;
185 dd->nseq = 0;
186 dd->nres = 0;
187
188 dd->chunk_maxseq = eslDSQDATA_CHUNK_MAXSEQ; // someday we may want to allow tuning these
189 dd->chunk_maxpacket = eslDSQDATA_CHUNK_MAXPACKET;
190 dd->do_byteswap = FALSE;
191 dd->pack5 = FALSE;
192
193 dd->nconsumers = nconsumers;
194 dd->n_unpackers = eslDSQDATA_UNPACKERS; // we'll want to allow tuning this too
195 dd->errbuf[0] = '\0';
196
197 /* Open the four files.
198 */
199 ESL_ALLOC( dd->basename, sizeof(char) * (strlen(basename) + 6)); // +5 for .dsqx; +1 for \0
200 if ( sprintf(dd->basename, "%s.dsqi", basename) <= 0) ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
201 if (( dd->ifp = fopen(dd->basename, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open index file %s\n", dd->basename);
202
203 if ( sprintf(dd->basename, "%s.dsqm", basename) <= 0) ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
204 if (( dd->mfp = fopen(dd->basename, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open metadata file %s\n", dd->basename);
205
206 if ( sprintf(dd->basename, "%s.dsqs", basename) <= 0) ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
207 if (( dd->sfp = fopen(dd->basename, "rb")) == NULL) ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open sequence file %s\n", dd->basename);
208
209 strcpy(dd->basename, basename);
210 if (( dd->stubfp = fopen(dd->basename, "r")) == NULL) ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open stub file %s\n", dd->basename);
211
212 /* The stub file is unparsed, intended to be human readable, with one exception:
213 * The first line contains the unique tag that we use to validate linkage of the 4 files.
214 * The format of that first line is:
215 * Easel dsqdata v123 x0000000000
216 */
217 if ( fgets(buf, bufsize, dd->stubfp) == NULL) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file is empty - no tag line found");
218 if (( p = strtok(buf, " \t\n\r")) == NULL) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: tag line has no data");
219 if ( strcmp(p, "Easel") != 0) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
220 if (( p = strtok(NULL, " \t\n\r")) == NULL) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
221 if ( strcmp(p, "dsqdata") != 0) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
222 if (( p = strtok(NULL, " \t\n\r")) == NULL) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
223 if ( *p != 'v') ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: no v on version");
224 if ( ! esl_str_IsInteger(p+1)) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file had bad format: no version number");
225 // version number is currently unused: there's only 1
226 if (( p = strtok(NULL, " \t\n\r")) == NULL) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
227 if ( *p != 'x') ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: no x on tag");
228 if ( ! esl_str_IsInteger(p+1)) ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file had bad format: no integer tag");
229 dd->uniquetag = strtoul(p+1, NULL, 10);
230
231 /* Index file has a header of 7 uint32's, 3 uint64's */
232 if ( fread(&(dd->magic), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has no header - is empty?");
233 if ( fread(&tag, sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no tag");
234 if ( fread(&alphatype, sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no alphatype");
235 if ( fread(&(dd->flags), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no flags");
236 if ( fread(&(dd->max_namelen), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max name len");
237 if ( fread(&(dd->max_acclen), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max accession len");
238 if ( fread(&(dd->max_desclen), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max description len");
239
240 if ( fread(&(dd->max_seqlen), sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max seq len");
241 if ( fread(&(dd->nseq), sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no nseq");
242 if ( fread(&(dd->nres), sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no nres");
243
244 /* Check the magic and the tag */
245 if (tag != dd->uniquetag) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has bad tag, doesn't go with stub file");
246 // Eventually we would set dd->do_byteswap = TRUE; below.
247 if (dd->magic == eslDSQDATA_MAGIC_V1SWAP) ESL_XEXCEPTION(eslEUNIMPLEMENTED, "dsqdata cannot yet read data in different byte orders");
248 else if (dd->magic != eslDSQDATA_MAGIC_V1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has bad magic");
249
250 /* Either validate, or create the alphabet */
251 if (dd->abc_r)
252 {
253 if (alphatype != dd->abc_r->type)
254 ESL_XFAIL(eslEFORMAT, dd->errbuf, "data files use %s alphabet; expected %s alphabet",
255 esl_abc_DecodeType(alphatype),
256 esl_abc_DecodeType(dd->abc_r->type));
257 }
258 else
259 {
260 if ( esl_abc_ValidateType(alphatype) != eslOK) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has invalid alphabet type %d", alphatype);
261 if (( dd->abc_r = esl_alphabet_Create(alphatype)) == NULL) ESL_XEXCEPTION(eslEMEM, "alphabet creation failed");
262 }
263
264 /* If it's protein, flip the switch to expect all 5-bit packing */
265 if (dd->abc_r->type == eslAMINO) dd->pack5 = TRUE;
266
267 /* Metadata file has a header of 2 uint32's, magic and uniquetag */
268 if (( fread(&magic, sizeof(uint32_t), 1, dd->mfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has no header - is empty?");
269 if (( fread(&tag, sizeof(uint32_t), 1, dd->mfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file header truncated - no tag?");
270 if ( magic != dd->magic) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has bad magic");
271 if ( tag != dd->uniquetag) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has bad tag, doesn't match stub");
272
273 /* Sequence file also has a header of 2 uint32's, magic and uniquetag */
274 if (( fread(&magic, sizeof(uint32_t), 1, dd->sfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has no header - is empty?");
275 if (( fread(&tag, sizeof(uint32_t), 1, dd->sfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file header truncated - no tag?");
276 if ( magic != dd->magic) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has bad magic");
277 if ( tag != dd->uniquetag) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has bad tag, doesn't match stub");
278
279 /* unpacker inboxes and outboxes */
280 for (u = 0; u < dd->n_unpackers; u++)
281 {
282 dd->inbox[u] = NULL;
283 if ( pthread_mutex_init(&(dd->inbox_mutex[u]), NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
284 if ( pthread_cond_init (&(dd->inbox_cv[u]), NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
285 dd->inbox_eod[u] = FALSE;
286
287 dd->outbox[u] = NULL;
288 if ( pthread_mutex_init(&(dd->outbox_mutex[u]), NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
289 if ( pthread_cond_init (&(dd->outbox_cv[u]), NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
290 dd->outbox_eod[u] = FALSE;
291 }
292
293 /* consumers share access to <nchunk> counter */
294 dd->nchunk = 0;
295 if ( pthread_mutex_init(&dd->nchunk_mutex, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
296
297 /* chunk recycling stack */
298 dd->recycling = NULL;
299 if ( pthread_mutex_init(&dd->recycling_mutex, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
300 if ( pthread_cond_init(&dd->recycling_cv, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
301
302 /* Create the "initialization is complete" signaling mechanism
303 * before creating any threads, and lock the initialization mutex
304 * while we're creating them. The issue here is that unpackers
305 * identify their [u] index by comparing their self thread id to the
306 * master list of thread ids, so make sure that master list actually
307 * exists.
308 */
309 dd->go = FALSE;
310 if ( pthread_mutex_init(&dd->go_mutex, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed on go_mutex");
311 if ( pthread_cond_init( &dd->go_cv, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed on go_cv");
312 if ( pthread_mutex_lock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock() failed on go_mutex");
313
314 /* Create the loader and the unpackers.
315 * They will wait to start until we signal through go->cv.
316 */
317 if ( pthread_create(&dd->loader_t, NULL, dsqdata_loader_thread, dd) != 0) ESL_XEXCEPTION(eslESYS, "pthread_create() failed");
318 for (u = 0; u < dd->n_unpackers; u++)
319 if ( pthread_create(&(dd->unpacker_t[u]), NULL, dsqdata_unpacker_thread, dd) != 0) ESL_XEXCEPTION(eslESYS, "pthread_create() failed");
320
321 /* All threads started, initialization complete - broadcast "go" signal to all threads */
322 dd->go = TRUE;
323 if ( pthread_mutex_unlock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_unlock() failed on go_mutex");
324 if ( pthread_cond_broadcast(&dd->go_cv) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_broadcast() failed on go_cv");
325
326 *ret_dd = dd;
327 *byp_abc = dd->abc_r; // If caller provided <*byp_abc> this is a no-op, because we set abc_r = *byp_abc.
328 return eslOK; // .. otherwise we're passing the created <abc> back to caller, caller's
329 // responsibility, we just keep the reference to it.
330 ERROR:
331 if (status == eslENOTFOUND || status == eslEFORMAT || status == eslEINCOMPAT)
332 { /* on normal errors, we return <dd> with its <errbuf>, don't change *byp_abc */
333 *ret_dd = dd;
334 if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
335 return status;
336 }
337 else if (status != eslESYS)
338 { /* on most exceptions, we free <dd>, return it NULL, don't change *byp_abc */
339 esl_dsqdata_Close(dd);
340 *ret_dd = NULL;
341 if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
342 return status;
343 }
344 else
345 { /* on eslESYS exceptions - pthread initializations failing - we can't assume we can _Close() correctly. */
346 *ret_dd = NULL;
347 if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
348 return status;
349 }
350 }
351
352
353
354 /* Function: esl_dsqdata_Read()
355 * Synopsis: Read next chunk of sequence data.
356 * Incept: SRE, Thu Jan 21 11:21:38 2016 [Harvard]
357 *
358 * Purpose: Read the next chunk from <dd>, return a pointer to it in
359 * <*ret_chu>, and return <eslOK>. When data are exhausted,
360 * return <eslEOF>, and <*ret_chu> is <NULL>.
361 *
362 * Threadsafe. All thread operations in the dsqdata reader
363 * are handled internally. Caller does not have to worry
364 * about wrapping this in a mutex. Multiple caller threads
365 * can call <esl_dsqdata_Read()>.
366 *
367 * All chunk allocation and deallocation is handled
368 * internally. After using a chunk, caller gives it back to
369 * the reader using <esl_dsqdata_Recycle()>.
370 *
371 * Args: dd : open dsqdata object to read from
372 * ret_chu : RETURN : next chunk of seq data
373 *
374 * Returns: <eslOK> on success. <*ret_chu> is a chunk of seq data.
375 * Caller must call <esl_dsqdata_Recycle()> on each chunk
376 * that it Read()'s.
377 *
378 * <eslEOF> if we've reached the end of the input file;
379 * <*ret_chu> is NULL.
380 *
381 * Throws: <eslESYS> if a pthread call fails.
382 * Caller should treat this as disastrous. Without correctly
383 * working pthread calls, we cannot read, and we may not be able
384 * to correctly clean up and close the reader. Caller should
385 * treat <dd> as toxic, clean up whatever else it may need to,
386 * and exit.
387 */
388 int
esl_dsqdata_Read(ESL_DSQDATA * dd,ESL_DSQDATA_CHUNK ** ret_chu)389 esl_dsqdata_Read(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK **ret_chu)
390 {
391 ESL_DSQDATA_CHUNK *chu = NULL;
392 int u;
393
394 /* First, determine which slot the next chunk is in, using the consumer-shared <nchunk> counter */
395 if ( pthread_mutex_lock(&dd->nchunk_mutex) != 0) ESL_EXCEPTION(eslESYS, "failed to lock reader mutex");
396 u = (int) (dd->nchunk % dd->n_unpackers);
397
398 /* Acquire a lock that outbox, wait for it to be in full or EOD state */
399 if ( pthread_mutex_lock(&(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to lock outbox[u] mutex");
400 while (! dd->outbox_eod[u] && dd->outbox[u] == NULL) {
401 if ( pthread_cond_wait(&(dd->outbox_cv[u]), &(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to wait on outbox[u] signal");
402 }
403
404 /* Get the chunk from outbox. */
405 chu = dd->outbox[u];
406 dd->outbox[u] = NULL;
407 if (chu) dd->nchunk++; // chu==NULL if we're EOD. (and eod flag is up too, but we already know that by getting here)
408
409 /* Release the outbox lock and signal back to unpacker (skip signalling if we're EOD; unpacker[u] is done) */
410 if ( pthread_mutex_unlock(&(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to unlock outbox[u] mutex");
411 if ( chu && pthread_cond_signal (&(dd->outbox_cv[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to signal outbox[u] is empty");
412
413 /* Release the reader lock that protects dd->nchunk counter */
414 if ( pthread_mutex_unlock(&dd->nchunk_mutex) != 0) ESL_EXCEPTION(eslESYS, "failed to unlock reader mutex");
415
416 *ret_chu = chu;
417 return (chu ? eslOK : eslEOF);
418 }
419
420
421 /* Function: esl_dsqdata_Recycle()
422 * Synopsis: Give a chunk back to the reader.
423 * Incept: SRE, Thu Feb 11 19:24:33 2016
424 *
425 * Purpose: Recycle chunk <chu> back to the reader <dd>. The reader
426 * is responsible for all allocation and deallocation of
427 * chunks. The reader will either reuse the chunk's memory
428 * if more chunks remain to be read, or it will free it.
429 *
430 * Args: dd : dsqdata reader
431 * chu : chunk to recycle
432 *
433 * Returns: <eslOK> on success.
434 *
435 * Throws: <eslESYS> on a pthread call failure. Caller should regard
436 * such an error as disastrous; if pthread calls are
437 * failing, you cannot depend on the reader to be working
438 * at all, and you should treat <dd> as toxic. Do whatever
439 * desperate things you need to do and exit.
440 */
441 int
esl_dsqdata_Recycle(ESL_DSQDATA * dd,ESL_DSQDATA_CHUNK * chu)442 esl_dsqdata_Recycle(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK *chu)
443 {
444 if (chu)
445 {
446 if ( pthread_mutex_lock(&dd->recycling_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex lock failed");
447 chu->nxt = dd->recycling; // Push chunk onto head of recycling stack
448 dd->recycling = chu;
449 if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex unlock failed");
450 if ( pthread_cond_signal(&dd->recycling_cv) != 0) ESL_EXCEPTION(eslESYS, "pthread cond signal failed"); // loader may wait for this signal
451 }
452 return eslOK;
453 }
454
455
456
457 /* Function: esl_dsqdata_Close()
458 * Synopsis: Close a dsqdata reader.
459 * Incept: SRE, Thu Feb 11 19:32:54 2016
460 *
461 * Purpose: Close a dsqdata reader.
462 *
463 * Returns: <eslOK> on success.
464
465 * Throws: <eslESYS> on a system call failure, including pthread
466 * calls and fclose(). Caller should regard such a failure
467 * as disastrous: treat <dd> as toxic and exit as soon as
468 * possible without making any other system calls, if possible.
469 *
470 * Note: Normally Easel would call _Close() not only on a correctly
471 * opened object (after the Open returns) but also from
472 * within the Open call to clean up on failure. This relies
473 * on being able to tell which parts of an incomplete
474 * object have been initialized; this is why Easel sets
475 * ptrs to NULL before allocating them, for example. POSIX
476 * threads doesn't seem to give us a way to check whether a
477 * mutex, condition variable, or thread has been
478 * initialized; and destroying an uninitialized mutex or CV
479 * results in undefined behavior. We could include a
480 * boolean flag for each pthreads component -- or one for
481 * all of them -- but instead, we simply say that a failure
482 * of pthreads initialization in the Open function is so
483 * catastrophic and unexpected that we exit that function
484 * without cleaning up the incomplete ESL_DSQDATA
485 * structure. We expect that the caller is simply going to
486 * have to quit anyway, so we don't need to worry about
487 * memory leakage.
488 */
489 int
esl_dsqdata_Close(ESL_DSQDATA * dd)490 esl_dsqdata_Close(ESL_DSQDATA *dd)
491 {
492 int u;
493
494 if (dd)
495 {
496 /* out of abundance of caution - wait for threads to join before breaking down <dd> */
497 if ( pthread_join(dd->loader_t, NULL) != 0) ESL_EXCEPTION(eslESYS, "pthread join failed");
498 for (u = 0; u < dd->n_unpackers; u++)
499 if ( pthread_join(dd->unpacker_t[u], NULL) != 0) ESL_EXCEPTION(eslESYS, "pthread join failed");
500
501 if (dd->basename) free(dd->basename);
502 if (dd->stubfp) { if ( fclose(dd->stubfp) != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
503 if (dd->ifp) { if ( fclose(dd->ifp) != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
504 if (dd->sfp) { if ( fclose(dd->sfp) != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
505 if (dd->mfp) { if ( fclose(dd->mfp) != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
506
507 for (u = 0; u < dd->n_unpackers; u++)
508 {
509 if ( pthread_mutex_destroy(&(dd->inbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
510 if ( pthread_cond_destroy(&(dd->inbox_cv[u])) != 0) ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
511 if ( pthread_mutex_destroy(&(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
512 if ( pthread_cond_destroy(&(dd->outbox_cv[u])) != 0) ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
513 }
514 if ( pthread_mutex_destroy(&dd->nchunk_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
515 if ( pthread_mutex_destroy(&dd->recycling_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
516 if ( pthread_cond_destroy(&dd->recycling_cv) != 0) ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
517 if ( pthread_mutex_destroy(&dd->go_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
518 if ( pthread_cond_destroy(&dd->go_cv) != 0) ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
519
520 /* Loader thread is responsible for freeing all chunks it created, even on error. */
521 #if (eslDEBUGLEVEL >= 1)
522 for (u = 0; u < dd->n_unpackers; u++) {
523 assert( dd->inbox[u] == NULL );
524 assert( dd->outbox[u] == NULL );
525 }
526 assert(dd->recycling == NULL );
527 #endif
528 free(dd);
529 }
530 return eslOK;
531 }
532
533
534 /*****************************************************************
535 *# 2. Creating dsqdata format from a sequence file
536 *****************************************************************/
537
538 /* Function: esl_dsqdata_Write()
539 * Synopsis: Create a dsqdata database
540 * Incept: SRE, Sat Feb 13 07:33:30 2016 [AGBT 2016, Orlando]
541 *
542 * Purpose: Caller has just opened <sqfp>, in digital mode.
543 * Create a dsqdata database <basename> from the sequence
544 * data in <sqfp>.
545 *
546 * <sqfp> must be protein, DNA, or RNA sequence data. It
547 * must be rewindable (i.e. a file), because we have to
548 * read it twice. It must be newly opened (i.e. positioned
549 * at the start).
550 *
551 * Args: sqfp - newly opened sequence data file
552 * basename - base name of dsqdata files to create
553 * errbuf - user-directed error message on normal errors
554 *
555 * Returns: <eslOK> on success.
556 *
557 * <eslEWRITE> if an output file can't be opened. <errbuf>
558 * contains user-directed error message.
559 *
560 * <eslEFORMAT> if a parse error is encountered while
561 * reading <sqfp>.
562 *
563 *
564 * Throws: <eslESYS> A system call failed, such as fwrite().
565 * <eslEINVAL> Sequence handle <sqfp> isn't digital and rewindable.
566 * <eslEMEM> Allocation failure
567 * <eslEUNIMPLEMENTED> Sequence is too long to be encoded.
568 * (TODO: chromosome-scale DNA sequences)
569 */
570 int
esl_dsqdata_Write(ESL_SQFILE * sqfp,char * basename,char * errbuf)571 esl_dsqdata_Write(ESL_SQFILE *sqfp, char *basename, char *errbuf)
572 {
573 ESL_RANDOMNESS *rng = NULL;
574 ESL_SQ *sq = NULL;
575 FILE *stubfp = NULL;
576 FILE *ifp = NULL;
577 FILE *mfp = NULL;
578 FILE *sfp = NULL;
579 char *outfile = NULL;
580 uint32_t magic = eslDSQDATA_MAGIC_V1;
581 uint32_t uniquetag;
582 uint32_t alphatype;
583 uint32_t flags = 0;
584 uint32_t max_namelen = 0;
585 uint32_t max_acclen = 0;
586 uint32_t max_desclen = 0;
587 uint64_t max_seqlen = 0;
588 uint64_t nseq = 0;
589 uint64_t nres = 0;
590 int do_pack5 = FALSE;
591 uint32_t *psq;
592 ESL_DSQDATA_RECORD idx; // one index record to write
593 int plen;
594 int64_t spos = 0;
595 int64_t mpos = 0;
596 int n;
597 int status;
598
599 if (! esl_sqfile_IsRewindable(sqfp)) ESL_EXCEPTION(eslEINVAL, "sqfp must be rewindable (e.g. an open file)");
600 if (! sqfp->abc) ESL_EXCEPTION(eslEINVAL, "sqfp must be digital");
601 // Could also check that it's positioned at the start.
602 if ( (sq = esl_sq_CreateDigital(sqfp->abc)) == NULL) { status = eslEMEM; goto ERROR; }
603
604 /* First pass over the sequence file, to get statistics.
605 * Read it now, before opening any files, in case we find any parse errors.
606 */
607 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
608 {
609 if (sq->n >= 6 * eslDSQDATA_CHUNK_MAXPACKET) // guaranteed limit
610 ESL_EXCEPTION(eslEUNIMPLEMENTED, "dsqdata cannot currently deal with large sequences");
611
612 nseq++;
613 nres += sq->n;
614 if (sq->n > max_seqlen) max_seqlen = sq->n;
615 n = strlen(sq->name); if (n > max_namelen) max_namelen = n;
616 n = strlen(sq->acc); if (n > max_acclen) max_acclen = n;
617 n = strlen(sq->desc); if (n > max_desclen) max_desclen = n;
618 esl_sq_Reuse(sq);
619 }
620 if (status == eslEFORMAT) ESL_XFAIL(eslEFORMAT, errbuf, sqfp->get_error(sqfp));
621 else if (status != eslEOF) return status;
622
623 if ((status = esl_sqfile_Position(sqfp, 0)) != eslOK) return status;
624
625
626 if (( rng = esl_randomness_Create(0) ) == NULL) { status = eslEMEM; goto ERROR; }
627 uniquetag = esl_random_uint32(rng);
628 alphatype = sqfp->abc->type;
629
630 if (alphatype == eslAMINO) do_pack5 = TRUE;
631 else if (alphatype != eslDNA && alphatype != eslRNA) ESL_EXCEPTION(eslEINVAL, "alphabet must be protein or nucleic");
632
633
634 if (( status = esl_sprintf(&outfile, "%s.dsqi", basename)) != eslOK) goto ERROR;
635 if (( ifp = fopen(outfile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata index file %s for writing", outfile);
636 sprintf(outfile, "%s.dsqm", basename);
637 if (( mfp = fopen(outfile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata metadata file %s for writing", outfile);
638 sprintf(outfile, "%s.dsqs", basename);
639 if (( sfp = fopen(outfile, "wb")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata sequence file %s for writing", outfile);
640 if (( stubfp = fopen(basename, "w")) == NULL) ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata stub file %s for writing", basename);
641
642
643
644
645 /* Header: index file */
646 if (fwrite(&magic, sizeof(uint32_t), 1, ifp) != 1 ||
647 fwrite(&uniquetag, sizeof(uint32_t), 1, ifp) != 1 ||
648 fwrite(&alphatype, sizeof(uint32_t), 1, ifp) != 1 ||
649 fwrite(&flags, sizeof(uint32_t), 1, ifp) != 1 ||
650 fwrite(&max_namelen, sizeof(uint32_t), 1, ifp) != 1 ||
651 fwrite(&max_acclen, sizeof(uint32_t), 1, ifp) != 1 ||
652 fwrite(&max_desclen, sizeof(uint32_t), 1, ifp) != 1 ||
653 fwrite(&max_seqlen, sizeof(uint64_t), 1, ifp) != 1 ||
654 fwrite(&nseq, sizeof(uint64_t), 1, ifp) != 1 ||
655 fwrite(&nres, sizeof(uint64_t), 1, ifp) != 1)
656 ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, index file header");
657
658 /* Header: metadata file */
659 if (fwrite(&magic, sizeof(uint32_t), 1, mfp) != 1 ||
660 fwrite(&uniquetag, sizeof(uint32_t), 1, mfp) != 1)
661 ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, metadata file header");
662
663 /* Header: sequence file */
664 if (fwrite(&magic, sizeof(uint32_t), 1, sfp) != 1 ||
665 fwrite(&uniquetag, sizeof(uint32_t), 1, sfp) != 1)
666 ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, metadata file header");
667
668 /* Second pass: index, metadata, and sequence files */
669 while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
670 {
671 /* Packed sequence */
672 psq = (uint32_t *) sq->dsq; // pack-in-place
673 ESL_DASSERT1(( sq->salloc >= 4 )); // required min space for pack-in-place
674 if (do_pack5) dsqdata_pack5(sq->dsq, sq->n, psq, &plen);
675 else dsqdata_pack2(sq->dsq, sq->n, psq, &plen);
676 if ( fwrite(psq, sizeof(uint32_t), plen, sfp) != plen)
677 ESL_XEXCEPTION(eslESYS, "fwrite() failed, packed seq");
678 spos += plen;
679
680 /* Metadata */
681 n = strlen(sq->name);
682 if ( fwrite(sq->name, sizeof(char), n+1, mfp) != n+1)
683 ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, name");
684 mpos += n+1;
685
686 n = strlen(sq->acc);
687 if ( fwrite(sq->acc, sizeof(char), n+1, mfp) != n+1)
688 ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, accession");
689 mpos += n+1;
690
691 n = strlen(sq->desc);
692 if ( fwrite(sq->desc, sizeof(char), n+1, mfp) != n+1)
693 ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, description");
694 mpos += n+1;
695
696 if ( fwrite( &(sq->tax_id), sizeof(int32_t), 1, mfp) != 1)
697 ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, taxonomy id");
698 mpos += sizeof(int32_t);
699
700 /* Index file */
701 idx.psq_end = spos-1; // could be -1, on 1st seq, if 1st seq L=0.
702 idx.metadata_end = mpos-1;
703 if ( fwrite(&idx, sizeof(ESL_DSQDATA_RECORD), 1, ifp) != 1)
704 ESL_XEXCEPTION(eslESYS, "fwrite () failed, index file");
705
706 esl_sq_Reuse(sq);
707 }
708
709 /* Stub file */
710 fprintf(stubfp, "Easel dsqdata v1 x%" PRIu32 "\n", uniquetag);
711 fprintf(stubfp, "\n");
712 fprintf(stubfp, "Original file: %s\n", sqfp->filename);
713 fprintf(stubfp, "Original format: %s\n", esl_sqio_DecodeFormat(sqfp->format));
714 fprintf(stubfp, "Type: %s\n", esl_abc_DecodeType(sqfp->abc->type));
715 fprintf(stubfp, "Sequences: %" PRIu64 "\n", nseq);
716 fprintf(stubfp, "Residues: %" PRIu64 "\n", nres);
717
718 esl_sq_Destroy(sq);
719 esl_randomness_Destroy(rng);
720 free(outfile);
721 fclose(stubfp);
722 fclose(ifp);
723 fclose(mfp);
724 fclose(sfp);
725 return eslOK;
726
727 ERROR:
728 if (sq) esl_sq_Destroy(sq);
729 if (rng) esl_randomness_Destroy(rng);
730 if (outfile) free(outfile);
731 if (stubfp) fclose(stubfp);
732 if (ifp) fclose(ifp);
733 if (mfp) fclose(mfp);
734 if (sfp) fclose(sfp);
735 return status;
736 }
737
738
739
740 /*****************************************************************
741 * 3. ESL_DSQDATA_CHUNK: a chunk of input sequence data
742 *****************************************************************/
743
744 static ESL_DSQDATA_CHUNK *
dsqdata_chunk_Create(ESL_DSQDATA * dd)745 dsqdata_chunk_Create(ESL_DSQDATA *dd)
746 {
747 ESL_DSQDATA_CHUNK *chu = NULL;
748 int U; // max size of unpacked seq data, in bytes (smem allocation)
749 int status;
750
751 ESL_ALLOC(chu, sizeof(ESL_DSQDATA_CHUNK));
752 chu->i0 = 0;
753 chu->N = 0;
754 chu->pn = 0;
755 chu->dsq = NULL;
756 chu->name = NULL;
757 chu->acc = NULL;
758 chu->desc = NULL;
759 chu->taxid = NULL;
760 chu->L = NULL;
761 chu->metadata = NULL;
762 chu->smem = NULL;
763 chu->nxt = NULL;
764
765 /* dsq, name, acc, desc are arrays of pointers into smem, metadata.
766 * taxid is cast to int, from the metadata.
767 * L is figured out by the unpacker.
768 * All of these are set by the unpacker.
769 */
770 ESL_ALLOC(chu->dsq, dd->chunk_maxseq * sizeof(ESL_DSQ *));
771 ESL_ALLOC(chu->name, dd->chunk_maxseq * sizeof(char *));
772 ESL_ALLOC(chu->acc, dd->chunk_maxseq * sizeof(char *));
773 ESL_ALLOC(chu->desc, dd->chunk_maxseq * sizeof(char *));
774 ESL_ALLOC(chu->taxid, dd->chunk_maxseq * sizeof(int));
775 ESL_ALLOC(chu->L, dd->chunk_maxseq * sizeof(int64_t));
776
777 /* On the <smem> allocation, and the <dsq> and <psq> pointers into it:
778 *
779 * <maxpacket> (in uint32's) sets the maximum single fread() size:
780 * one load of a new chunk of packed sequence, up to maxpacket*4
781 * bytes. <smem> needs to be able to hold both that and the fully
782 * unpacked sequence, because we unpack in place. Each packet
783 * unpacks to at most 6 or 15 residues (5-bit or 2-bit packing) We
784 * don't pack sentinels, so the maximum unpacked size includes
785 * <maxseq>+1 sentinels... because we concat the digital seqs so
786 * that the trailing sentinel of seq i is the leading sentinel of
787 * seq i+1.
788 *
789 * The packed seq (max of P bytes) loads overlap with the unpacked
790 * data (max of U bytes):
791 * psq
792 * v[ P bytes ]
793 * smem: 0........0........0..........0
794 * ^[ U bytes ]
795 * ^dsq[0] ^dsq[1] ^dsq[2]
796 *
797 * and as long as we unpack psq left to right -- and as long as we
798 * read the last packet before we write the last unpacked residues
799 * to smem - we're guaranteed that the unpacking works without
800 * overwriting any unpacked data.
801 */
802 U = (dd->pack5 ? 6 * dd->chunk_maxpacket : 15 * dd->chunk_maxpacket);
803 U += dd->chunk_maxseq + 1;
804 ESL_ALLOC(chu->smem, sizeof(ESL_DSQ) * U);
805 chu->psq = (uint32_t *) (chu->smem + U - 4*dd->chunk_maxpacket);
806
807 /* We don't have any guarantees about the amount of metadata
808 * associated with the N sequences, so <metadata> has to be a
809 * reallocatable space. We make a lowball guess for the initial
810 * alloc, on the off chance that the metadata size is small (names
811 * only, no acc/desc): minimally, say 12 bytes of name, 3 \0's, and
812 * 4 bytes for the taxid integer: call it 20.
813 */
814 chu->mdalloc = 20 * dd->chunk_maxseq;
815 ESL_ALLOC(chu->metadata, sizeof(char) * chu->mdalloc);
816
817 return chu;
818
819 ERROR:
820 dsqdata_chunk_Destroy(chu);
821 return NULL;
822 }
823
824
825 static void
dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK * chu)826 dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK *chu)
827 {
828 if (chu)
829 {
830 if (chu->metadata) free(chu->metadata);
831 if (chu->smem) free(chu->smem);
832 if (chu->L) free(chu->L);
833 if (chu->taxid) free(chu->taxid);
834 if (chu->desc) free(chu->desc);
835 if (chu->acc) free(chu->acc);
836 if (chu->name) free(chu->name);
837 if (chu->dsq) free(chu->dsq);
838 free(chu);
839 }
840 }
841
842
843 /*****************************************************************
844 * 4. Loader and unpacker, the input threads
845 *****************************************************************/
846
847 static void *
dsqdata_loader_thread(void * p)848 dsqdata_loader_thread(void *p)
849 {
850 ESL_DSQDATA *dd = (ESL_DSQDATA *) p;
851 ESL_DSQDATA_RECORD *idx = NULL;
852 ESL_DSQDATA_CHUNK *chu = NULL;
853 int64_t nchunk = 0; // total number of chunks loaded (including empty EOF)
854 int nalloc = 0; // number of chunks we create, and need to destroy.
855 int nidx = 0; // how many records in <idx>: usually MAXSEQ, until end
856 int nload = 0; // how many sequences we load: >=1, <=nidx
857 int ncarried = 0; // how many records carry over to next iteration: nidx-nload
858 int nread = 0; // fread()'s return value
859 int nmeta = 0; // how many bytes of metadata we want to read for this chunk
860 int i0 = 0; // absolute index of first record in <idx>, 0-offset
861 int64_t psq_last = -1; // psq_end for record i0-1
862 int64_t meta_last = -1; // metadata_end for record i0-1
863 int u; // which unpacker outbox we put this chunk in
864 int status;
865
866 /* Don't use <dd> until we get the structure-is-ready signal */
867 if ( pthread_mutex_lock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
868 while (! dd->go) {
869 if ( pthread_cond_wait(&dd->go_cv, &dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_wait failed on go_cv");
870 }
871 if ( pthread_mutex_unlock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
872
873 /* We can begin. */
874 ESL_ALLOC(idx, sizeof(ESL_DSQDATA_RECORD) * dd->chunk_maxseq);
875 while (1)
876 {
877 //printf("loader: working on chunk %d\n", (int) nchunk+1);
878
879 /* Get a chunk structure we can use - either by creating it, or recycling it.
880 * We probably don't benefit from having more than <nconsumers> + 3*<n_unpackers> + 2,
881 * which is enough to have all threads working, all in/outboxes full, and at least 1
882 * waiting in recycling.
883 * SRE TODO: test, how many is optimal, does it matter?
884 * the two limits below perform comparably in `esl_dsqdata_example -n`, but that
885 * doesn't have high-cpu reader threads.
886 */
887 //if (nalloc < dd->nconsumers + dd->n_unpackers + 1)
888 if (nalloc < dd->nconsumers + 3 * dd->n_unpackers + 2)
889 {
890 //printf("loader: creating a new chunk\n");
891
892 if ( (chu = dsqdata_chunk_Create(dd)) == NULL) { status = eslEMEM; goto ERROR; }
893 nalloc++;
894 }
895 else
896 {
897 //printf("loader: getting a new chunk from recycling...\n");
898
899 if ( pthread_mutex_lock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
900 while (dd->recycling == NULL) {
901 if ( pthread_cond_wait(&dd->recycling_cv, &dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
902 }
903 chu = dd->recycling; // pop one off recycling stack
904 dd->recycling = chu->nxt;
905 if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
906 if ( pthread_cond_signal(&dd->recycling_cv) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed"); // signal *after* unlocking mutex
907
908 //printf("loader: ... done, have new chunk from recycling.\n");
909 }
910
911 /* Refill index. (The memmove is avoidable. Alt strategy: we could load in 2 frames)
912 * The previous loop loaded packed sequence for <nload'> of the <nidx'> entries,
913 * where the 's indicate the variable has carried over from prev iteration:
914 * |----- nload' ----||--- (ncarried) ---|
915 * |-------------- nidx' ----------------|
916 * Now we're going to shift the remainder ncarried = nidx-nload to the left, then refill:
917 * |---- ncarried ----||--- (MAXSEQ-ncarried) ---|
918 * |-------------- MAXSEQ -----------------------|
919 * while watching out for the terminal case where we run out of
920 * data, loading less than (MAXSEQ-ncarried) records:
921 * |---- ncarried ----||--- nidx* ---|
922 * |------------- nidx --------------|
923 * where the <nidx*> is what fread() returns to us.
924 */
925 i0 += nload; // this chunk starts with seq #<i0>
926 ncarried = (nidx - nload);
927 memmove(idx, idx + nload, sizeof(ESL_DSQDATA_RECORD) * ncarried);
928 nidx = fread(idx + ncarried, sizeof(ESL_DSQDATA_RECORD), dd->chunk_maxseq - ncarried, dd->ifp);
929 nidx += ncarried; // usually, this'll be MAXSEQ, unless we're near EOF.
930
931 if (nidx == 0) // then we're EOD.
932 {
933 //printf("loader: reached EOD.\n");
934 dsqdata_chunk_Destroy(chu);
935 nalloc--; // we'd counted that chunk towards <nalloc>.
936 break; // this is the only way out of loader's main loop
937 }
938
939
940 /* Figure out how many sequences we're going to load: <nload>
941 * nload = max i : i <= MAXSEQ && idx[i].psq_end - psq_last <= CHUNK_MAX
942 */
943 ESL_DASSERT1(( idx[0].psq_end - psq_last <= dd->chunk_maxpacket ));
944 if (idx[nidx-1].psq_end - psq_last <= dd->chunk_maxpacket)
945 nload = nidx;
946 else
947 { // Binary search for nload = max_i idx[i-1].psq_end - lastend <= MAX
948 int righti = nidx;
949 int mid;
950 nload = 1;
951 while (righti - nload > 1)
952 {
953 mid = nload + (righti - nload) / 2;
954 if (idx[mid-1].psq_end - psq_last <= dd->chunk_maxpacket) nload = mid;
955 else righti = mid;
956 }
957 }
958
959 /* Read packed sequence. */
960 //printf("loader: loading chunk %d from disk.\n", (int) nchunk+1);
961
962 chu->pn = idx[nload-1].psq_end - psq_last;
963 nread = fread(chu->psq, sizeof(uint32_t), chu->pn, dd->sfp);
964 //printf("Read %d packed ints from seq file\n", nread);
965 if ( nread != chu->pn ) ESL_XEXCEPTION(eslEOD, "dsqdata packet loader: expected %d, got %d", chu->pn, nread);
966
967
968 /* Read metadata, reallocating if needed */
969 nmeta = idx[nload-1].metadata_end - meta_last;
970 if (nmeta > chu->mdalloc) {
971 ESL_REALLOC(chu->metadata, sizeof(char) * nmeta); // should be realloc by doubling instead?
972 chu->mdalloc = nmeta;
973 }
974 nread = fread(chu->metadata, sizeof(char), nmeta, dd->mfp);
975 if ( nread != nmeta ) ESL_XEXCEPTION(eslEOD, "dsqdata metadata loader: expected %d, got %d", nmeta, nread);
976
977 chu->i0 = i0;
978 chu->N = nload;
979 psq_last = idx[nload-1].psq_end;
980 meta_last = idx[nload-1].metadata_end;
981
982 /* Put chunk in appropriate outbox.
983 */
984 u = nchunk % dd->n_unpackers; // note this is the loader's own private <nchunk> counter, not the consumer-shared one in <dd>
985 //printf("loader: about to put chunk %d into inbox %d\n", (int) nchunk+1, u);
986 if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
987 while (dd->inbox[u] != NULL) {
988 if (pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
989 }
990 dd->inbox[u] = chu;
991 if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
992 if ( pthread_cond_signal(&(dd->inbox_cv[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
993 //printf("loader: finished putting chunk %d into inbox %d\n", (int) nchunk+1, u);
994
995 nchunk++;
996 }
997
998 /* Cleanup time. First, set all unpacker inboxes to EOD state. (We overwrite <u> here.)
999 */
1000 for (u = 0; u < dd->n_unpackers; u++)
1001 {
1002 //printf("loader: setting EOD on inbox %d\n", u);
1003 if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1004 while (dd->inbox[u] != NULL) {
1005 if (pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1006 }
1007 dd->inbox_eod[u] = TRUE; // we know inbox[u] is already NULL
1008 if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1009 if ( pthread_cond_signal(&(dd->inbox_cv[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1010 }
1011
1012
1013 /* Then, wait to get all our chunks back through the recycling, so we
1014 * can free them and exit cleanly. We counted them as they went out,
1015 * in <nalloc>, so we know how many need to come home to roost.
1016 */
1017 while (nalloc)
1018 {
1019 //printf("loader: clearing recycling, %d chunks left to free\n", nalloc);
1020 if ( pthread_mutex_lock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1021 while (dd->recycling == NULL) { // Readers may still be working, will Recycle() their chunks
1022 if ( pthread_cond_wait(&dd->recycling_cv, &dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1023 }
1024 while (dd->recycling != NULL) { // Free entire stack, while we have the mutex locked.
1025 chu = dd->recycling;
1026 dd->recycling = chu->nxt;
1027 dsqdata_chunk_Destroy(chu);
1028 nalloc--;
1029 }
1030 if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1031 /* Because the recycling is a stack, consumers never have to wait
1032 * on a condition to Recycle(), so we don't need to signal anything.
1033 */
1034 }
1035 //printf("loader: exiting\n");
1036 free(idx);
1037 pthread_exit(NULL);
1038
1039 ERROR:
1040 /* Defying Easel standards, we treat all exceptions as fatal, at
1041 * least for the moment. This isn't a problem in HMMER, Infernal
1042 * because they already use fatal exception handlers (i.e., we never
1043 * reach this code anyway, if the parent app is using default fatal
1044 * exception handling). It would become a problem if an Easel-based
1045 * app needs to assure no exits from within Easel. Because the other
1046 * threads will block waiting for chunks to come from the loader, if
1047 * the loader fails, we would need a back channel signal of some
1048 * sort to get the other threads to clean up and terminate.
1049 */
1050 if (idx) free(idx);
1051 esl_fatal(" ... dsqdata loader thread failed: unrecoverable");
1052 }
1053
1054
1055
1056 static void *
dsqdata_unpacker_thread(void * p)1057 dsqdata_unpacker_thread(void *p)
1058 {
1059 ESL_DSQDATA *dd = (ESL_DSQDATA *) p;
1060 ESL_DSQDATA_CHUNK *chu = NULL;
1061 pthread_t my_id = pthread_self();
1062 int u;
1063 int status;
1064
1065 /* Don't use <dd> until we get the structure-is-ready signal */
1066 if ( pthread_mutex_lock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
1067 while (! dd->go) {
1068 if ( pthread_cond_wait(&dd->go_cv, &dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_wait failed on go_cv");
1069 }
1070 if ( pthread_mutex_unlock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
1071
1072 /* There may be more than one unpacker. Figure out who we are, so we
1073 * know which slot is ours. (This is why it's especially important
1074 * for the unpacker threads to wait for <dd> to be fully
1075 * initialized.)
1076 */
1077 for (u = 0; u < dd->n_unpackers; u++)
1078 if ( pthread_equal(my_id, dd->unpacker_t[u] )) break;
1079 if (u >= dd->n_unpackers) esl_fatal("unpacker failed to figure out its identity");
1080
1081 //printf("unpacker thread %d: ready.\n", u);
1082
1083 /* Ready. Let's go. */
1084 do {
1085 //printf("unpacker thread %d: checking for next chunk.\n", u);
1086
1087 /* Get a chunk from loader in our inbox. Wait if necessary. */
1088 if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1089 while (! dd->inbox_eod[u] && dd->inbox[u] == NULL) {
1090 if ( pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1091 }
1092 chu = dd->inbox[u]; // NULL if we're EOD
1093 dd->inbox[u] = NULL;
1094 if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1095
1096 //if (chu) printf("unpacker thread %d: took encoded chunk from inbox\n", u);
1097 //else printf("unpacker thread %d: EOD\n", u);
1098
1099 if (chu) {
1100 /* only need to signal inbox change to the loader if we're not EOD */
1101 if ( pthread_cond_signal(&(dd->inbox_cv[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1102 /* unpack it */
1103 if (( status = dsqdata_unpack_chunk(chu, dd->pack5)) != eslOK) goto ERROR;
1104 }
1105
1106 /* Put unpacked chunk into the unpacker's outbox, or set EOD status.
1107 * May need to wait for it to be empty/available.
1108 */
1109 if ( pthread_mutex_lock(&(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1110 while (dd->outbox[u] != NULL) {
1111 if ( pthread_cond_wait(&(dd->outbox_cv[u]), &(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1112 }
1113 dd->outbox[u] = chu; // that's NULL if we're EOD
1114 if (! chu) dd->outbox_eod[u] = TRUE;
1115 if ( pthread_mutex_unlock(&(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1116 if ( pthread_cond_signal(&(dd->outbox_cv[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1117
1118 //if (chu) printf("unpacker thread %d: placed unpacked chunk on outbox\n", u);
1119 //else printf("unpacker thread %d: exiting EOD\n", u);
1120 } while (chu);
1121 pthread_exit(NULL);
1122
1123 ERROR:
1124 /* See comment in loader thread: for lack of a back channel mechanism
1125 * to tell other threads to clean up and terminate, we violate Easel standards
1126 * and turn nonfatal exceptions into fatal ones.
1127 */
1128 esl_fatal(" ... dsqdata unpacker thread failed: unrecoverable");
1129 }
1130
1131
1132 /*****************************************************************
1133 * 5. Packing sequences and unpacking chunks
1134 *****************************************************************/
1135
1136 /* dsqdata_unpack_chunk()
1137 *
1138 * <do_pack5> is a hint: if caller knows that all the packets in the
1139 * chunk are 5-bit encoded (i.e. amino acid sequence), it can pass
1140 * <TRUE>, enabling a small optimization. Otherwise the packed
1141 * sequences will be treated as mixed 2- and 5-bit encoding, as is
1142 * needed for DNA/RNA sequences; protein sequences also unpack fine
1143 * that way, but the 5-bit flag on every packet needs to be checked.
1144 *
1145 * Throws: <eslEFORMAT> if a problem is seen in the binary format
1146 */
1147 static int
dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK * chu,int do_pack5)1148 dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK *chu, int do_pack5)
1149 {
1150 char *ptr = chu->metadata; // ptr will walk through metadata
1151 int r; // position in unpacked dsq array
1152 int i; // sequence index: 0..chu->N-1
1153 int pos; // position in packet array
1154 int L; // an unpacked sequence length
1155 int P; // number of packets unpacked
1156
1157 /* "Unpack" the metadata */
1158 for (i = 0; i < chu->N; i++)
1159 {
1160 /* The data are user input, so we cannot trust that it has \0's where we expect them. */
1161 if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1162 chu->name[i] = ptr; ptr = 1 + strchr(ptr, '\0'); if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1163 chu->acc[i] = ptr; ptr = 1 + strchr(ptr, '\0'); if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1164 chu->desc[i] = ptr; ptr = 1 + strchr(ptr, '\0'); if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1165 chu->taxid[i] = (int32_t) *((int32_t *) ptr); ptr += sizeof(int32_t);
1166 }
1167
1168 /* Unpack the sequence data */
1169 i = 0;
1170 r = 0;
1171 pos = 0;
1172 chu->smem[0] = eslDSQ_SENTINEL; // This initialization is why <smem> needs to be unsigned.
1173 while (pos < chu->pn)
1174 {
1175 chu->dsq[i] = (ESL_DSQ *) chu->smem + r;
1176 if (do_pack5) dsqdata_unpack5(chu->psq + pos, chu->dsq[i], &L, &P);
1177 else dsqdata_unpack2(chu->psq + pos, chu->dsq[i], &L, &P);
1178
1179 r += L+1; // L+1, not L+2, because we overlap start/end sentinels
1180 pos += P;
1181 chu->L[i] = L;
1182 i++;
1183 }
1184
1185 ESL_DASSERT1(( pos == chu->pn )); // we should've unpacked exactly pn packets,
1186 ESL_DASSERT1(( i == chu->N )); // .. and exactly N sequences.
1187 return eslOK;
1188 }
1189
1190
1191 /* Unpack 5-bit encoded sequence, starting at <psq>.
1192 * Important: dsq[0] is already initialized to eslDSQ_SENTINEL,
1193 * as a nitpicky optimization (the sequence data in a chunk are
1194 * concatenated so that they share end/start sentinels).
1195 */
1196 static int
dsqdata_unpack5(uint32_t * psq,ESL_DSQ * dsq,int * ret_L,int * ret_P)1197 dsqdata_unpack5(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P)
1198 {
1199 int pos = 0; // position in psq[]
1200 int r = 1; // position in dsq[]. caller set dsq[0] to eslDSQ_SENTINEL.
1201 uint32_t v = psq[pos++];
1202 int b; // bit shift counter
1203
1204 while (! ESL_DSQDATA_EOD(v)) // we trust that we'll see a sentinel at the end
1205 {
1206 ESL_DASSERT1(( ESL_DSQDATA_5BIT(v) )); // All packets are 5-bit encoded
1207 dsq[r++] = (v >> 25) & 31; dsq[r++] = (v >> 20) & 31; dsq[r++] = (v >> 15) & 31;
1208 dsq[r++] = (v >> 10) & 31; dsq[r++] = (v >> 5) & 31; dsq[r++] = (v >> 0) & 31;
1209 v = psq[pos++];
1210 }
1211
1212 /* Unpack sentinel packet, which may be partial; it can even contain
1213 * zero residues in the edge case of an L=0 sequence.
1214 */
1215 ESL_DASSERT1(( ESL_DSQDATA_5BIT(v) ));
1216 for (b = 25; b >= 0 && ((v >> b) & 31) != 31; b -= 5)
1217 dsq[r++] = (v >> b) & 31;
1218 dsq[r++] = eslDSQ_SENTINEL;
1219 // r is now L+2: the raw sequence length + 2 sentinels
1220 // P = pos, because pos index advanced to next packet after sentinel
1221 *ret_L = r-2;
1222 *ret_P = pos;
1223 return eslOK;
1224 }
1225
1226 /* Unpack 2-bit (+ mixed 5-bit for noncanonicals) encoding.
1227 * Important: dsq[0] is already initialized to eslDSQ_SENTINEL
1228 *
1229 * This will work for protein sequences just fine; just a little
1230 * slower than calling dsqdata_unpack5(), because here we have
1231 * to check the 5-bit encoding bit on every packet.
1232 */
1233 static int
dsqdata_unpack2(uint32_t * psq,ESL_DSQ * dsq,int * ret_L,int * ret_P)1234 dsqdata_unpack2(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P)
1235 {
1236 int pos = 0;
1237 int r = 1;
1238 uint32_t v = psq[pos++];
1239 int b; // bit shift counter
1240
1241 while (! ESL_DSQDATA_EOD(v))
1242 {
1243 if ( ESL_DSQDATA_5BIT(v)) // 5-bit encoded, full. Don't need mask on bit 31 because we know it's down.
1244 {
1245 dsq[r++] = (v >> 25) & 31; dsq[r++] = (v >> 20) & 31; dsq[r++] = (v >> 15) & 31;
1246 dsq[r++] = (v >> 10) & 31; dsq[r++] = (v >> 5) & 31; dsq[r++] = (v >> 0) & 31;
1247 }
1248 else // 2-bit encoded, full
1249 {
1250 dsq[r++] = (v >> 28) & 3; dsq[r++] = (v >> 26) & 3; dsq[r++] = (v >> 24) & 3;
1251 dsq[r++] = (v >> 22) & 3; dsq[r++] = (v >> 20) & 3; dsq[r++] = (v >> 18) & 3;
1252 dsq[r++] = (v >> 16) & 3; dsq[r++] = (v >> 14) & 3; dsq[r++] = (v >> 12) & 3;
1253 dsq[r++] = (v >> 10) & 3; dsq[r++] = (v >> 8) & 3; dsq[r++] = (v >> 6) & 3;
1254 dsq[r++] = (v >> 4) & 3; dsq[r++] = (v >> 2) & 3; dsq[r++] = (v >> 0) & 3;
1255 }
1256 v = psq[pos++];
1257 }
1258
1259 /* Sentinel packet.
1260 * If 2-bit, it's full. If 5-bit, it's usually partial, and may even be 0-len.
1261 */
1262 if ( ESL_DSQDATA_5BIT(v)) // 5-bit, partial
1263 {
1264 for (b = 25; b >= 0 && ((v >> b) & 31) != 31; b -= 5)
1265 dsq[r++] = (v >> b) & 31;
1266 }
1267 else
1268 {
1269 dsq[r++] = (v >> 28) & 3; dsq[r++] = (v >> 26) & 3; dsq[r++] = (v >> 24) & 3;
1270 dsq[r++] = (v >> 22) & 3; dsq[r++] = (v >> 20) & 3; dsq[r++] = (v >> 18) & 3;
1271 dsq[r++] = (v >> 16) & 3; dsq[r++] = (v >> 14) & 3; dsq[r++] = (v >> 12) & 3;
1272 dsq[r++] = (v >> 10) & 3; dsq[r++] = (v >> 8) & 3; dsq[r++] = (v >> 6) & 3;
1273 dsq[r++] = (v >> 4) & 3; dsq[r++] = (v >> 2) & 3; dsq[r++] = (v >> 0) & 3;
1274 }
1275 dsq[r++] = eslDSQ_SENTINEL;
1276
1277 *ret_L = r-2;
1278 *ret_P = pos;
1279 return eslOK;
1280 }
1281
1282
1283 /* dsqdata_pack5()
1284 *
1285 * Pack a digital (protein) sequence <dsq> of length <n>, into <psq>
1286 * using 5-bit encoding; return the number of packets <*ret_P>.
1287 *
1288 * <psq> must be allocated for at least $MAX(1, (n+5)/6)$ packets.
1289 *
1290 * You can pack in place, by passing the same pointer <dsq> as <psq>,
1291 * provided that dsq is allocated for at least 1 packet (4 bytes). We
1292 * know that <psq> is either smaller than <dsq> ($4P <= n$) or that it
1293 * consists of one EOD packet (in the case n=0).
1294 */
1295 static int
dsqdata_pack5(ESL_DSQ * dsq,int n,uint32_t * psq,int * ret_P)1296 dsqdata_pack5(ESL_DSQ *dsq, int n, uint32_t *psq, int *ret_P)
1297 {
1298 int r = 1; // position in <dsq>
1299 int pos = 0; // position in <psq>.
1300 int b; // bitshift
1301 uint32_t v; // tmp var needed to guarantee pack-in-place works
1302
1303 while (r <= n)
1304 {
1305 v = eslDSQDATA_5BIT; // initialize packet with 5-bit flag
1306 for (b = 25; b >= 0 && r <= n; b -= 5) v |= (uint32_t) dsq[r++] << b;
1307 for ( ; b >= 0; b -= 5) v |= (uint32_t) 31 << b;
1308
1309 if (r > n) v |= eslDSQDATA_EOD; // EOD bit
1310 psq[pos++] = v; // we know we've already read all the dsq we need under psq[pos]
1311 }
1312
1313 /* Special case of n=0: we need an empty EOD sentinel packet. */
1314 if (pos == 0) { v = 0; psq[pos++] = ~v; } // all bits set: | EOD | 5BIT | all sentinels |
1315
1316 *ret_P = pos;
1317 return eslOK;
1318 }
1319
1320
1321 /* dsqdata_pack2()
1322 *
1323 * Pack a digital (nucleic) sequence <dsq> of total length
1324 * <n>, into <psq>; return the number of packets <*ret_P>.
1325 *
1326 * <psq> must be allocated for at least $MAX(1, (n+5)/6)$ packets.
1327 * (Yes, even in 2-bit packing, because worst case, the sequence
1328 * contains so many noncanonicals that it's entirely 5-bit encoded.)
1329 *
1330 * You can pack in place, by passing the same pointer <dsq> as <psq>,
1331 * provided that dsq is allocated for at least 1 packet (4 bytes). We
1332 * know that <psq> is either smaller than <dsq> ($4P <= n$) or that it
1333 * consists of one EOD packet (in the case n=0).
1334 */
1335 static int
dsqdata_pack2(ESL_DSQ * dsq,int n,uint32_t * psq,int * ret_P)1336 dsqdata_pack2(ESL_DSQ *dsq, int n, uint32_t *psq, int *ret_P)
1337 {
1338 int pos = 0; // position in <psq>
1339 int d = 0; // position of next degen residue, 1..n, n+1 if none
1340 int r = 1; // position in <dsq> 1..n
1341 int b; // bitshift
1342 uint32_t v; // tmp var needed to guarantee pack-in-place works
1343
1344 while (r <= n)
1345 {
1346 // Slide the "next degenerate residue" detector
1347 if (d < r)
1348 for (d = r; d <= n; d++)
1349 if (dsq[d] > 3) break;
1350
1351 // Can we 2-bit pack the next 15 residues, r..r+14?
1352 // n-r+1 = number of residues remaining to be packed.
1353 if (n-r+1 >= 15 && d > r+14)
1354 {
1355 v = 0;
1356 for (b = 28; b >= 0; b -=2) v |= (uint32_t) dsq[r++] << b;
1357 }
1358 else
1359 {
1360 v = eslDSQDATA_5BIT; // initialize v with 5-bit packing bit
1361 for (b = 25; b >= 0 && r <= n; b -= 5) v |= (uint32_t) dsq[r++] << b;
1362 for ( ; b >= 0; b -= 5) v |= (uint32_t) 31 << b;
1363 }
1364
1365 if (r > n) v |= eslDSQDATA_EOD; // EOD bit
1366 psq[pos++] = v; // we know we've already read all the dsq we need under psq[pos]
1367 }
1368
1369 /* Special case of n=0: we need an empty EOD sentinel packet.
1370 * Sentinel packets are 5-bit encoded, even in 2-bit coding scheme
1371 */
1372 if (pos == 0) { v = 0; psq[pos++] = ~v; } // all bits set: | EOD | 5BIT | all sentinels |
1373
1374 *ret_P = pos;
1375 return eslOK;
1376 }
1377
1378
1379 /*****************************************************************
1380 * 6. Notes
1381 *****************************************************************
1382 *
1383 * [1] Packed sequence data format.
1384 *
1385 * Format of a single packet:
1386 * [31] [30] [29..25] [24..20] [19..15] [14..10] [ 9..5 ] [ 4..0 ]
1387 * ^ ^ |------------ 6 5-bit packed residues ------------------|
1388 * | | [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
1389 * | | |----------- or 15 2-bit packed residues ----------------|
1390 * | |
1391 * | "packtype" bit 30 = 0 if packet is 2-bit packed; 1 if 5-bit packed
1392 * "sentinel" bit 31 = 1 if last packet in packed sequence; else 0
1393 *
1394 * (packet & (1 << 31)) tests for end of sequence
1395 * (packet & (1 << 30)) tests for 5-bit packing vs. 2-bit
1396 * ((packet >> shift) && 31) decodes 5-bit, for shift=25..0 in steps of 5
1397 * ((packet >> shift) && 3) decodes 2-bit, for shift=28..0 in steps of 2
1398 *
1399 * Packets without the sentinel bit set are always full (unpack
1400 * to 15 or 6 residue codes).
1401 *
1402 * 5-bit EOD packets may be partial: they unpack to 0..6
1403 * residues. The remaining residue codes are set to 0x1f
1404 * (11111) to indicate EOD within a partial packet.
1405 *
1406 * A 0-length sequence is encoded by a 5-bit partial EOD packet
1407 * with 0 residues. This is the only case in which a partial
1408 * packet contains 0 residues. (Because we can end with an EOD
1409 * full packet, there is no other case where we end up with 0
1410 * leftover residues to encode.)
1411 *
1412 * 2-bit EOD packets must be full, because there is no way to
1413 * signal EOD locally within a 2-bit packet. Can't use 0x03 (11)
1414 * because that's T/U. Generally, then, the last packet of a
1415 * nucleic acid sequence must be 5-bit encoded, solely to be
1416 * able to encode EOD in a partial packet.
1417 *
1418 * A packed sequence consists of an integer number of packets,
1419 * P, which ends with an EOD packet that may contain a partial
1420 * number of residues. P packets are guaranteed to be able to
1421 * encode at least 6P residues in either scheme.
1422 *
1423 * A sequence of length L packs into P <= MAX(1, (N+5)/6)
1424 * packets. (1, because a 0-length sequence still requires an
1425 * EOD packet.) This is true even for nucleic sequences, because
1426 * noncanonical residues can force DNA/RNA sequence to pack
1427 * entirely in 5-bit coding.
1428 *
1429 * A packed amino acid sequence unpacks to 6P-5 <= L <= 6P
1430 * residues (for P>1; 0 <= L <= 6 for P=1) and all packets are
1431 * 5-bit encoded.
1432 *
1433 * A packed nucleic acid sequence unpacks to 6P-5 <= L <= 15P
1434 * residues (for P>1; 0 <= L <= 15 for P=1). The packets are a
1435 * mix of 2-bit and 5-bit. Degenerate residues must be 5-bit
1436 * packed, and the EOD packet usually is too. A 5-bit packet
1437 * does not have to contain degenerate residues, because it may
1438 * have been necessary to get "in frame" to pack a downstream
1439 * degenerate residue. For example, the sequence
1440 * ACGTACGTNNA... must be packed as [ACGTAC][CGTNNA]... to get
1441 * the N's packed correctly.
1442 *
1443 * [2] Compression: relative incompressibility of biological sequences.
1444 *
1445 * Considered using fast (de)compression algorithms that are fast
1446 * enough to keep up with disk read speed, including LZ4 and
1447 * Google's Snappy. However, lz4 only achieves 1.0-1.9x global
1448 * compression of protein sequence (compared to 1.5x for
1449 * packing), and 2.0x for DNA (compared to 3.75x for packing).
1450 * With local, blockwise compression, which we need for random
1451 * access and indexing, it gets worse. Packing is superior.
1452 *
1453 * Metadata compression is more feasible, but I still opted
1454 * against it. Although metadata are globally quite compressible
1455 * (3.2-6.9x in trials with lz4), locally in 64K blocks lz4 only
1456 * achieves 2x. [xref SRE:2016/0201-seqfile-compression]
1457 *
1458 * [3] Maybe getting more packing using run-length encoding.
1459 *
1460 * Genome assemblies typically have long runs of N's (human
1461 * GRCh38.p2 is about 5% N), and it's excruciating to have to
1462 * pack it into bulky 5-bit degenerate packets. I considered
1463 * run-length encoding (RLE). One possibility is to use a special
1464 * packet format akin to the 5-bit packet format:
1465 *
1466 * [0] [?] [11111] [.....] [....................]
1467 * ^ ^ ^ 20b number, <=2^20-1
1468 * | | 5-bit residue code
1469 * | sentinel residue 31 set
1470 * sentinel bit unset
1471 *
1472 * This is a uniquely detectable packet structure because a full
1473 * packet (with unset sentinel bit) would otherwise never contain
1474 * a sentinel residue (code 31).
1475 *
1476 * However, using RLE would make our unpacked data sizes too
1477 * unpredictable; we wouldn't have the <=6P or <=15P guarantee,
1478 * so we couldn't rely on fixed-length allocation of <smem> in
1479 * our chunk. Consumers wouldn't be getting predictable chunk
1480 * sizes, which could complicate load balancing. I decided
1481 * against it.
1482 */
1483
1484
1485 /*****************************************************************
1486 * 7. Unit tests
1487 *****************************************************************/
1488 #ifdef eslDSQDATA_TESTDRIVE
1489
1490 #include "esl_randomseq.h"
1491
1492 /* Exercise the packing and unpacking routines:
1493 * dsqdata_pack2, dsqdata_pack5, and dsqdata_unpack
1494 */
1495 static void
utest_packing(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc,int nsamples)1496 utest_packing(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, int nsamples)
1497 {
1498 char msg[] = "esl_dsqdata :: packing unit test failed";
1499 ESL_DSQ *dsq = NULL; // We start with a dirty random sequence...
1500 uint32_t *psq = NULL; // ... pack it ...
1501 ESL_DSQ *dsq2 = NULL; // ... and unpack it. Then check that it's the same seq.
1502 int L_max = 46; // We'll sample L on 0..L_max. L_max doesn't need to be large to exercise well.
1503 int P_max = ESL_MAX(1, (L_max + 5) / 6); // So sayeth the docs, so let's test it.
1504 int L, P, L2, P2;
1505 int i;
1506
1507 if ((dsq = malloc(sizeof(ESL_DSQ) * (L_max + 2))) == NULL) esl_fatal(msg);
1508 if ((psq = malloc(sizeof(uint32_t) * P_max)) == NULL) esl_fatal(msg);
1509 if ((dsq2 = malloc(sizeof(ESL_DSQ) * (L_max + 2))) == NULL) esl_fatal(msg);
1510
1511 for (i = 0; i < nsamples; i++)
1512 {
1513 L = esl_rnd_Roll(rng, L_max+1); // 0..L_max
1514
1515 esl_rsq_SampleDirty(rng, abc, NULL, L, dsq);
1516
1517 if (abc->type == eslAMINO) { if ( dsqdata_pack5(dsq, L, psq, &P) != eslOK) esl_fatal(msg); }
1518 else { if ( dsqdata_pack2(dsq, L, psq, &P) != eslOK) esl_fatal(msg); }
1519
1520 dsq2[0] = eslDSQ_SENTINEL; // interface to _unpack functions requires caller to do this
1521 if (abc->type == eslAMINO) { if ( dsqdata_unpack5(psq, dsq2, &L2, &P2) != eslOK) esl_fatal(msg); }
1522 else { if ( dsqdata_unpack2(psq, dsq2, &L2, &P2) != eslOK) esl_fatal(msg); }
1523
1524 if (L2 != L) esl_fatal(msg);
1525 if (P2 != P) esl_fatal(msg);
1526 if (memcmp((void *) dsq, (void *) dsq2, L+2) != 0) esl_fatal(msg);
1527
1528 /* Write garbage into the buffers, so nobody's cheating on the test somehow */
1529 esl_rnd_mem(rng, (void *) dsq, L_max+2);
1530 esl_rnd_mem(rng, (void *) dsq2, L_max+2);
1531 esl_rnd_mem(rng, (void *) psq, (sizeof(uint32_t) * P_max));
1532 }
1533
1534 free(dsq);
1535 free(psq);
1536 free(dsq2);
1537 }
1538
1539
1540 static void
utest_readwrite(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc)1541 utest_readwrite(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc)
1542 {
1543 char msg[] = "esl_dsqdata :: readwrite unit test failed";
1544 char tmpfile[16] = "esltmpXXXXXX";
1545 char basename[32];
1546 ESL_SQ **sqarr = NULL;
1547 FILE *tmpfp = NULL;
1548 ESL_SQFILE *sqfp = NULL;
1549 ESL_DSQDATA *dd = NULL;
1550 ESL_DSQDATA_CHUNK *chu = NULL;
1551 int nseq = 1 + esl_rnd_Roll(rng, 20000); // 1..20000
1552 int maxL = 100;
1553 int i;
1554 int status;
1555
1556 /* 1. Sample <nseq> random dirty digital sequences, storing them for later comparison;
1557 * write them out to a tmp FASTA file. The Easel FASTA format writer writes <name> <acc>
1558 * <desc> on the descline, but the reader only reads <name> <desc> (as is standard
1559 * for FASTA format), so blank the accession to avoid confusion.
1560 */
1561 if (( status = esl_tmpfile_named(tmpfile, &tmpfp)) != eslOK) esl_fatal(msg);
1562 if (( sqarr = malloc(sizeof(ESL_SQ *) * nseq)) == NULL) esl_fatal(msg);
1563 for (i = 0; i < nseq; i++)
1564 {
1565 sqarr[i] = NULL;
1566 if (( status = esl_sq_Sample(rng, abc, maxL, &(sqarr[i]))) != eslOK) esl_fatal(msg);
1567 if (( status = esl_sq_SetAccession(sqarr[i], "")) != eslOK) esl_fatal(msg);
1568 if (( status = esl_sqio_Write(tmpfp, sqarr[i], eslSQFILE_FASTA, FALSE)) != eslOK) esl_fatal(msg);
1569 }
1570 fclose(tmpfp);
1571
1572 /* 2. Make a dsqdata database from the FASTA tmpfile.
1573 */
1574 if (( status = esl_sqfile_OpenDigital(abc, tmpfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) esl_fatal(msg);
1575 if (( snprintf(basename, 32, "%s-db", tmpfile)) <= 0) esl_fatal(msg);
1576 if (( status = esl_dsqdata_Write(sqfp, basename, NULL)) != eslOK) esl_fatal(msg);
1577 esl_sqfile_Close(sqfp);
1578
1579 /* 3. Open and read the dsqdata; compare to the original sequences.
1580 */
1581 if (( status = esl_dsqdata_Open(&abc, basename, 1, &dd)) != eslOK) esl_fatal(msg);
1582 while (( status = esl_dsqdata_Read(dd, &chu)) == eslOK)
1583 {
1584 for (i = 0; i < chu->N; i++)
1585 {
1586 if ( chu->L[i] != sqarr[i+chu->i0]->n ) esl_fatal(msg);
1587 if ( memcmp( chu->dsq[i], sqarr[i+chu->i0]->dsq, chu->L[i]) != 0) esl_fatal(msg);
1588 if ( strcmp( chu->name[i], sqarr[i+chu->i0]->name) != 0) esl_fatal(msg);
1589 // FASTA does not read accession - instead we get both accession/description as <desc>
1590 if ( strcmp( chu->desc[i], sqarr[i+chu->i0]->desc) != 0) esl_fatal(msg);
1591 // FASTA also does not store taxid - so don't test that either
1592 }
1593 esl_dsqdata_Recycle(dd, chu);
1594 }
1595 if (status != eslEOF) esl_fatal(msg);
1596 esl_dsqdata_Close(dd);
1597
1598 remove(tmpfile);
1599 remove(basename);
1600 snprintf(basename, 32, "%s-db.dsqi", tmpfile); remove(basename);
1601 snprintf(basename, 32, "%s-db.dsqm", tmpfile); remove(basename);
1602 snprintf(basename, 32, "%s-db.dsqs", tmpfile); remove(basename);
1603 for (i = 0; i < nseq; i++) esl_sq_Destroy(sqarr[i]);
1604 free(sqarr);
1605 }
1606 #endif /*eslDSQDATA_TESTDRIVE*/
1607
1608
1609
1610 /*****************************************************************
1611 * 8. Test driver
1612 *****************************************************************/
1613 #ifdef eslDSQDATA_TESTDRIVE
1614
1615 #include "easel.h"
1616 #include "esl_alphabet.h"
1617 #include "esl_dsqdata.h"
1618 #include "esl_getopts.h"
1619 #include "esl_random.h"
1620
1621 static ESL_OPTIONS options[] = {
1622 /* name type default env range toggles reqs incomp help docgroup*/
1623 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1624 { "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
1625 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1626 };
1627 static char usage[] = "[-options]";
1628 static char banner[] = "unit test driver for Easel dsqdata module";
1629
1630 int
main(int argc,char ** argv)1631 main(int argc, char **argv)
1632 {
1633 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1634 ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1635 ESL_ALPHABET *amino = esl_alphabet_Create(eslAMINO);
1636 ESL_ALPHABET *nucleic = esl_alphabet_Create(eslRNA);
1637 int nsamples = 100;
1638
1639 fprintf(stderr, "## %s\n", argv[0]);
1640 fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
1641
1642 utest_packing(rng, nucleic, nsamples);
1643 utest_packing(rng, amino, nsamples);
1644
1645 utest_readwrite(rng, nucleic);
1646 utest_readwrite(rng, amino);
1647
1648 fprintf(stderr, "# status = ok\n");
1649
1650 esl_alphabet_Destroy(amino);
1651 esl_alphabet_Destroy(nucleic);
1652 esl_randomness_Destroy(rng);
1653 esl_getopts_Destroy(go);
1654 exit(0);
1655 }
1656 #endif /*eslDSQDATA_TESTDRIVE*/
1657
1658 /*****************************************************************
1659 * 9. Examples
1660 *****************************************************************/
1661
1662 /* esl_dsqdata_example2
1663 * Example of creating a new dsqdata database from a sequence file.
1664 */
1665 #ifdef eslDSQDATA_EXAMPLE2
1666 #include "easel.h"
1667 #include "esl_alphabet.h"
1668 #include "esl_dsqdata.h"
1669 #include "esl_getopts.h"
1670
1671 static ESL_OPTIONS options[] = {
1672 /* name type default env range toggles reqs incomp help docgroup*/
1673 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1674 { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use DNA alphabet", 0 },
1675 { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use RNA alphabet", 0 },
1676 { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "use protein alphabet", 0 },
1677 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1678 };
1679 static char usage[] = "[-options] <seqfile_in> <binary seqfile_out>";
1680 static char banner[] = "experimental: create binary database for esl_dsqdata";
1681
1682 int
main(int argc,char ** argv)1683 main(int argc, char **argv)
1684 {
1685 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
1686 ESL_ALPHABET *abc = NULL;
1687 char *infile = esl_opt_GetArg(go, 1);
1688 char *basename = esl_opt_GetArg(go, 2);
1689 int format = eslSQFILE_UNKNOWN;
1690 int alphatype = eslUNKNOWN;
1691 ESL_SQFILE *sqfp = NULL;
1692 char errbuf[eslERRBUFSIZE];
1693 int status;
1694
1695 status = esl_sqfile_Open(infile, format, NULL, &sqfp);
1696 if (status == eslENOTFOUND) esl_fatal("No such file.");
1697 else if (status == eslEFORMAT) esl_fatal("Format unrecognized.");
1698 else if (status != eslOK) esl_fatal("Open failed, code %d.", status);
1699
1700 if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA;
1701 else if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA;
1702 else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
1703 else {
1704 status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
1705 if (status == eslENOALPHABET) esl_fatal("Couldn't guess alphabet from first sequence in %s", infile);
1706 else if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s)\n%s\n", infile, sqfp->get_error(sqfp));
1707 else if (status == eslENODATA) esl_fatal("Sequence file %s contains no data?", infile);
1708 else if (status != eslOK) esl_fatal("Failed to guess alphabet (error code %d)\n", status);
1709 }
1710 abc = esl_alphabet_Create(alphatype);
1711 esl_sqfile_SetDigital(sqfp, abc);
1712
1713 status = esl_dsqdata_Write(sqfp, basename, errbuf);
1714 if (status == eslEWRITE) esl_fatal("Failed to open dsqdata output files:\n %s", errbuf);
1715 else if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s)\n %s", infile, sqfp->get_error(sqfp));
1716 else if (status != eslOK) esl_fatal("Unexpected error while creating dsqdata file (code %d)\n", status);
1717
1718 esl_sqfile_Close(sqfp);
1719 esl_alphabet_Destroy(abc);
1720 esl_getopts_Destroy(go);
1721 return eslOK;
1722 }
1723 #endif /*eslDSQDATA_EXAMPLE2*/
1724
1725
1726 /* esl_dsqdata_example
1727 * Example of opening and reading a dsqdata database.
1728 */
1729 #ifdef eslDSQDATA_EXAMPLE
1730 #include "easel.h"
1731 #include "esl_alphabet.h"
1732 #include "esl_dsqdata.h"
1733 #include "esl_getopts.h"
1734 #include "esl_vectorops.h"
1735
1736 static ESL_OPTIONS options[] = {
1737 /* name type default env range toggles reqs incomp help docgroup*/
1738 { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
1739 { "-c", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "report summary of chunk contents", 0 },
1740 { "-r", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "report summary of residue counts", 0 },
1741 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1742 };
1743
1744 static char usage[] = "[-options] <basename>";
1745 static char banner[] = "example of using ESL_DSQDATA to read sequence db";
1746
1747 int
main(int argc,char ** argv)1748 main(int argc, char **argv)
1749 {
1750 ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1751 ESL_ALPHABET *abc = NULL;
1752 char *basename = esl_opt_GetArg(go, 1);
1753 int do_summary = esl_opt_GetBoolean(go, "-c");
1754 int do_resct = esl_opt_GetBoolean(go, "-r");
1755 int ncpu = 1;
1756 ESL_DSQDATA *dd = NULL;
1757 ESL_DSQDATA_CHUNK *chu = NULL;
1758 int nchunk = 0;
1759 int i;
1760 int64_t pos;
1761 int64_t ct[128], total;
1762 int x;
1763 int status;
1764
1765 status = esl_dsqdata_Open(&abc, basename, ncpu, &dd);
1766 if (status == eslENOTFOUND) esl_fatal("Failed to open dsqdata files:\n %s", dd->errbuf);
1767 else if (status == eslEFORMAT) esl_fatal("Format problem in dsqdata files:\n %s", dd->errbuf);
1768 else if (status != eslOK) esl_fatal("Unexpected error in opening dsqdata (code %d)", status);
1769
1770 for (x = 0; x < 127; x++) ct[x] = 0;
1771
1772 if (do_summary) esl_dataheader(stdout, 8, "idx", 4, "nseq", 8, "nres", 7, "npacket", 0);
1773
1774 while ((status = esl_dsqdata_Read(dd, &chu)) == eslOK)
1775 {
1776 nchunk++;
1777
1778 if (do_resct)
1779 for (i = 0; i < chu->N; i++)
1780 for (pos = 1; pos <= chu->L[i]; pos++)
1781 ct[ chu->dsq[i][pos] ]++;
1782
1783 if (do_summary)
1784 printf("%-8d %4d %8" PRId64 " %7d\n", nchunk, chu->N, esl_vec_LSum(chu->L, chu->N), chu->pn);
1785
1786 esl_dsqdata_Recycle(dd, chu);
1787 }
1788 if (status != eslEOF) esl_fatal("unexpected error %d in reading dsqdata", status);
1789
1790 if (do_resct)
1791 {
1792 total = 0;
1793 for (x = 0; x < abc->Kp; x++)
1794 {
1795 printf("%c %" PRId64 "\n", abc->sym[x], ct[x]);
1796 total += ct[x];
1797 }
1798 printf("Total = %" PRId64 "\n", total);
1799 }
1800
1801 esl_alphabet_Destroy(abc);
1802 esl_dsqdata_Close(dd);
1803 esl_getopts_Destroy(go);
1804 return 0;
1805 }
1806 #endif /*eslDSQDATA_EXAMPLE*/
1807
1808
1809
1810