1 /* esl_dsqdata : faster sequence input
2  *
3  * Implements a predigitized binary file format for biological
4  * sequences. Sequence data are packed bitwise into 32-bit packets,
5  * where each packet contains either six 5-bit residues or fifteen
6  * 2-bit residues, plus two control bits.  Input is asynchronous,
7  * using POSIX threads, with a "loader" thread doing disk reads and
8  * "unpacker" thread(s) preparing chunks of sequences for
9  * analysis. Sequence data and metadata are stored in separate files,
10  * which may allow further input acceleration by deferring
11  * metadata accesses until they're actually needed.
12  *
13  * All thread synchronization is handled internally. A caller does not
14  * need to worry about the internal parallelism; it just calls
15  * <esl_dsqdata_Read()>. Caller can create multiple threads, each
16  * calling <esl_dsqdata_Read()>.
17  *
18  * A DSQDATA database <basename> is stored in four files:
19  *    - basename       : a human-readable stub
20  *    - basename.dsqi  : index file, enabling random access & parallel chunking
21  *    - basename.dsqm  : metadata including names, accessions, descs, taxonomy
22  *    - basename.dsqs  : sequences, in a packed binary format
23  *
24  * Contents:
25  *   1. ESL_DSQDATA: reading dsqdata format
26  *   2. Creating dsqdata format from a sequence file
27  *   3. ESL_DSQDATA_CHUNK, a chunk of input sequence data
28  *   4. Loader and unpacker, the input threads
29  *   5. Packing sequences and unpacking chunks
30  *   6. Notes and references
31  *   7. Unit tests
32  *   8. Test driver
33  *   9. Examples
34  */
35 #include "esl_config.h"
36 
37 #include <stdio.h>
38 #include <string.h>
39 #include <stdint.h>
40 #include <pthread.h>
41 
42 #include "easel.h"
43 #include "esl_alphabet.h"
44 #include "esl_random.h"
45 #include "esl_sq.h"
46 #include "esl_sqio.h"
47 
48 #include "esl_dsqdata.h"
49 
50 static ESL_DSQDATA_CHUNK *dsqdata_chunk_Create (ESL_DSQDATA *dd);
51 static void               dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK *chu);
52 
53 static void *dsqdata_loader_thread  (void *p);
54 static void *dsqdata_unpacker_thread(void *p);
55 
56 static int   dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK *chu, int do_pack5);
57 static int   dsqdata_unpack5(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P);
58 static int   dsqdata_unpack2(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P);
59 static int   dsqdata_pack5  (ESL_DSQ *dsq, int L, uint32_t *psq, int *ret_P);
60 static int   dsqdata_pack2  (ESL_DSQ *dsq, int L, uint32_t *psq, int *ret_P);
61 
62 
63 /* Embedded magic numbers allow us to validate the correct binary
64  * format, with version (if needed in the future), and to detect
65  * byteswapping.
66  */
67 static uint32_t eslDSQDATA_MAGIC_V1     = 0xc4d3d1b1; // "dsq1" + 0x80808080
68 static uint32_t eslDSQDATA_MAGIC_V1SWAP = 0xb1d1d3c4; //  ... as above, but byteswapped.
69 
70 /*****************************************************************
71  *# 1. <ESL_DSQDATA>: reading dsqdata format
72  *****************************************************************/
73 
74 /* Function:  esl_dsqdata_Open()
75  * Synopsis:  Open a digital sequence database for reading
76  * Incept:    SRE, Wed Jan 20 09:50:00 2016 [Amtrak 2150, NYP-BOS]
77  *
78  * Purpose:   Open dsqdata database <basename> for reading.  The file
79  *            <basename> is a stub describing the database. The bulk
80  *            of the data are in three accompanying binary files: the
81  *            index file <basename>.dsqi, the metadata file
82  *            <basename>.dsqm, and the sequence file <basename>.dsqs.
83  *
84  *            <nconsumers> is an upper bound on the number of threads
85  *            in which the caller plans to be calling
86  *            <esl_dsqdata_Read()> -- or, more precisely, the maximum
87  *            number of data chunks that the caller could be working
88  *            on at any given instant.  This is a hint, not a
89  *            commitment. The dsqdata loader uses it to determine the
90  *            maximum number of data chunks that can be in play at
91  *            once (including chunks it is juggling internally, plus
92  *            if all the caller's reader threads are busy on
93  *            theirs). If <nconsumers> is set too small, the loader
94  *            may stall waiting for chunks to come back for recycling.
95  *
96  *            Reading digital sequence data requires a digital
97  *            alphabet.  You can either provide one (in which case we
98  *            validate that it matches the alphabet used by the
99  *            dsqdata) or, as a convenience, <esl_dsqdata_Open()> can
100  *            create one for you. Either way, you pass a pointer to an
101  *            <ESL_ALPHABET> structure <abc>, in <byp_abc>.  <byp_abc>
102  *            uses a partial Easel "bypass" idiom: if <*byp_abc> is
103  *            NULL, we allocate and return a new alphabet; if
104  *            <*byp_abc> is a ptr to an existing alphabet, we use it
105  *            for validation. That is, you have two choices:
106  *
107  *            ```
108  *                ESL_ALPHABET *abc = NULL;
109  *                esl_dsqdata_Open(&abc, basename...)
110  *                // <abc> is now the alphabet of <basename>;
111  *                // now you're responsible for Destroy'ing it
112  *            ```
113  *
114  *            or:
115  *
116  *            ```
117  *                ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO);
118  *                status = esl_dsqdata_Open(&abc, basename);
119  *                // if status == eslEINCOMPAT, alphabet in basename
120  *                // doesn't match caller's expectation
121  *            ```
122  *
123  * Args:      byp_abc    : expected or created alphabet; pass &abc, abc=NULL or abc=expected alphabet
124  *            basename   : data are in files <basename> and <basename.dsq[ism]>
125  *            nconsumers : upper bound on number of consumer threads caller is going to Read() with
126  *            ret_dd     : RETURN : the new ESL_DSQDATA object.
127  *
128  * Returns:   <eslOK> on success.
129  *
130  *            <eslENOTFOUND> if one or more of the expected datafiles
131  *            aren't there or can't be opened.
132  *
133  *            <eslEFORMAT> if something looks wrong in parsing file
134  *            formats.  Includes problems in headers, and also the
135  *            case where caller provides a digital alphabet in
136  *            <*byp_abc> and it doesn't match the database's alphabet.
137  *
138  *            On any normal error, <*ret_dd> is still returned, but in
139  *            an error state, and <dd->errbuf> is a user-directed
140  *            error message that the caller can relay to the user. Other
141  *            than the <errbuf>, the rest of the contents are undefined.
142  *
143  *            Caller is responsible for destroying <*byp_abc>.
144  *
145  * Throws:    <eslEMEM> on allocation error.
146  *            <eslESYS> on system call failure.
147  *            <eslEUNIMPLEMENTED> if data are byteswapped
148  *               TODO: handle byteswapping
149  *
150  *            On any thrown exception, <*ret_dd> is returned NULL.
151  *
152  *            On <eslESYS> exceptions, some thread resources may
153  *            not be fully freed, leading to some memory leakage.
154  */
155 int
esl_dsqdata_Open(ESL_ALPHABET ** byp_abc,char * basename,int nconsumers,ESL_DSQDATA ** ret_dd)156 esl_dsqdata_Open(ESL_ALPHABET **byp_abc, char *basename, int nconsumers, ESL_DSQDATA **ret_dd)
157 {
158   ESL_DSQDATA *dd        = NULL;
159   int          bufsize   = 4096;
160   uint32_t     magic     = 0;
161   uint32_t     tag       = 0;
162   uint32_t     alphatype = eslUNKNOWN;
163   char        *p;                       // used for strtok() parsing of fields on a line
164   char         buf[4096];
165   int          u;
166   int          status;
167 
168   ESL_DASSERT1(( nconsumers > 0   ));
169   ESL_DASSERT1(( byp_abc  != NULL ));  // either *byp_abc == NULL or *byp_abc = the caller's expected alphabet.
170 
171   ESL_ALLOC(dd, sizeof(ESL_DSQDATA));
172   dd->stubfp          = NULL;
173   dd->ifp             = NULL;
174   dd->sfp             = NULL;
175   dd->mfp             = NULL;
176   dd->abc_r           = *byp_abc;        // This may be NULL; if so, we create it later.
177 
178   dd->magic           = 0;
179   dd->uniquetag       = 0;
180   dd->flags           = 0;
181   dd->max_namelen     = 0;
182   dd->max_acclen      = 0;
183   dd->max_desclen     = 0;
184   dd->max_seqlen      = 0;
185   dd->nseq            = 0;
186   dd->nres            = 0;
187 
188   dd->chunk_maxseq    = eslDSQDATA_CHUNK_MAXSEQ;    // someday we may want to allow tuning these
189   dd->chunk_maxpacket = eslDSQDATA_CHUNK_MAXPACKET;
190   dd->do_byteswap     = FALSE;
191   dd->pack5           = FALSE;
192 
193   dd->nconsumers      = nconsumers;
194   dd->n_unpackers     = eslDSQDATA_UNPACKERS;      // we'll want to allow tuning this too
195   dd->errbuf[0]       = '\0';
196 
197   /* Open the four files.
198    */
199   ESL_ALLOC( dd->basename, sizeof(char) * (strlen(basename) + 6)); // +5 for .dsqx; +1 for \0
200   if ( sprintf(dd->basename, "%s.dsqi", basename) <= 0)   ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
201   if (( dd->ifp = fopen(dd->basename, "rb"))   == NULL)   ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open index file %s\n", dd->basename);
202 
203   if ( sprintf(dd->basename, "%s.dsqm", basename) <= 0)   ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
204   if (( dd->mfp = fopen(dd->basename, "rb"))   == NULL)   ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open metadata file %s\n", dd->basename);
205 
206   if ( sprintf(dd->basename, "%s.dsqs", basename) <= 0)   ESL_XEXCEPTION_SYS(eslESYS, "sprintf() failure");
207   if (( dd->sfp = fopen(dd->basename, "rb"))   == NULL)   ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open sequence file %s\n", dd->basename);
208 
209   strcpy(dd->basename, basename);
210   if (( dd->stubfp = fopen(dd->basename, "r")) == NULL)   ESL_XFAIL(eslENOTFOUND, dd->errbuf, "Failed to find or open stub file %s\n", dd->basename);
211 
212   /* The stub file is unparsed, intended to be human readable, with one exception:
213    * The first line contains the unique tag that we use to validate linkage of the 4 files.
214    * The format of that first line is:
215    *     Easel dsqdata v123 x0000000000
216    */
217   if ( fgets(buf, bufsize, dd->stubfp) == NULL)           ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file is empty - no tag line found");
218   if (( p = strtok(buf,  " \t\n\r"))   == NULL)           ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: tag line has no data");
219   if (  strcmp(p, "Easel") != 0)                          ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
220   if (( p = strtok(NULL, " \t\n\r"))   == NULL)           ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
221   if (  strcmp(p, "dsqdata") != 0)                        ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
222   if (( p = strtok(NULL, " \t\n\r"))   == NULL)           ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
223   if ( *p != 'v')                                         ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: no v on version");
224   if ( ! esl_str_IsInteger(p+1))                          ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file had bad format: no version number");
225   // version number is currently unused: there's only 1
226   if (( p = strtok(NULL, " \t\n\r"))   == NULL)           ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format in tag line");
227   if ( *p != 'x')                                         ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file has bad format: no x on tag");
228   if ( ! esl_str_IsInteger(p+1))                          ESL_XFAIL(eslEFORMAT, dd->errbuf, "stub file had bad format: no integer tag");
229   dd->uniquetag = strtoul(p+1, NULL, 10);
230 
231   /* Index file has a header of 7 uint32's, 3 uint64's */
232   if ( fread(&(dd->magic),       sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has no header - is empty?");
233   if ( fread(&tag,               sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no tag");
234   if ( fread(&alphatype,         sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no alphatype");
235   if ( fread(&(dd->flags),       sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no flags");
236   if ( fread(&(dd->max_namelen), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max name len");
237   if ( fread(&(dd->max_acclen),  sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max accession len");
238   if ( fread(&(dd->max_desclen), sizeof(uint32_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max description len");
239 
240   if ( fread(&(dd->max_seqlen),  sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no max seq len");
241   if ( fread(&(dd->nseq),        sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no nseq");
242   if ( fread(&(dd->nres),        sizeof(uint64_t), 1, dd->ifp) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file header truncated, no nres");
243 
244   /* Check the magic and the tag */
245   if      (tag != dd->uniquetag)                 ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has bad tag, doesn't go with stub file");
246   // Eventually we would set dd->do_byteswap = TRUE; below.
247   if      (dd->magic == eslDSQDATA_MAGIC_V1SWAP) ESL_XEXCEPTION(eslEUNIMPLEMENTED, "dsqdata cannot yet read data in different byte orders");
248   else if (dd->magic != eslDSQDATA_MAGIC_V1)     ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has bad magic");
249 
250   /* Either validate, or create the alphabet */
251   if  (dd->abc_r)
252     {
253       if (alphatype != dd->abc_r->type)
254 	ESL_XFAIL(eslEFORMAT, dd->errbuf, "data files use %s alphabet; expected %s alphabet",
255 		  esl_abc_DecodeType(alphatype),
256 		  esl_abc_DecodeType(dd->abc_r->type));
257     }
258   else
259     {
260       if ( esl_abc_ValidateType(alphatype)             != eslOK) ESL_XFAIL(eslEFORMAT, dd->errbuf, "index file has invalid alphabet type %d", alphatype);
261       if (( dd->abc_r = esl_alphabet_Create(alphatype)) == NULL) ESL_XEXCEPTION(eslEMEM, "alphabet creation failed");
262     }
263 
264   /* If it's protein, flip the switch to expect all 5-bit packing */
265   if (dd->abc_r->type == eslAMINO) dd->pack5 = TRUE;
266 
267   /* Metadata file has a header of 2 uint32's, magic and uniquetag */
268   if (( fread(&magic, sizeof(uint32_t), 1, dd->mfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has no header - is empty?");
269   if (( fread(&tag,   sizeof(uint32_t), 1, dd->mfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file header truncated - no tag?");
270   if ( magic != dd->magic)                                 ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has bad magic");
271   if ( tag   != dd->uniquetag)                             ESL_XFAIL(eslEFORMAT, dd->errbuf, "metadata file has bad tag, doesn't match stub");
272 
273   /* Sequence file also has a header of 2 uint32's, magic and uniquetag */
274   if (( fread(&magic, sizeof(uint32_t), 1, dd->sfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has no header - is empty?");
275   if (( fread(&tag,   sizeof(uint32_t), 1, dd->sfp)) != 1) ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file header truncated - no tag?");
276   if ( magic != dd->magic)                                 ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has bad magic");
277   if ( tag   != dd->uniquetag)                             ESL_XFAIL(eslEFORMAT, dd->errbuf, "sequence file has bad tag, doesn't match stub");
278 
279   /* unpacker inboxes and outboxes */
280   for (u = 0; u < dd->n_unpackers; u++)
281     {
282       dd->inbox[u]  = NULL;
283       if ( pthread_mutex_init(&(dd->inbox_mutex[u]),   NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
284       if ( pthread_cond_init (&(dd->inbox_cv[u]),      NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
285       dd->inbox_eod[u] = FALSE;
286 
287       dd->outbox[u]  = NULL;
288       if ( pthread_mutex_init(&(dd->outbox_mutex[u]),  NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
289       if ( pthread_cond_init (&(dd->outbox_cv[u]),     NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
290       dd->outbox_eod[u] = FALSE;
291     }
292 
293   /* consumers share access to <nchunk> counter */
294   dd->nchunk = 0;
295   if ( pthread_mutex_init(&dd->nchunk_mutex,           NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
296 
297   /* chunk recycling stack */
298   dd->recycling = NULL;
299   if ( pthread_mutex_init(&dd->recycling_mutex,        NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed");
300   if ( pthread_cond_init(&dd->recycling_cv,            NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed");
301 
302   /* Create the "initialization is complete" signaling mechanism
303    * before creating any threads, and lock the initialization mutex
304    * while we're creating them. The issue here is that unpackers
305    * identify their [u] index by comparing their self thread id to the
306    * master list of thread ids, so make sure that master list actually
307    * exists.
308    */
309   dd->go = FALSE;
310   if ( pthread_mutex_init(&dd->go_mutex, NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_init() failed on go_mutex");
311   if ( pthread_cond_init( &dd->go_cv,    NULL) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_init() failed on go_cv");
312   if ( pthread_mutex_lock(&dd->go_mutex)       != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock() failed on go_mutex");
313 
314   /* Create the loader and the unpackers.
315    * They will wait to start until we signal through go->cv.
316    */
317   if ( pthread_create(&dd->loader_t,   NULL, dsqdata_loader_thread,   dd) != 0) ESL_XEXCEPTION(eslESYS, "pthread_create() failed");
318   for (u = 0; u < dd->n_unpackers; u++)
319     if ( pthread_create(&(dd->unpacker_t[u]), NULL, dsqdata_unpacker_thread, dd) != 0) ESL_XEXCEPTION(eslESYS, "pthread_create() failed");
320 
321   /* All threads started, initialization complete - broadcast "go" signal to all threads */
322   dd->go = TRUE;
323   if ( pthread_mutex_unlock(&dd->go_mutex)  != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_unlock() failed on go_mutex");
324   if ( pthread_cond_broadcast(&dd->go_cv)   != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_broadcast() failed on go_cv");
325 
326   *ret_dd  = dd;
327   *byp_abc = dd->abc_r;     // If caller provided <*byp_abc> this is a no-op, because we set abc_r = *byp_abc.
328   return eslOK;             //  .. otherwise we're passing the created <abc> back to caller, caller's
329                             //     responsibility, we just keep the reference to it.
330  ERROR:
331   if (status == eslENOTFOUND || status == eslEFORMAT || status == eslEINCOMPAT)
332     {    /* on normal errors, we return <dd> with its <errbuf>, don't change *byp_abc */
333       *ret_dd  = dd;
334       if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
335       return status;
336     }
337   else if (status != eslESYS)
338     {   /* on most exceptions, we free <dd>, return it NULL, don't change *byp_abc */
339       esl_dsqdata_Close(dd);
340       *ret_dd = NULL;
341       if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
342       return status;
343     }
344   else
345     { /* on eslESYS exceptions - pthread initializations failing - we can't assume we can _Close() correctly. */
346       *ret_dd = NULL;
347       if (*byp_abc == NULL && dd->abc_r) esl_alphabet_Destroy(dd->abc_r);
348       return status;
349     }
350 }
351 
352 
353 
354 /* Function:  esl_dsqdata_Read()
355  * Synopsis:  Read next chunk of sequence data.
356  * Incept:    SRE, Thu Jan 21 11:21:38 2016 [Harvard]
357  *
358  * Purpose:   Read the next chunk from <dd>, return a pointer to it in
359  *            <*ret_chu>, and return <eslOK>. When data are exhausted,
360  *            return <eslEOF>, and <*ret_chu> is <NULL>.
361  *
362  *            Threadsafe. All thread operations in the dsqdata reader
363  *            are handled internally. Caller does not have to worry
364  *            about wrapping this in a mutex. Multiple caller threads
365  *            can call <esl_dsqdata_Read()>.
366  *
367  *            All chunk allocation and deallocation is handled
368  *            internally. After using a chunk, caller gives it back to
369  *            the reader using <esl_dsqdata_Recycle()>.
370  *
371  * Args:      dd      : open dsqdata object to read from
372  *            ret_chu : RETURN : next chunk of seq data
373  *
374  * Returns:   <eslOK> on success. <*ret_chu> is a chunk of seq data.
375  *            Caller must call <esl_dsqdata_Recycle()> on each chunk
376  *            that it Read()'s.
377  *
378  *            <eslEOF> if we've reached the end of the input file;
379  *            <*ret_chu> is NULL.
380  *
381  * Throws:    <eslESYS> if a pthread call fails.
382  *            Caller should treat this as disastrous. Without correctly
383  *            working pthread calls, we cannot read, and we may not be able
384  *            to correctly clean up and close the reader. Caller should
385  *            treat <dd> as toxic, clean up whatever else it may need to,
386  *            and exit.
387  */
388 int
esl_dsqdata_Read(ESL_DSQDATA * dd,ESL_DSQDATA_CHUNK ** ret_chu)389 esl_dsqdata_Read(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK **ret_chu)
390 {
391   ESL_DSQDATA_CHUNK *chu    = NULL;
392   int                u;
393 
394   /* First, determine which slot the next chunk is in, using the consumer-shared <nchunk> counter */
395   if ( pthread_mutex_lock(&dd->nchunk_mutex) != 0) ESL_EXCEPTION(eslESYS, "failed to lock reader mutex");
396   u = (int) (dd->nchunk % dd->n_unpackers);
397 
398   /* Acquire a lock that outbox, wait for it to be in full or EOD state */
399   if ( pthread_mutex_lock(&(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to lock outbox[u] mutex");
400   while (! dd->outbox_eod[u]  && dd->outbox[u] == NULL)  {
401     if ( pthread_cond_wait(&(dd->outbox_cv[u]), &(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to wait on outbox[u] signal");
402   }
403 
404   /* Get the chunk from outbox. */
405   chu           = dd->outbox[u];
406   dd->outbox[u] = NULL;
407   if (chu) dd->nchunk++;     // chu==NULL if we're EOD. (and eod flag is up too, but we already know that by getting here)
408 
409   /* Release the outbox lock and signal back to unpacker (skip signalling if we're EOD; unpacker[u] is done) */
410   if (        pthread_mutex_unlock(&(dd->outbox_mutex[u])) != 0) ESL_EXCEPTION(eslESYS, "failed to unlock outbox[u] mutex");
411   if ( chu && pthread_cond_signal (&(dd->outbox_cv[u]))    != 0) ESL_EXCEPTION(eslESYS, "failed to signal outbox[u] is empty");
412 
413   /* Release the reader lock that protects dd->nchunk counter */
414   if ( pthread_mutex_unlock(&dd->nchunk_mutex) != 0) ESL_EXCEPTION(eslESYS, "failed to unlock reader mutex");
415 
416   *ret_chu = chu;
417   return (chu ? eslOK : eslEOF);
418 }
419 
420 
421 /* Function:  esl_dsqdata_Recycle()
422  * Synopsis:  Give a chunk back to the reader.
423  * Incept:    SRE, Thu Feb 11 19:24:33 2016
424  *
425  * Purpose:   Recycle chunk <chu> back to the reader <dd>.  The reader
426  *            is responsible for all allocation and deallocation of
427  *            chunks. The reader will either reuse the chunk's memory
428  *            if more chunks remain to be read, or it will free it.
429  *
430  * Args:      dd  : dsqdata reader
431  *            chu : chunk to recycle
432  *
433  * Returns:   <eslOK> on success.
434  *
435  * Throws:    <eslESYS> on a pthread call failure. Caller should regard
436  *            such an error as disastrous; if pthread calls are
437  *            failing, you cannot depend on the reader to be working
438  *            at all, and you should treat <dd> as toxic. Do whatever
439  *            desperate things you need to do and exit.
440  */
441 int
esl_dsqdata_Recycle(ESL_DSQDATA * dd,ESL_DSQDATA_CHUNK * chu)442 esl_dsqdata_Recycle(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK *chu)
443 {
444   if (chu)
445     {
446       if ( pthread_mutex_lock(&dd->recycling_mutex)   != 0) ESL_EXCEPTION(eslESYS, "pthread mutex lock failed");
447       chu->nxt      = dd->recycling;       // Push chunk onto head of recycling stack
448       dd->recycling = chu;
449       if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_EXCEPTION(eslESYS, "pthread mutex unlock failed");
450       if ( pthread_cond_signal(&dd->recycling_cv)     != 0) ESL_EXCEPTION(eslESYS, "pthread cond signal failed");  // loader may wait for this signal
451     }
452   return eslOK;
453 }
454 
455 
456 
457 /* Function:  esl_dsqdata_Close()
458  * Synopsis:  Close a dsqdata reader.
459  * Incept:    SRE, Thu Feb 11 19:32:54 2016
460  *
461  * Purpose:   Close a dsqdata reader.
462  *
463  * Returns:   <eslOK> on success.
464 
465  * Throws:    <eslESYS> on a system call failure, including pthread
466  *            calls and fclose(). Caller should regard such a failure
467  *            as disastrous: treat <dd> as toxic and exit as soon as
468  *            possible without making any other system calls, if possible.
469  *
470  * Note: Normally Easel would call _Close() not only on a correctly
471  *            opened object (after the Open returns) but also from
472  *            within the Open call to clean up on failure. This relies
473  *            on being able to tell which parts of an incomplete
474  *            object have been initialized; this is why Easel sets
475  *            ptrs to NULL before allocating them, for example.  POSIX
476  *            threads doesn't seem to give us a way to check whether a
477  *            mutex, condition variable, or thread has been
478  *            initialized; and destroying an uninitialized mutex or CV
479  *            results in undefined behavior. We could include a
480  *            boolean flag for each pthreads component -- or one for
481  *            all of them -- but instead, we simply say that a failure
482  *            of pthreads initialization in the Open function is so
483  *            catastrophic and unexpected that we exit that function
484  *            without cleaning up the incomplete ESL_DSQDATA
485  *            structure.  We expect that the caller is simply going to
486  *            have to quit anyway, so we don't need to worry about
487  *            memory leakage.
488  */
489 int
esl_dsqdata_Close(ESL_DSQDATA * dd)490 esl_dsqdata_Close(ESL_DSQDATA *dd)
491 {
492   int u;
493 
494   if (dd)
495     {
496       /* out of abundance of caution - wait for threads to join before breaking down <dd> */
497       if ( pthread_join(dd->loader_t,   NULL)      != 0)  ESL_EXCEPTION(eslESYS, "pthread join failed");
498       for (u = 0; u < dd->n_unpackers; u++)
499 	if ( pthread_join(dd->unpacker_t[u], NULL) != 0)  ESL_EXCEPTION(eslESYS, "pthread join failed");
500 
501       if (dd->basename) free(dd->basename);
502       if (dd->stubfp) { if ( fclose(dd->stubfp) != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
503       if (dd->ifp)    { if ( fclose(dd->ifp)    != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
504       if (dd->sfp)    { if ( fclose(dd->sfp)    != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
505       if (dd->mfp)    { if ( fclose(dd->mfp)    != 0) ESL_EXCEPTION(eslESYS, "fclose failed"); }
506 
507       for (u = 0; u < dd->n_unpackers; u++)
508 	{
509 	  if ( pthread_mutex_destroy(&(dd->inbox_mutex[u]))  != 0)  ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
510 	  if ( pthread_cond_destroy(&(dd->inbox_cv[u]))      != 0)  ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
511 	  if ( pthread_mutex_destroy(&(dd->outbox_mutex[u])) != 0)  ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
512 	  if ( pthread_cond_destroy(&(dd->outbox_cv[u]))     != 0)  ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
513 	}
514       if ( pthread_mutex_destroy(&dd->nchunk_mutex)          != 0)  ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
515       if ( pthread_mutex_destroy(&dd->recycling_mutex)       != 0)  ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
516       if ( pthread_cond_destroy(&dd->recycling_cv)           != 0)  ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
517       if ( pthread_mutex_destroy(&dd->go_mutex)              != 0)  ESL_EXCEPTION(eslESYS, "pthread mutex destroy failed");
518       if ( pthread_cond_destroy(&dd->go_cv)                  != 0)  ESL_EXCEPTION(eslESYS, "pthread cond destroy failed");
519 
520       /* Loader thread is responsible for freeing all chunks it created, even on error. */
521 #if (eslDEBUGLEVEL >= 1)
522       for (u = 0; u < dd->n_unpackers; u++) {
523 	assert( dd->inbox[u]  == NULL );
524 	assert( dd->outbox[u] == NULL );
525       }
526       assert(dd->recycling == NULL );
527 #endif
528       free(dd);
529     }
530   return eslOK;
531 }
532 
533 
534 /*****************************************************************
535  *# 2. Creating dsqdata format from a sequence file
536  *****************************************************************/
537 
538 /* Function:  esl_dsqdata_Write()
539  * Synopsis:  Create a dsqdata database
540  * Incept:    SRE, Sat Feb 13 07:33:30 2016 [AGBT 2016, Orlando]
541  *
542  * Purpose:   Caller has just opened <sqfp>, in digital mode.
543  *            Create a dsqdata database <basename> from the sequence
544  *            data in <sqfp>.
545  *
546  *            <sqfp> must be protein, DNA, or RNA sequence data.  It
547  *            must be rewindable (i.e. a file), because we have to
548  *            read it twice. It must be newly opened (i.e. positioned
549  *            at the start).
550  *
551  * Args:      sqfp     - newly opened sequence data file
552  *            basename - base name of dsqdata files to create
553  *            errbuf   - user-directed error message on normal errors
554  *
555  * Returns:   <eslOK> on success.
556  *
557  *            <eslEWRITE> if an output file can't be opened. <errbuf>
558  *            contains user-directed error message.
559  *
560  *            <eslEFORMAT> if a parse error is encountered while
561  *            reading <sqfp>.
562  *
563  *
564  * Throws:    <eslESYS>   A system call failed, such as fwrite().
565  *            <eslEINVAL> Sequence handle <sqfp> isn't digital and rewindable.
566  *            <eslEMEM>   Allocation failure
567  *            <eslEUNIMPLEMENTED> Sequence is too long to be encoded.
568  *                               (TODO: chromosome-scale DNA sequences)
569  */
570 int
esl_dsqdata_Write(ESL_SQFILE * sqfp,char * basename,char * errbuf)571 esl_dsqdata_Write(ESL_SQFILE *sqfp, char *basename, char *errbuf)
572 {
573   ESL_RANDOMNESS *rng         = NULL;
574   ESL_SQ         *sq          = NULL;
575   FILE           *stubfp      = NULL;
576   FILE           *ifp         = NULL;
577   FILE           *mfp         = NULL;
578   FILE           *sfp         = NULL;
579   char           *outfile     = NULL;
580   uint32_t        magic       = eslDSQDATA_MAGIC_V1;
581   uint32_t        uniquetag;
582   uint32_t        alphatype;
583   uint32_t        flags       = 0;
584   uint32_t        max_namelen = 0;
585   uint32_t        max_acclen  = 0;
586   uint32_t        max_desclen = 0;
587   uint64_t        max_seqlen  = 0;
588   uint64_t        nseq        = 0;
589   uint64_t        nres        = 0;
590   int             do_pack5    = FALSE;
591   uint32_t       *psq;
592   ESL_DSQDATA_RECORD idx;                    // one index record to write
593   int             plen;
594   int64_t         spos        = 0;
595   int64_t         mpos        = 0;
596   int             n;
597   int             status;
598 
599   if (! esl_sqfile_IsRewindable(sqfp))  ESL_EXCEPTION(eslEINVAL, "sqfp must be rewindable (e.g. an open file)");
600   if (! sqfp->abc)                      ESL_EXCEPTION(eslEINVAL, "sqfp must be digital");
601   // Could also check that it's positioned at the start.
602   if ( (sq = esl_sq_CreateDigital(sqfp->abc)) == NULL) { status = eslEMEM; goto ERROR; }
603 
604   /* First pass over the sequence file, to get statistics.
605    * Read it now, before opening any files, in case we find any parse errors.
606    */
607   while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
608     {
609       if (sq->n >= 6 * eslDSQDATA_CHUNK_MAXPACKET)  // guaranteed limit
610 	ESL_EXCEPTION(eslEUNIMPLEMENTED, "dsqdata cannot currently deal with large sequences");
611 
612       nseq++;
613       nres += sq->n;
614       if (sq->n > max_seqlen) max_seqlen = sq->n;
615       n = strlen(sq->name); if (n > max_namelen) max_namelen = n;
616       n = strlen(sq->acc);  if (n > max_acclen)  max_acclen  = n;
617       n = strlen(sq->desc); if (n > max_desclen) max_desclen = n;
618       esl_sq_Reuse(sq);
619     }
620   if      (status == eslEFORMAT) ESL_XFAIL(eslEFORMAT, errbuf, sqfp->get_error(sqfp));
621   else if (status != eslEOF)     return status;
622 
623   if ((status = esl_sqfile_Position(sqfp, 0)) != eslOK) return status;
624 
625 
626   if ((    rng = esl_randomness_Create(0) )        == NULL)  { status = eslEMEM; goto ERROR; }
627   uniquetag = esl_random_uint32(rng);
628   alphatype = sqfp->abc->type;
629 
630   if      (alphatype == eslAMINO)                      do_pack5 = TRUE;
631   else if (alphatype != eslDNA && alphatype != eslRNA) ESL_EXCEPTION(eslEINVAL, "alphabet must be protein or nucleic");
632 
633 
634   if (( status = esl_sprintf(&outfile, "%s.dsqi", basename)) != eslOK) goto ERROR;
635   if ((    ifp = fopen(outfile, "wb"))             == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata index file %s for writing", outfile);
636   sprintf(outfile, "%s.dsqm", basename);
637   if ((    mfp = fopen(outfile, "wb"))             == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata metadata file %s for writing", outfile);
638   sprintf(outfile, "%s.dsqs", basename);
639   if ((    sfp = fopen(outfile, "wb"))             == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata sequence file %s for writing", outfile);
640   if (( stubfp = fopen(basename, "w"))             == NULL)  ESL_XFAIL(eslEWRITE, errbuf, "failed to open dsqdata stub file %s for writing", basename);
641 
642 
643 
644 
645   /* Header: index file */
646   if (fwrite(&magic,       sizeof(uint32_t), 1, ifp) != 1 ||
647       fwrite(&uniquetag,   sizeof(uint32_t), 1, ifp) != 1 ||
648       fwrite(&alphatype,   sizeof(uint32_t), 1, ifp) != 1 ||
649       fwrite(&flags,       sizeof(uint32_t), 1, ifp) != 1 ||
650       fwrite(&max_namelen, sizeof(uint32_t), 1, ifp) != 1 ||
651       fwrite(&max_acclen,  sizeof(uint32_t), 1, ifp) != 1 ||
652       fwrite(&max_desclen, sizeof(uint32_t), 1, ifp) != 1 ||
653       fwrite(&max_seqlen,  sizeof(uint64_t), 1, ifp) != 1 ||
654       fwrite(&nseq,        sizeof(uint64_t), 1, ifp) != 1 ||
655       fwrite(&nres,        sizeof(uint64_t), 1, ifp) != 1)
656     ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, index file header");
657 
658   /* Header: metadata file */
659   if (fwrite(&magic,       sizeof(uint32_t), 1, mfp) != 1 ||
660       fwrite(&uniquetag,   sizeof(uint32_t), 1, mfp) != 1)
661     ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, metadata file header");
662 
663   /* Header: sequence file */
664   if (fwrite(&magic,       sizeof(uint32_t), 1, sfp) != 1 ||
665       fwrite(&uniquetag,   sizeof(uint32_t), 1, sfp) != 1)
666     ESL_XEXCEPTION_SYS(eslESYS, "fwrite() failed, metadata file header");
667 
668   /* Second pass: index, metadata, and sequence files */
669   while ((status = esl_sqio_Read(sqfp, sq)) == eslOK)
670     {
671       /* Packed sequence */
672       psq = (uint32_t *) sq->dsq;        // pack-in-place
673       ESL_DASSERT1(( sq->salloc >= 4 )); // required min space for pack-in-place
674       if (do_pack5) dsqdata_pack5(sq->dsq, sq->n, psq, &plen);
675       else          dsqdata_pack2(sq->dsq, sq->n, psq, &plen);
676       if ( fwrite(psq, sizeof(uint32_t), plen, sfp) != plen)
677 	ESL_XEXCEPTION(eslESYS, "fwrite() failed, packed seq");
678       spos += plen;
679 
680       /* Metadata */
681       n = strlen(sq->name);
682       if ( fwrite(sq->name, sizeof(char), n+1, mfp) != n+1)
683 	ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, name");
684       mpos += n+1;
685 
686       n = strlen(sq->acc);
687       if ( fwrite(sq->acc,  sizeof(char), n+1, mfp) != n+1)
688 	ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, accession");
689       mpos += n+1;
690 
691       n = strlen(sq->desc);
692       if ( fwrite(sq->desc, sizeof(char), n+1, mfp) != n+1)
693 	ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, description");
694       mpos += n+1;
695 
696       if ( fwrite( &(sq->tax_id), sizeof(int32_t), 1, mfp) != 1)
697 	ESL_XEXCEPTION(eslESYS, "fwrite () failed, metadata, taxonomy id");
698       mpos += sizeof(int32_t);
699 
700       /* Index file */
701       idx.psq_end      = spos-1;  // could be -1, on 1st seq, if 1st seq L=0.
702       idx.metadata_end = mpos-1;
703       if ( fwrite(&idx, sizeof(ESL_DSQDATA_RECORD), 1, ifp) != 1)
704 	ESL_XEXCEPTION(eslESYS, "fwrite () failed, index file");
705 
706       esl_sq_Reuse(sq);
707     }
708 
709   /* Stub file */
710   fprintf(stubfp, "Easel dsqdata v1 x%" PRIu32 "\n", uniquetag);
711   fprintf(stubfp, "\n");
712   fprintf(stubfp, "Original file:   %s\n",          sqfp->filename);
713   fprintf(stubfp, "Original format: %s\n",          esl_sqio_DecodeFormat(sqfp->format));
714   fprintf(stubfp, "Type:            %s\n",          esl_abc_DecodeType(sqfp->abc->type));
715   fprintf(stubfp, "Sequences:       %" PRIu64 "\n", nseq);
716   fprintf(stubfp, "Residues:        %" PRIu64 "\n", nres);
717 
718   esl_sq_Destroy(sq);
719   esl_randomness_Destroy(rng);
720   free(outfile);
721   fclose(stubfp);
722   fclose(ifp);
723   fclose(mfp);
724   fclose(sfp);
725   return eslOK;
726 
727  ERROR:
728   if (sq)      esl_sq_Destroy(sq);
729   if (rng)     esl_randomness_Destroy(rng);
730   if (outfile) free(outfile);
731   if (stubfp)  fclose(stubfp);
732   if (ifp)     fclose(ifp);
733   if (mfp)     fclose(mfp);
734   if (sfp)     fclose(sfp);
735   return status;
736 }
737 
738 
739 
740 /*****************************************************************
741  * 3. ESL_DSQDATA_CHUNK: a chunk of input sequence data
742  *****************************************************************/
743 
744 static ESL_DSQDATA_CHUNK *
dsqdata_chunk_Create(ESL_DSQDATA * dd)745 dsqdata_chunk_Create(ESL_DSQDATA *dd)
746 {
747   ESL_DSQDATA_CHUNK *chu = NULL;
748   int                U;               // max size of unpacked seq data, in bytes (smem allocation)
749   int                status;
750 
751   ESL_ALLOC(chu, sizeof(ESL_DSQDATA_CHUNK));
752   chu->i0       = 0;
753   chu->N        = 0;
754   chu->pn       = 0;
755   chu->dsq      = NULL;
756   chu->name     = NULL;
757   chu->acc      = NULL;
758   chu->desc     = NULL;
759   chu->taxid    = NULL;
760   chu->L        = NULL;
761   chu->metadata = NULL;
762   chu->smem     = NULL;
763   chu->nxt      = NULL;
764 
765   /* dsq, name, acc, desc are arrays of pointers into smem, metadata.
766    * taxid is cast to int, from the metadata.
767    * L is figured out by the unpacker.
768    * All of these are set by the unpacker.
769    */
770   ESL_ALLOC(chu->dsq,   dd->chunk_maxseq * sizeof(ESL_DSQ *));
771   ESL_ALLOC(chu->name,  dd->chunk_maxseq * sizeof(char *));
772   ESL_ALLOC(chu->acc,   dd->chunk_maxseq * sizeof(char *));
773   ESL_ALLOC(chu->desc,  dd->chunk_maxseq * sizeof(char *));
774   ESL_ALLOC(chu->taxid, dd->chunk_maxseq * sizeof(int));
775   ESL_ALLOC(chu->L,     dd->chunk_maxseq * sizeof(int64_t));
776 
777   /* On the <smem> allocation, and the <dsq> and <psq> pointers into it:
778    *
779    * <maxpacket> (in uint32's) sets the maximum single fread() size:
780    * one load of a new chunk of packed sequence, up to maxpacket*4
781    * bytes. <smem> needs to be able to hold both that and the fully
782    * unpacked sequence, because we unpack in place.  Each packet
783    * unpacks to at most 6 or 15 residues (5-bit or 2-bit packing) We
784    * don't pack sentinels, so the maximum unpacked size includes
785    * <maxseq>+1 sentinels... because we concat the digital seqs so
786    * that the trailing sentinel of seq i is the leading sentinel of
787    * seq i+1.
788    *
789    * The packed seq (max of P bytes) loads overlap with the unpacked
790    * data (max of U bytes):
791    *                   psq
792    *                   v[    P bytes    ]
793    * smem: 0........0........0..........0
794    *       ^[         U bytes           ]
795    *       ^dsq[0]  ^dsq[1]  ^dsq[2]
796    *
797    * and as long as we unpack psq left to right -- and as long as we
798    * read the last packet before we write the last unpacked residues
799    * to smem - we're guaranteed that the unpacking works without
800    * overwriting any unpacked data.
801    */
802   U  = (dd->pack5 ? 6 * dd->chunk_maxpacket : 15 * dd->chunk_maxpacket);
803   U += dd->chunk_maxseq + 1;
804   ESL_ALLOC(chu->smem, sizeof(ESL_DSQ) * U);
805   chu->psq = (uint32_t *) (chu->smem + U - 4*dd->chunk_maxpacket);
806 
807   /* We don't have any guarantees about the amount of metadata
808    * associated with the N sequences, so <metadata> has to be a
809    * reallocatable space. We make a lowball guess for the initial
810    * alloc, on the off chance that the metadata size is small (names
811    * only, no acc/desc): minimally, say 12 bytes of name, 3 \0's, and
812    * 4 bytes for the taxid integer: call it 20.
813    */
814   chu->mdalloc = 20 * dd->chunk_maxseq;
815   ESL_ALLOC(chu->metadata, sizeof(char) * chu->mdalloc);
816 
817   return chu;
818 
819  ERROR:
820   dsqdata_chunk_Destroy(chu);
821   return NULL;
822 }
823 
824 
825 static void
dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK * chu)826 dsqdata_chunk_Destroy(ESL_DSQDATA_CHUNK *chu)
827 {
828   if (chu)
829     {
830       if (chu->metadata) free(chu->metadata);
831       if (chu->smem)     free(chu->smem);
832       if (chu->L)        free(chu->L);
833       if (chu->taxid)    free(chu->taxid);
834       if (chu->desc)     free(chu->desc);
835       if (chu->acc)      free(chu->acc);
836       if (chu->name)     free(chu->name);
837       if (chu->dsq)      free(chu->dsq);
838       free(chu);
839     }
840 }
841 
842 
843 /*****************************************************************
844  * 4. Loader and unpacker, the input threads
845  *****************************************************************/
846 
847 static void *
dsqdata_loader_thread(void * p)848 dsqdata_loader_thread(void *p)
849 {
850   ESL_DSQDATA         *dd        = (ESL_DSQDATA *) p;
851   ESL_DSQDATA_RECORD  *idx       = NULL;
852   ESL_DSQDATA_CHUNK   *chu       = NULL;
853   int64_t              nchunk    = 0;             // total number of chunks loaded (including empty EOF)
854   int                  nalloc    = 0;             // number of chunks we create, and need to destroy.
855   int                  nidx      = 0;             // how many records in <idx>: usually MAXSEQ, until end
856   int                  nload     = 0;             // how many sequences we load: >=1, <=nidx
857   int                  ncarried  = 0;             // how many records carry over to next iteration: nidx-nload
858   int                  nread     = 0;             // fread()'s return value
859   int                  nmeta     = 0;             // how many bytes of metadata we want to read for this chunk
860   int                  i0        = 0;             // absolute index of first record in <idx>, 0-offset
861   int64_t              psq_last  = -1;            // psq_end for record i0-1
862   int64_t              meta_last = -1;            // metadata_end for record i0-1
863   int                  u;                         // which unpacker outbox we put this chunk in
864   int                  status;
865 
866   /* Don't use <dd> until we get the structure-is-ready signal */
867   if ( pthread_mutex_lock(&dd->go_mutex)   != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
868   while (! dd->go) {
869     if ( pthread_cond_wait(&dd->go_cv, &dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_wait failed on go_cv");
870   }
871   if ( pthread_mutex_unlock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
872 
873   /* We can begin. */
874   ESL_ALLOC(idx, sizeof(ESL_DSQDATA_RECORD) * dd->chunk_maxseq);
875   while (1)
876     {
877       //printf("loader: working on chunk %d\n", (int) nchunk+1);
878 
879       /* Get a chunk structure we can use - either by creating it, or recycling it.
880        * We probably don't benefit from having more than <nconsumers> + 3*<n_unpackers> + 2,
881        * which is enough to have all threads working, all in/outboxes full, and at least 1
882        * waiting in recycling.
883        * SRE TODO: test, how many is optimal, does it matter?
884        *           the two limits below perform comparably in `esl_dsqdata_example -n`, but that
885        *           doesn't have high-cpu reader threads.
886        */
887       //if (nalloc < dd->nconsumers + dd->n_unpackers + 1)
888       if (nalloc < dd->nconsumers + 3 * dd->n_unpackers + 2)
889 	{
890 	  //printf("loader: creating a new chunk\n");
891 
892 	  if ( (chu = dsqdata_chunk_Create(dd)) == NULL) { status = eslEMEM; goto ERROR; }
893 	  nalloc++;
894 	}
895       else
896 	{
897 	  //printf("loader: getting a new chunk from recycling...\n");
898 
899 	  if ( pthread_mutex_lock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
900 	  while (dd->recycling == NULL) {
901 	    if ( pthread_cond_wait(&dd->recycling_cv, &dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
902 	  }
903 	  chu           = dd->recycling;                 // pop one off recycling stack
904 	  dd->recycling = chu->nxt;
905 	  if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
906 	  if ( pthread_cond_signal(&dd->recycling_cv)     != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed"); 	  // signal *after* unlocking mutex
907 
908 	  //printf("loader: ... done, have new chunk from recycling.\n");
909 	}
910 
911       /* Refill index. (The memmove is avoidable. Alt strategy: we could load in 2 frames)
912        * The previous loop loaded packed sequence for <nload'> of the <nidx'> entries,
913        * where the 's indicate the variable has carried over from prev iteration:
914        *       |----- nload' ----||--- (ncarried) ---|
915        *       |-------------- nidx' ----------------|
916        * Now we're going to shift the remainder ncarried = nidx-nload to the left, then refill:
917        *       |---- ncarried ----||--- (MAXSEQ-ncarried) ---|
918        *       |-------------- MAXSEQ -----------------------|
919        * while watching out for the terminal case where we run out of
920        * data, loading less than (MAXSEQ-ncarried) records:
921        *       |---- ncarried ----||--- nidx* ---|
922        *       |------------- nidx --------------|
923        * where the <nidx*> is what fread() returns to us.
924        */
925       i0      += nload;               // this chunk starts with seq #<i0>
926       ncarried = (nidx - nload);
927       memmove(idx, idx + nload, sizeof(ESL_DSQDATA_RECORD) * ncarried);
928       nidx  = fread(idx + ncarried, sizeof(ESL_DSQDATA_RECORD), dd->chunk_maxseq - ncarried, dd->ifp);
929       nidx += ncarried;               // usually, this'll be MAXSEQ, unless we're near EOF.
930 
931       if (nidx == 0)  // then we're EOD.
932 	{
933 	  //printf("loader: reached EOD.\n");
934 	  dsqdata_chunk_Destroy(chu);
935 	  nalloc--;  // we'd counted that chunk towards <nalloc>.
936 	  break;     // this is the only way out of loader's main loop
937 	}
938 
939 
940       /* Figure out how many sequences we're going to load: <nload>
941        *  nload = max i : i <= MAXSEQ && idx[i].psq_end - psq_last <= CHUNK_MAX
942        */
943       ESL_DASSERT1(( idx[0].psq_end - psq_last <= dd->chunk_maxpacket ));
944       if (idx[nidx-1].psq_end - psq_last <= dd->chunk_maxpacket)
945 	nload = nidx;
946       else
947 	{ // Binary search for nload = max_i idx[i-1].psq_end - lastend <= MAX
948 	  int righti = nidx;
949 	  int mid;
950 	  nload = 1;
951 	  while (righti - nload > 1)
952 	    {
953 	      mid = nload + (righti - nload) / 2;
954 	      if (idx[mid-1].psq_end - psq_last <= dd->chunk_maxpacket) nload = mid;
955 	      else righti = mid;
956 	    }
957 	}
958 
959       /* Read packed sequence. */
960       //printf("loader: loading chunk %d from disk.\n", (int) nchunk+1);
961 
962       chu->pn = idx[nload-1].psq_end - psq_last;
963       nread   = fread(chu->psq, sizeof(uint32_t), chu->pn, dd->sfp);
964       //printf("Read %d packed ints from seq file\n", nread);
965       if ( nread != chu->pn ) ESL_XEXCEPTION(eslEOD, "dsqdata packet loader: expected %d, got %d", chu->pn, nread);
966 
967 
968       /* Read metadata, reallocating if needed */
969       nmeta = idx[nload-1].metadata_end - meta_last;
970       if (nmeta > chu->mdalloc) {
971 	ESL_REALLOC(chu->metadata, sizeof(char) * nmeta);   // should be realloc by doubling instead?
972 	chu->mdalloc = nmeta;
973       }
974       nread  = fread(chu->metadata, sizeof(char), nmeta, dd->mfp);
975       if ( nread != nmeta ) ESL_XEXCEPTION(eslEOD, "dsqdata metadata loader: expected %d, got %d", nmeta, nread);
976 
977       chu->i0   = i0;
978       chu->N    = nload;
979       psq_last  = idx[nload-1].psq_end;
980       meta_last = idx[nload-1].metadata_end;
981 
982       /* Put chunk in appropriate outbox.
983        */
984       u = nchunk % dd->n_unpackers;   // note this is the loader's own private <nchunk> counter, not the consumer-shared one in <dd>
985       //printf("loader: about to put chunk %d into inbox %d\n", (int) nchunk+1, u);
986       if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
987       while (dd->inbox[u] != NULL) {
988 	if (pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
989       }
990       dd->inbox[u] = chu;
991       if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
992       if ( pthread_cond_signal(&(dd->inbox_cv[u]))     != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
993       //printf("loader: finished putting chunk %d into inbox %d\n", (int) nchunk+1, u);
994 
995       nchunk++;
996     }
997 
998   /* Cleanup time. First, set all unpacker inboxes to EOD state. (We overwrite <u> here.)
999    */
1000   for (u = 0; u < dd->n_unpackers; u++)
1001     {
1002       //printf("loader: setting EOD on inbox %d\n", u);
1003       if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1004       while (dd->inbox[u] != NULL) {
1005 	if (pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1006       }
1007       dd->inbox_eod[u] = TRUE;  // we know inbox[u] is already NULL
1008       if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1009       if ( pthread_cond_signal(&(dd->inbox_cv[u]))     != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1010     }
1011 
1012 
1013   /* Then, wait to get all our chunks back through the recycling, so we
1014    * can free them and exit cleanly. We counted them as they went out,
1015    * in <nalloc>, so we know how many need to come home to roost.
1016    */
1017   while (nalloc)
1018     {
1019       //printf("loader: clearing recycling, %d chunks left to free\n", nalloc);
1020       if ( pthread_mutex_lock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1021       while (dd->recycling == NULL)  {               // Readers may still be working, will Recycle() their chunks
1022 	if ( pthread_cond_wait(&dd->recycling_cv, &dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1023       }
1024       while (dd->recycling != NULL) {               // Free entire stack, while we have the mutex locked.
1025 	chu           = dd->recycling;
1026 	dd->recycling = chu->nxt;
1027 	dsqdata_chunk_Destroy(chu);
1028 	nalloc--;
1029       }
1030       if ( pthread_mutex_unlock(&dd->recycling_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1031       /* Because the recycling is a stack, consumers never have to wait
1032        * on a condition to Recycle(), so we don't need to signal anything.
1033        */
1034     }
1035   //printf("loader: exiting\n");
1036   free(idx);
1037   pthread_exit(NULL);
1038 
1039  ERROR:
1040   /* Defying Easel standards, we treat all exceptions as fatal, at
1041    * least for the moment.  This isn't a problem in HMMER, Infernal
1042    * because they already use fatal exception handlers (i.e., we never
1043    * reach this code anyway, if the parent app is using default fatal
1044    * exception handling). It would become a problem if an Easel-based
1045    * app needs to assure no exits from within Easel. Because the other
1046    * threads will block waiting for chunks to come from the loader, if
1047    * the loader fails, we would need a back channel signal of some
1048    * sort to get the other threads to clean up and terminate.
1049    */
1050   if (idx) free(idx);
1051   esl_fatal("  ... dsqdata loader thread failed: unrecoverable");
1052 }
1053 
1054 
1055 
1056 static void *
dsqdata_unpacker_thread(void * p)1057 dsqdata_unpacker_thread(void *p)
1058 {
1059   ESL_DSQDATA          *dd    = (ESL_DSQDATA *) p;
1060   ESL_DSQDATA_CHUNK    *chu   = NULL;
1061   pthread_t             my_id = pthread_self();
1062   int                   u;
1063   int                   status;
1064 
1065   /* Don't use <dd> until we get the structure-is-ready signal */
1066   if ( pthread_mutex_lock(&dd->go_mutex)   != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
1067   while (! dd->go) {
1068     if ( pthread_cond_wait(&dd->go_cv, &dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_cond_wait failed on go_cv");
1069   }
1070   if ( pthread_mutex_unlock(&dd->go_mutex) != 0) ESL_XEXCEPTION(eslESYS, "pthread_mutex_lock failed on go_mutex");
1071 
1072   /* There may be more than one unpacker. Figure out who we are, so we
1073    * know which slot is ours. (This is why it's especially important
1074    * for the unpacker threads to wait for <dd> to be fully
1075    * initialized.)
1076    */
1077   for (u = 0; u < dd->n_unpackers; u++)
1078     if ( pthread_equal(my_id, dd->unpacker_t[u] )) break;
1079   if (u >= dd->n_unpackers) esl_fatal("unpacker failed to figure out its identity");
1080 
1081   //printf("unpacker thread %d: ready.\n", u);
1082 
1083   /* Ready. Let's go. */
1084   do {
1085     //printf("unpacker thread %d: checking for next chunk.\n", u);
1086 
1087     /* Get a chunk from loader in our inbox. Wait if necessary. */
1088     if ( pthread_mutex_lock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1089     while (! dd->inbox_eod[u] && dd->inbox[u] == NULL) {
1090       if ( pthread_cond_wait(&(dd->inbox_cv[u]), &(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1091     }
1092     chu           = dd->inbox[u];  // NULL if we're EOD
1093     dd->inbox[u]  = NULL;
1094     if ( pthread_mutex_unlock(&(dd->inbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1095 
1096     //if (chu) printf("unpacker thread %d: took encoded chunk from inbox\n", u);
1097     //else     printf("unpacker thread %d: EOD\n", u);
1098 
1099     if (chu) {
1100       /* only need to signal inbox change to the loader if we're not EOD */
1101       if ( pthread_cond_signal(&(dd->inbox_cv[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1102       /* unpack it */
1103       if (( status = dsqdata_unpack_chunk(chu, dd->pack5)) != eslOK) goto ERROR;
1104     }
1105 
1106     /* Put unpacked chunk into the unpacker's outbox, or set EOD status.
1107      * May need to wait for it to be empty/available.
1108      */
1109     if ( pthread_mutex_lock(&(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex lock failed");
1110     while (dd->outbox[u] != NULL) {
1111       if ( pthread_cond_wait(&(dd->outbox_cv[u]), &(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread cond wait failed");
1112     }
1113     dd->outbox[u] = chu;  // that's NULL if we're EOD
1114     if (! chu) dd->outbox_eod[u] = TRUE;
1115     if ( pthread_mutex_unlock(&(dd->outbox_mutex[u])) != 0) ESL_XEXCEPTION(eslESYS, "pthread mutex unlock failed");
1116     if ( pthread_cond_signal(&(dd->outbox_cv[u]))     != 0) ESL_XEXCEPTION(eslESYS, "pthread cond signal failed");
1117 
1118     //if (chu) printf("unpacker thread %d: placed unpacked chunk on outbox\n", u);
1119     //else     printf("unpacker thread %d: exiting EOD\n", u);
1120   } while (chu);
1121   pthread_exit(NULL);
1122 
1123  ERROR:
1124   /* See comment in loader thread: for lack of a back channel mechanism
1125    * to tell other threads to clean up and terminate, we violate Easel standards
1126    * and turn nonfatal exceptions into fatal ones.
1127    */
1128   esl_fatal("  ... dsqdata unpacker thread failed: unrecoverable");
1129 }
1130 
1131 
1132 /*****************************************************************
1133  * 5. Packing sequences and unpacking chunks
1134  *****************************************************************/
1135 
1136 /* dsqdata_unpack_chunk()
1137  *
1138  * <do_pack5> is a hint: if caller knows that all the packets in the
1139  * chunk are 5-bit encoded (i.e. amino acid sequence), it can pass
1140  * <TRUE>, enabling a small optimization. Otherwise the packed
1141  * sequences will be treated as mixed 2- and 5-bit encoding, as is
1142  * needed for DNA/RNA sequences; protein sequences also unpack fine
1143  * that way, but the 5-bit flag on every packet needs to be checked.
1144  *
1145  * Throws:    <eslEFORMAT> if a problem is seen in the binary format
1146  */
1147 static int
dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK * chu,int do_pack5)1148 dsqdata_unpack_chunk(ESL_DSQDATA_CHUNK *chu, int do_pack5)
1149 {
1150   char     *ptr = chu->metadata;           // ptr will walk through metadata
1151   int       r;                             // position in unpacked dsq array
1152   int       i;                             // sequence index: 0..chu->N-1
1153   int       pos;                           // position in packet array
1154   int       L;                             // an unpacked sequence length
1155   int       P;                             // number of packets unpacked
1156 
1157   /* "Unpack" the metadata */
1158   for (i = 0; i < chu->N; i++)
1159     {
1160       /* The data are user input, so we cannot trust that it has \0's where we expect them.  */
1161       if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1162       chu->name[i] = ptr;                           ptr = 1 + strchr(ptr, '\0');   if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1163       chu->acc[i]  = ptr;                           ptr = 1 + strchr(ptr, '\0');   if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1164       chu->desc[i] = ptr;                           ptr = 1 + strchr(ptr, '\0');   if ( ptr >= chu->metadata + chu->mdalloc) ESL_EXCEPTION(eslEFORMAT, "metadata format error");
1165       chu->taxid[i] = (int32_t) *((int32_t *) ptr); ptr += sizeof(int32_t);
1166     }
1167 
1168   /* Unpack the sequence data */
1169   i            = 0;
1170   r            = 0;
1171   pos          = 0;
1172   chu->smem[0] = eslDSQ_SENTINEL;  // This initialization is why <smem> needs to be unsigned.
1173   while (pos < chu->pn)
1174     {
1175       chu->dsq[i] = (ESL_DSQ *) chu->smem + r;
1176       if (do_pack5) dsqdata_unpack5(chu->psq + pos, chu->dsq[i], &L, &P);
1177       else          dsqdata_unpack2(chu->psq + pos, chu->dsq[i], &L, &P);
1178 
1179       r   += L+1;     // L+1, not L+2, because we overlap start/end sentinels
1180       pos += P;
1181       chu->L[i] = L;
1182       i++;
1183     }
1184 
1185   ESL_DASSERT1(( pos == chu->pn ));  // we should've unpacked exactly pn packets,
1186   ESL_DASSERT1((   i == chu->N ));   //  .. and exactly N sequences.
1187   return eslOK;
1188 }
1189 
1190 
1191 /* Unpack 5-bit encoded sequence, starting at <psq>.
1192  * Important: dsq[0] is already initialized to eslDSQ_SENTINEL,
1193  * as a nitpicky optimization (the sequence data in a chunk are
1194  * concatenated so that they share end/start sentinels).
1195  */
1196 static int
dsqdata_unpack5(uint32_t * psq,ESL_DSQ * dsq,int * ret_L,int * ret_P)1197 dsqdata_unpack5(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P)
1198 {
1199   int      pos = 0;          // position in psq[]
1200   int      r   = 1;          // position in dsq[]. caller set dsq[0] to eslDSQ_SENTINEL.
1201   uint32_t v   = psq[pos++];
1202   int      b;                // bit shift counter
1203 
1204   while (! ESL_DSQDATA_EOD(v))               // we trust that we'll see a sentinel at the end
1205     {
1206       ESL_DASSERT1(( ESL_DSQDATA_5BIT(v) )); // All packets are 5-bit encoded
1207       dsq[r++] = (v >> 25) & 31; dsq[r++] = (v >> 20) & 31; dsq[r++] = (v >> 15) & 31;
1208       dsq[r++] = (v >> 10) & 31; dsq[r++] = (v >>  5) & 31; dsq[r++] = (v >>  0) & 31;
1209       v = psq[pos++];
1210     }
1211 
1212   /* Unpack sentinel packet, which may be partial; it can even contain
1213    * zero residues in the edge case of an L=0 sequence.
1214    */
1215   ESL_DASSERT1(( ESL_DSQDATA_5BIT(v) ));
1216   for (b = 25; b >= 0 && ((v >> b) & 31) != 31; b -= 5)
1217     dsq[r++] = (v >> b) & 31;
1218   dsq[r++] = eslDSQ_SENTINEL;
1219   // r is now L+2:   the raw sequence length + 2 sentinels
1220   // P = pos, because pos index advanced to next packet after sentinel
1221   *ret_L = r-2;
1222   *ret_P = pos;
1223   return eslOK;
1224 }
1225 
1226 /* Unpack 2-bit (+ mixed 5-bit for noncanonicals) encoding.
1227  * Important: dsq[0] is already initialized to eslDSQ_SENTINEL
1228  *
1229  * This will work for protein sequences just fine; just a little
1230  * slower than calling dsqdata_unpack5(), because here we have
1231  * to check the 5-bit encoding bit on every packet.
1232  */
1233 static int
dsqdata_unpack2(uint32_t * psq,ESL_DSQ * dsq,int * ret_L,int * ret_P)1234 dsqdata_unpack2(uint32_t *psq, ESL_DSQ *dsq, int *ret_L, int *ret_P)
1235 {
1236   int      pos = 0;
1237   int      r   = 1;
1238   uint32_t v   = psq[pos++];
1239   int      b;                  // bit shift counter
1240 
1241   while (! ESL_DSQDATA_EOD(v))
1242     {
1243       if ( ESL_DSQDATA_5BIT(v))  // 5-bit encoded, full. Don't need mask on bit 31 because we know it's down.
1244 	{
1245 	  dsq[r++] = (v >> 25) & 31; dsq[r++] = (v >> 20) & 31; dsq[r++] = (v >> 15) & 31;
1246 	  dsq[r++] = (v >> 10) & 31; dsq[r++] = (v >>  5) & 31; dsq[r++] = (v >>  0) & 31;
1247 	}
1248       else                      // 2-bit encoded, full
1249 	{
1250 	  dsq[r++] = (v >> 28) & 3;  dsq[r++] = (v >> 26) & 3;  dsq[r++] = (v >> 24) & 3;
1251 	  dsq[r++] = (v >> 22) & 3;  dsq[r++] = (v >> 20) & 3;  dsq[r++] = (v >> 18) & 3;
1252 	  dsq[r++] = (v >> 16) & 3;  dsq[r++] = (v >> 14) & 3;  dsq[r++] = (v >> 12) & 3;
1253 	  dsq[r++] = (v >> 10) & 3;  dsq[r++] = (v >>  8) & 3;  dsq[r++] = (v >>  6) & 3;
1254 	  dsq[r++] = (v >>  4) & 3;  dsq[r++] = (v >>  2) & 3;  dsq[r++] = (v >>  0) & 3;
1255 	}
1256       v = psq[pos++];
1257     }
1258 
1259   /* Sentinel packet.
1260    * If 2-bit, it's full. If 5-bit, it's usually partial, and may even be 0-len.
1261    */
1262   if ( ESL_DSQDATA_5BIT(v)) // 5-bit, partial
1263     {
1264       for (b = 25; b >= 0 && ((v >> b) & 31) != 31; b -= 5)
1265 	dsq[r++] = (v >> b) & 31;
1266     }
1267   else
1268     {
1269       dsq[r++] = (v >> 28) & 3;  dsq[r++] = (v >> 26) & 3;  dsq[r++] = (v >> 24) & 3;
1270       dsq[r++] = (v >> 22) & 3;  dsq[r++] = (v >> 20) & 3;  dsq[r++] = (v >> 18) & 3;
1271       dsq[r++] = (v >> 16) & 3;  dsq[r++] = (v >> 14) & 3;  dsq[r++] = (v >> 12) & 3;
1272       dsq[r++] = (v >> 10) & 3;  dsq[r++] = (v >>  8) & 3;  dsq[r++] = (v >>  6) & 3;
1273       dsq[r++] = (v >>  4) & 3;  dsq[r++] = (v >>  2) & 3;  dsq[r++] = (v >>  0) & 3;
1274     }
1275   dsq[r++] = eslDSQ_SENTINEL;
1276 
1277   *ret_L = r-2;
1278   *ret_P = pos;
1279   return eslOK;
1280 }
1281 
1282 
1283 /* dsqdata_pack5()
1284  *
1285  * Pack a digital (protein) sequence <dsq> of length <n>, into <psq>
1286  * using 5-bit encoding; return the number of packets <*ret_P>.
1287  *
1288  * <psq> must be allocated for at least $MAX(1, (n+5)/6)$ packets.
1289  *
1290  * You can pack in place, by passing the same pointer <dsq> as <psq>,
1291  * provided that dsq is allocated for at least 1 packet (4 bytes).  We
1292  * know that <psq> is either smaller than <dsq> ($4P <= n$) or that it
1293  * consists of one EOD packet (in the case n=0).
1294  */
1295 static int
dsqdata_pack5(ESL_DSQ * dsq,int n,uint32_t * psq,int * ret_P)1296 dsqdata_pack5(ESL_DSQ *dsq, int n, uint32_t *psq, int *ret_P)
1297 {
1298   int        r   = 1;    // position in <dsq>
1299   int        pos = 0;    // position in <psq>.
1300   int        b;          // bitshift
1301   uint32_t   v;          // tmp var needed to guarantee pack-in-place works
1302 
1303   while (r <= n)
1304     {
1305       v = eslDSQDATA_5BIT;            // initialize packet with 5-bit flag
1306       for (b = 25; b >= 0 && r <= n; b -= 5)  v  |= (uint32_t) dsq[r++] << b;
1307       for (      ; b >= 0;           b -= 5)  v  |= (uint32_t)       31 << b;
1308 
1309       if (r > n) v |= eslDSQDATA_EOD; // EOD bit
1310       psq[pos++] = v;                 // we know we've already read all the dsq we need under psq[pos]
1311     }
1312 
1313   /* Special case of n=0: we need an empty EOD sentinel packet. */
1314   if (pos == 0) { v = 0; psq[pos++] = ~v; }   // all bits set: | EOD | 5BIT | all sentinels |
1315 
1316   *ret_P = pos;
1317   return eslOK;
1318 }
1319 
1320 
1321 /* dsqdata_pack2()
1322  *
1323  * Pack a digital (nucleic) sequence <dsq> of total length
1324  * <n>, into <psq>; return the number of packets <*ret_P>.
1325  *
1326  * <psq> must be allocated for at least $MAX(1, (n+5)/6)$ packets.
1327  * (Yes, even in 2-bit packing, because worst case, the sequence
1328  * contains so many noncanonicals that it's entirely 5-bit encoded.)
1329  *
1330  * You can pack in place, by passing the same pointer <dsq> as <psq>,
1331  * provided that dsq is allocated for at least 1 packet (4 bytes).  We
1332  * know that <psq> is either smaller than <dsq> ($4P <= n$) or that it
1333  * consists of one EOD packet (in the case n=0).
1334  */
1335 static int
dsqdata_pack2(ESL_DSQ * dsq,int n,uint32_t * psq,int * ret_P)1336 dsqdata_pack2(ESL_DSQ *dsq, int n, uint32_t *psq, int *ret_P)
1337 {
1338   int        pos  = 0;     // position in <psq>
1339   int        d    = 0;     // position of next degen residue, 1..n, n+1 if none
1340   int        r    = 1;     // position in <dsq> 1..n
1341   int        b;            // bitshift
1342   uint32_t   v;            // tmp var needed to guarantee pack-in-place works
1343 
1344   while (r <= n)
1345     {
1346       // Slide the "next degenerate residue" detector
1347       if (d < r)
1348 	for (d = r; d <= n; d++)
1349 	  if (dsq[d] > 3) break;
1350 
1351       // Can we 2-bit pack the next 15 residues, r..r+14?
1352       // n-r+1 = number of residues remaining to be packed.
1353       if (n-r+1 >= 15 && d > r+14)
1354 	{
1355 	  v  = 0;
1356 	  for (b = 28; b >= 0; b -=2) v |= (uint32_t) dsq[r++] << b;
1357 	}
1358       else
1359 	{
1360 	  v = eslDSQDATA_5BIT; // initialize v with 5-bit packing bit
1361 	  for (b = 25; b >= 0 && r <= n; b -= 5) v  |= (uint32_t) dsq[r++] << b;
1362 	  for (      ; b >= 0;           b -= 5) v  |= (uint32_t)       31 << b;
1363 	}
1364 
1365       if (r > n) v |= eslDSQDATA_EOD; // EOD bit
1366       psq[pos++] = v;                 // we know we've already read all the dsq we need under psq[pos]
1367     }
1368 
1369   /* Special case of n=0: we need an empty EOD sentinel packet.
1370    * Sentinel packets are 5-bit encoded, even in 2-bit coding scheme
1371    */
1372   if (pos == 0) { v = 0; psq[pos++] = ~v; }   // all bits set: | EOD | 5BIT | all sentinels |
1373 
1374   *ret_P = pos;
1375   return eslOK;
1376 }
1377 
1378 
1379 /*****************************************************************
1380  * 6. Notes
1381  *****************************************************************
1382  *
1383  * [1] Packed sequence data format.
1384  *
1385  *      Format of a single packet:
1386  *      [31] [30] [29..25]  [24..20]  [19..15]  [14..10]  [ 9..5 ]  [ 4..0 ]
1387  *       ^    ^   |------------  6 5-bit packed residues ------------------|
1388  *       |    |   []  []  []  []  []  []  []  []  []  []  []  []  []  []  []
1389  *       |    |   |----------- or 15 2-bit packed residues ----------------|
1390  *       |    |
1391  *       |    "packtype" bit 30 = 0 if packet is 2-bit packed; 1 if 5-bit packed
1392  *       "sentinel" bit 31 = 1 if last packet in packed sequence; else 0
1393  *
1394  *       (packet & (1 << 31)) tests for end of sequence
1395  *       (packet & (1 << 30)) tests for 5-bit packing vs. 2-bit
1396  *       ((packet >> shift) && 31) decodes 5-bit, for shift=25..0 in steps of 5
1397  *       ((packet >> shift) && 3)  decodes 2-bit, for shift=28..0 in steps of 2
1398  *
1399  *       Packets without the sentinel bit set are always full (unpack
1400  *       to 15 or 6 residue codes).
1401  *
1402  *       5-bit EOD packets may be partial: they unpack to 0..6
1403  *       residues.  The remaining residue codes are set to 0x1f
1404  *       (11111) to indicate EOD within a partial packet.
1405  *
1406  *       A 0-length sequence is encoded by a 5-bit partial EOD packet
1407  *       with 0 residues. This is the only case in which a partial
1408  *       packet contains 0 residues. (Because we can end with an EOD
1409  *       full packet, there is no other case where we end up with 0
1410  *       leftover residues to encode.)
1411  *
1412  *       2-bit EOD packets must be full, because there is no way to
1413  *       signal EOD locally within a 2-bit packet. Can't use 0x03 (11)
1414  *       because that's T/U. Generally, then, the last packet of a
1415  *       nucleic acid sequence must be 5-bit encoded, solely to be
1416  *       able to encode EOD in a partial packet.
1417  *
1418  *       A packed sequence consists of an integer number of packets,
1419  *       P, which ends with an EOD packet that may contain a partial
1420  *       number of residues. P packets are guaranteed to be able to
1421  *       encode at least 6P residues in either scheme.
1422  *
1423  *       A sequence of length L packs into P <= MAX(1, (N+5)/6)
1424  *       packets. (1, because a 0-length sequence still requires an
1425  *       EOD packet.) This is true even for nucleic sequences, because
1426  *       noncanonical residues can force DNA/RNA sequence to pack
1427  *       entirely in 5-bit coding.
1428  *
1429  *       A packed amino acid sequence unpacks to 6P-5 <= L <= 6P
1430  *       residues (for P>1; 0 <= L <= 6 for P=1) and all packets are
1431  *       5-bit encoded.
1432  *
1433  *       A packed nucleic acid sequence unpacks to 6P-5 <= L <= 15P
1434  *       residues (for P>1; 0 <= L <= 15 for P=1). The packets are a
1435  *       mix of 2-bit and 5-bit. Degenerate residues must be 5-bit
1436  *       packed, and the EOD packet usually is too. A 5-bit packet
1437  *       does not have to contain degenerate residues, because it may
1438  *       have been necessary to get "in frame" to pack a downstream
1439  *       degenerate residue. For example, the sequence
1440  *       ACGTACGTNNA... must be packed as [ACGTAC][CGTNNA]... to get
1441  *       the N's packed correctly.
1442  *
1443  * [2] Compression: relative incompressibility of biological sequences.
1444  *
1445  *      Considered using fast (de)compression algorithms that are fast
1446  *      enough to keep up with disk read speed, including LZ4 and
1447  *      Google's Snappy. However, lz4 only achieves 1.0-1.9x global
1448  *      compression of protein sequence (compared to 1.5x for
1449  *      packing), and 2.0x for DNA (compared to 3.75x for packing).
1450  *      With local, blockwise compression, which we need for random
1451  *      access and indexing, it gets worse. Packing is superior.
1452  *
1453  *      Metadata compression is more feasible, but I still opted
1454  *      against it. Although metadata are globally quite compressible
1455  *      (3.2-6.9x in trials with lz4), locally in 64K blocks lz4 only
1456  *      achieves 2x.  [xref SRE:2016/0201-seqfile-compression]
1457  *
1458  * [3] Maybe getting more packing using run-length encoding.
1459  *
1460  *      Genome assemblies typically have long runs of N's (human
1461  *      GRCh38.p2 is about 5% N), and it's excruciating to have to
1462  *      pack it into bulky 5-bit degenerate packets. I considered
1463  *      run-length encoding (RLE). One possibility is to use a special
1464  *      packet format akin to the 5-bit packet format:
1465  *
1466  *        [0] [?] [11111] [.....] [....................]
1467  *        ^        ^       ^       20b number, <=2^20-1
1468  *        |        |       5-bit residue code
1469  *        |        sentinel residue 31 set
1470  *        sentinel bit unset
1471  *
1472  *      This is a uniquely detectable packet structure because a full
1473  *      packet (with unset sentinel bit) would otherwise never contain
1474  *      a sentinel residue (code 31).
1475  *
1476  *      However, using RLE would make our unpacked data sizes too
1477  *      unpredictable; we wouldn't have the <=6P or <=15P guarantee,
1478  *      so we couldn't rely on fixed-length allocation of <smem> in
1479  *      our chunk. Consumers wouldn't be getting predictable chunk
1480  *      sizes, which could complicate load balancing. I decided
1481  *      against it.
1482  */
1483 
1484 
1485 /*****************************************************************
1486  * 7. Unit tests
1487  *****************************************************************/
1488 #ifdef eslDSQDATA_TESTDRIVE
1489 
1490 #include "esl_randomseq.h"
1491 
1492 /* Exercise the packing and unpacking routines:
1493  *    dsqdata_pack2, dsqdata_pack5, and dsqdata_unpack
1494  */
1495 static void
utest_packing(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc,int nsamples)1496 utest_packing(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, int nsamples)
1497 {
1498   char      msg[] = "esl_dsqdata :: packing unit test failed";
1499   ESL_DSQ  *dsq   = NULL;  // We start with a dirty random sequence...
1500   uint32_t *psq   = NULL;  //   ... pack it ...
1501   ESL_DSQ  *dsq2  = NULL;  //   ... and unpack it. Then check that it's the same seq.
1502   int       L_max = 46;    // We'll sample L on 0..L_max. L_max doesn't need to be large to exercise well.
1503   int       P_max = ESL_MAX(1, (L_max + 5) / 6); // So sayeth the docs, so let's test it.
1504   int       L, P, L2, P2;
1505   int       i;
1506 
1507   if ((dsq  = malloc(sizeof(ESL_DSQ)  * (L_max + 2))) == NULL) esl_fatal(msg);
1508   if ((psq  = malloc(sizeof(uint32_t) * P_max))       == NULL) esl_fatal(msg);
1509   if ((dsq2 = malloc(sizeof(ESL_DSQ)  * (L_max + 2))) == NULL) esl_fatal(msg);
1510 
1511   for (i = 0; i < nsamples; i++)
1512     {
1513       L = esl_rnd_Roll(rng, L_max+1); // 0..L_max
1514 
1515       esl_rsq_SampleDirty(rng, abc, NULL, L, dsq);
1516 
1517       if (abc->type == eslAMINO) { if ( dsqdata_pack5(dsq, L, psq, &P) != eslOK) esl_fatal(msg); }
1518       else                       { if ( dsqdata_pack2(dsq, L, psq, &P) != eslOK) esl_fatal(msg); }
1519 
1520       dsq2[0] = eslDSQ_SENTINEL;  // interface to _unpack functions requires caller to do this
1521       if (abc->type == eslAMINO) { if ( dsqdata_unpack5(psq, dsq2, &L2, &P2) != eslOK) esl_fatal(msg); }
1522       else                       { if ( dsqdata_unpack2(psq, dsq2, &L2, &P2) != eslOK) esl_fatal(msg); }
1523 
1524       if (L2 != L)                                       esl_fatal(msg);
1525       if (P2 != P)                                       esl_fatal(msg);
1526       if (memcmp((void *) dsq, (void *) dsq2, L+2) != 0) esl_fatal(msg);
1527 
1528       /* Write garbage into the buffers, so nobody's cheating on the test somehow */
1529       esl_rnd_mem(rng, (void *) dsq,  L_max+2);
1530       esl_rnd_mem(rng, (void *) dsq2, L_max+2);
1531       esl_rnd_mem(rng, (void *) psq,  (sizeof(uint32_t) * P_max));
1532     }
1533 
1534   free(dsq);
1535   free(psq);
1536   free(dsq2);
1537 }
1538 
1539 
1540 static void
utest_readwrite(ESL_RANDOMNESS * rng,ESL_ALPHABET * abc)1541 utest_readwrite(ESL_RANDOMNESS *rng, ESL_ALPHABET *abc)
1542 {
1543   char               msg[]         = "esl_dsqdata :: readwrite unit test failed";
1544   char               tmpfile[16]   = "esltmpXXXXXX";
1545   char               basename[32];
1546   ESL_SQ           **sqarr         = NULL;
1547   FILE              *tmpfp         = NULL;
1548   ESL_SQFILE        *sqfp          = NULL;
1549   ESL_DSQDATA       *dd            = NULL;
1550   ESL_DSQDATA_CHUNK *chu           = NULL;
1551   int               nseq           = 1 + esl_rnd_Roll(rng, 20000);  // 1..20000
1552   int               maxL           = 100;
1553   int               i;
1554   int               status;
1555 
1556   /* 1. Sample <nseq> random dirty digital sequences, storing them for later comparison;
1557    *    write them out to a tmp FASTA file. The Easel FASTA format writer writes <name> <acc>
1558    *    <desc> on the descline, but the reader only reads <name> <desc> (as is standard
1559    *    for FASTA format), so blank the accession to avoid confusion.
1560    */
1561   if (( status = esl_tmpfile_named(tmpfile, &tmpfp)) != eslOK) esl_fatal(msg);
1562   if (( sqarr = malloc(sizeof(ESL_SQ *) * nseq))      == NULL) esl_fatal(msg);
1563   for (i = 0; i < nseq; i++)
1564     {
1565       sqarr[i] = NULL;
1566       if (( status = esl_sq_Sample(rng, abc, maxL, &(sqarr[i])))              != eslOK) esl_fatal(msg);
1567       if (( status = esl_sq_SetAccession(sqarr[i], ""))                       != eslOK) esl_fatal(msg);
1568       if (( status = esl_sqio_Write(tmpfp, sqarr[i], eslSQFILE_FASTA, FALSE)) != eslOK) esl_fatal(msg);
1569     }
1570   fclose(tmpfp);
1571 
1572   /* 2.  Make a dsqdata database from the FASTA tmpfile.
1573    */
1574   if (( status = esl_sqfile_OpenDigital(abc, tmpfile, eslSQFILE_FASTA, NULL, &sqfp)) != eslOK) esl_fatal(msg);
1575   if ((          snprintf(basename, 32, "%s-db", tmpfile))                           <= 0)     esl_fatal(msg);
1576   if (( status = esl_dsqdata_Write(sqfp, basename, NULL))                            != eslOK) esl_fatal(msg);
1577   esl_sqfile_Close(sqfp);
1578 
1579   /* 3.  Open and read the dsqdata; compare to the original sequences.
1580    */
1581   if    (( status = esl_dsqdata_Open(&abc, basename, 1, &dd)) != eslOK)  esl_fatal(msg);
1582   while (( status = esl_dsqdata_Read(dd, &chu)) == eslOK)
1583     {
1584       for (i = 0; i < chu->N; i++)
1585 	{
1586 	  if ( chu->L[i]          != sqarr[i+chu->i0]->n )                   esl_fatal(msg);
1587 	  if ( memcmp( chu->dsq[i],  sqarr[i+chu->i0]->dsq, chu->L[i]) != 0) esl_fatal(msg);
1588 	  if ( strcmp( chu->name[i], sqarr[i+chu->i0]->name)           != 0) esl_fatal(msg);
1589 	  // FASTA does not read accession - instead we get both accession/description as <desc>
1590 	  if ( strcmp( chu->desc[i], sqarr[i+chu->i0]->desc)           != 0) esl_fatal(msg);
1591 	  // FASTA also does not store taxid - so don't test that either
1592 	}
1593       esl_dsqdata_Recycle(dd, chu);
1594     }
1595   if (status != eslEOF) esl_fatal(msg);
1596   esl_dsqdata_Close(dd);
1597 
1598   remove(tmpfile);
1599   remove(basename);
1600   snprintf(basename, 32, "%s-db.dsqi", tmpfile); remove(basename);
1601   snprintf(basename, 32, "%s-db.dsqm", tmpfile); remove(basename);
1602   snprintf(basename, 32, "%s-db.dsqs", tmpfile); remove(basename);
1603   for (i = 0; i < nseq; i++) esl_sq_Destroy(sqarr[i]);
1604   free(sqarr);
1605 }
1606 #endif /*eslDSQDATA_TESTDRIVE*/
1607 
1608 
1609 
1610 /*****************************************************************
1611  * 8. Test driver
1612  *****************************************************************/
1613 #ifdef eslDSQDATA_TESTDRIVE
1614 
1615 #include "easel.h"
1616 #include "esl_alphabet.h"
1617 #include "esl_dsqdata.h"
1618 #include "esl_getopts.h"
1619 #include "esl_random.h"
1620 
1621 static ESL_OPTIONS options[] = {
1622   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1623   { "-h",        eslARG_NONE,   FALSE, NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",           0 },
1624   { "-s",        eslARG_INT,      "0", NULL, NULL,  NULL,  NULL, NULL, "set random number seed to <n>",                  0 },
1625   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1626 };
1627 static char usage[]  = "[-options]";
1628 static char banner[] = "unit test driver for Easel dsqdata module";
1629 
1630 int
main(int argc,char ** argv)1631 main(int argc, char **argv)
1632 {
1633   ESL_GETOPTS    *go       = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
1634   ESL_RANDOMNESS *rng      = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));
1635   ESL_ALPHABET   *amino    = esl_alphabet_Create(eslAMINO);
1636   ESL_ALPHABET   *nucleic  = esl_alphabet_Create(eslRNA);
1637   int             nsamples = 100;
1638 
1639   fprintf(stderr, "## %s\n", argv[0]);
1640   fprintf(stderr, "#  rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
1641 
1642   utest_packing(rng, nucleic, nsamples);
1643   utest_packing(rng, amino,   nsamples);
1644 
1645   utest_readwrite(rng, nucleic);
1646   utest_readwrite(rng, amino);
1647 
1648   fprintf(stderr, "#  status = ok\n");
1649 
1650   esl_alphabet_Destroy(amino);
1651   esl_alphabet_Destroy(nucleic);
1652   esl_randomness_Destroy(rng);
1653   esl_getopts_Destroy(go);
1654   exit(0);
1655 }
1656 #endif /*eslDSQDATA_TESTDRIVE*/
1657 
1658 /*****************************************************************
1659  * 9. Examples
1660  *****************************************************************/
1661 
1662 /* esl_dsqdata_example2
1663  * Example of creating a new dsqdata database from a sequence file.
1664  */
1665 #ifdef eslDSQDATA_EXAMPLE2
1666 #include "easel.h"
1667 #include "esl_alphabet.h"
1668 #include "esl_dsqdata.h"
1669 #include "esl_getopts.h"
1670 
1671 static ESL_OPTIONS options[] = {
1672   /* name           type      default  env  range toggles reqs incomp  help                                       docgroup*/
1673   { "-h",        eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",    0 },
1674   { "--dna",     eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use DNA alphabet",                        0 },
1675   { "--rna",     eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use RNA alphabet",                        0 },
1676   { "--amino",   eslARG_NONE,   FALSE,  NULL, NULL,  NULL,  NULL, NULL, "use protein alphabet",                    0 },
1677   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1678 };
1679 static char usage[]  = "[-options] <seqfile_in> <binary seqfile_out>";
1680 static char banner[] = "experimental: create binary database for esl_dsqdata";
1681 
1682 int
main(int argc,char ** argv)1683 main(int argc, char **argv)
1684 {
1685   ESL_GETOPTS    *go        = esl_getopts_CreateDefaultApp(options, 2, argc, argv, banner, usage);
1686   ESL_ALPHABET   *abc       = NULL;
1687   char           *infile    = esl_opt_GetArg(go, 1);
1688   char           *basename  = esl_opt_GetArg(go, 2);
1689   int             format    = eslSQFILE_UNKNOWN;
1690   int             alphatype = eslUNKNOWN;
1691   ESL_SQFILE     *sqfp      = NULL;
1692   char            errbuf[eslERRBUFSIZE];
1693   int             status;
1694 
1695   status = esl_sqfile_Open(infile, format, NULL, &sqfp);
1696   if      (status == eslENOTFOUND) esl_fatal("No such file.");
1697   else if (status == eslEFORMAT)   esl_fatal("Format unrecognized.");
1698   else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);
1699 
1700   if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
1701   else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
1702   else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
1703   else {
1704     status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
1705     if      (status == eslENOALPHABET) esl_fatal("Couldn't guess alphabet from first sequence in %s", infile);
1706     else if (status == eslEFORMAT)     esl_fatal("Parse failed (sequence file %s)\n%s\n", infile, sqfp->get_error(sqfp));
1707     else if (status == eslENODATA)     esl_fatal("Sequence file %s contains no data?", infile);
1708     else if (status != eslOK)          esl_fatal("Failed to guess alphabet (error code %d)\n", status);
1709   }
1710   abc = esl_alphabet_Create(alphatype);
1711   esl_sqfile_SetDigital(sqfp, abc);
1712 
1713   status = esl_dsqdata_Write(sqfp, basename, errbuf);
1714   if      (status == eslEWRITE)  esl_fatal("Failed to open dsqdata output files:\n  %s", errbuf);
1715   else if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s)\n  %s", infile, sqfp->get_error(sqfp));
1716   else if (status != eslOK)      esl_fatal("Unexpected error while creating dsqdata file (code %d)\n", status);
1717 
1718   esl_sqfile_Close(sqfp);
1719   esl_alphabet_Destroy(abc);
1720   esl_getopts_Destroy(go);
1721   return eslOK;
1722 }
1723 #endif /*eslDSQDATA_EXAMPLE2*/
1724 
1725 
1726 /* esl_dsqdata_example
1727  * Example of opening and reading a dsqdata database.
1728  */
1729 #ifdef eslDSQDATA_EXAMPLE
1730 #include "easel.h"
1731 #include "esl_alphabet.h"
1732 #include "esl_dsqdata.h"
1733 #include "esl_getopts.h"
1734 #include "esl_vectorops.h"
1735 
1736 static ESL_OPTIONS options[] = {
1737   /* name             type          default  env  range toggles reqs incomp  help                                       docgroup*/
1738   { "-h",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "show brief help on version and usage",        0 },
1739   { "-c",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "report summary of chunk contents",            0 },
1740   { "-r",          eslARG_NONE,       FALSE,  NULL, NULL,  NULL,  NULL, NULL, "report summary of residue counts",            0 },
1741   {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
1742 };
1743 
1744 static char usage[]  = "[-options] <basename>";
1745 static char banner[] = "example of using ESL_DSQDATA to read sequence db";
1746 
1747 int
main(int argc,char ** argv)1748 main(int argc, char **argv)
1749 {
1750   ESL_GETOPTS       *go         = esl_getopts_CreateDefaultApp(options, 1, argc, argv, banner, usage);
1751   ESL_ALPHABET      *abc        = NULL;
1752   char              *basename   = esl_opt_GetArg(go, 1);
1753   int                do_summary = esl_opt_GetBoolean(go, "-c");
1754   int                do_resct   = esl_opt_GetBoolean(go, "-r");
1755   int                ncpu       = 1;
1756   ESL_DSQDATA       *dd         = NULL;
1757   ESL_DSQDATA_CHUNK *chu        = NULL;
1758   int                nchunk     = 0;
1759   int                i;
1760   int64_t            pos;
1761   int64_t            ct[128], total;
1762   int                x;
1763   int                status;
1764 
1765   status = esl_dsqdata_Open(&abc, basename, ncpu, &dd);
1766   if      (status == eslENOTFOUND) esl_fatal("Failed to open dsqdata files:\n  %s",    dd->errbuf);
1767   else if (status == eslEFORMAT)   esl_fatal("Format problem in dsqdata files:\n  %s", dd->errbuf);
1768   else if (status != eslOK)        esl_fatal("Unexpected error in opening dsqdata (code %d)", status);
1769 
1770   for (x = 0; x < 127; x++) ct[x] = 0;
1771 
1772   if (do_summary) esl_dataheader(stdout, 8, "idx", 4, "nseq", 8, "nres", 7, "npacket", 0);
1773 
1774   while ((status = esl_dsqdata_Read(dd, &chu)) == eslOK)
1775     {
1776       nchunk++;
1777 
1778       if (do_resct)
1779 	for (i = 0; i < chu->N; i++)
1780 	  for (pos = 1; pos <= chu->L[i]; pos++)
1781 	    ct[ chu->dsq[i][pos] ]++;
1782 
1783       if (do_summary)
1784 	printf("%-8d %4d %8" PRId64 " %7d\n", nchunk, chu->N, esl_vec_LSum(chu->L, chu->N), chu->pn);
1785 
1786       esl_dsqdata_Recycle(dd, chu);
1787     }
1788   if (status != eslEOF) esl_fatal("unexpected error %d in reading dsqdata", status);
1789 
1790   if (do_resct)
1791     {
1792       total = 0;
1793       for (x = 0; x < abc->Kp; x++)
1794 	{
1795 	  printf("%c  %" PRId64 "\n", abc->sym[x], ct[x]);
1796 	  total += ct[x];
1797 	}
1798       printf("Total = %" PRId64 "\n", total);
1799     }
1800 
1801   esl_alphabet_Destroy(abc);
1802   esl_dsqdata_Close(dd);
1803   esl_getopts_Destroy(go);
1804   return 0;
1805 }
1806 #endif /*eslDSQDATA_EXAMPLE*/
1807 
1808 
1809 
1810