1 /* esl_dsqdata : faster sequence input
2  */
3 #ifndef eslDSQDATA_INCLUDED
4 #define eslDSQDATA_INCLUDED
5 #include "esl_config.h"
6 
7 #include <stdio.h>
8 #include <stdint.h>
9 #include <pthread.h>
10 
11 #include "easel.h"
12 #include "esl_alphabet.h"
13 #include "esl_sqio.h"
14 #ifdef __cplusplus // magic to make C++ compilers happy
15 extern "C" {
16 #endif
/* Defaults for control parameters.
 *
 * eslDSQDATA_UMAX is a compile-time ceiling: it dimensions the fixed-size
 * per-unpacker arrays in ESL_DSQDATA.
 */
#define eslDSQDATA_CHUNK_MAXSEQ     4096    /* max number of sequences in a chunk                                     */
#define eslDSQDATA_CHUNK_MAXPACKET  262144  /* max number of uint32 sequence packets in a chunk (1MiB chunks)         */
#define eslDSQDATA_UNPACKERS        4       /* default number of unpacker threads                                     */
#define eslDSQDATA_UMAX             4       /* max number of unpacker threads (compile-time)                          */
23 
24 
25 /* ESL_DSQDATA_CHUNK
26  * A data chunk returned by esl_dsqdata_Read().
27  */
28 typedef struct esl_dsqdata_chunk_s {
29   int64_t   i0;           // Chunk contains sequences i0..i0+N-1 from the database, 0-offset
30   int       N;            // Chunk contains N sequences
31 
32   ESL_DSQ **dsq;          // Pointers to each of the N sequences
33   char    **name;         // Names, \0 terminated.  Ptr into <metadata> buffer.
34   char    **acc;          // Optional accessions, \0 terminated;   "\0" if none.
35   char    **desc;         // Optional descriptions, \0 terminated; "\0" if none
36   int32_t  *taxid;        // NCBI taxonomy identifiers. (>=1 is a taxid; -1 means none)
37   int64_t  *L;            // Sequence lengths, in residues. The unpacker figures these out.
38 
39   /* Memory management */
40   unsigned char *smem;    // Unpacked (dsq[]) and packed (psq) data ptrs share this allocation. [can't be void; we do arithmetic on it]
41   uint32_t *psq;          // Pointer into smem; packed data fread()'s go here.
42   int       pn;           // how many uint32's are loaded in <psq>
43   char     *metadata;     // Raw fread() buffer of all name/acc/desc/taxid data.
44   int       mdalloc;      // Current allocation size for <metadata> in bytes
45   struct esl_dsqdata_chunk_s *nxt; // Chunks can be put in linked lists
46 } ESL_DSQDATA_CHUNK;
47 
48 
49 
/* ESL_DSQDATA_RECORD
 * Aside from its header, the dsqi index file consists of an array of
 * these records.
 */
typedef struct esl_dsqdata_record_s {
  int64_t  metadata_end;  /* NOTE(review): presumably an end position in the metadata (.dsqm) file; confirm against the reader */
  int64_t  psq_end;       /* NOTE(review): presumably an end position in the packed sequence (.dsqs) file; confirm             */
} ESL_DSQDATA_RECORD;
57 
58 
59 
60 
61 /* ESL_DSQDATA
62  * The object created by esl_dsqdata_Open() and used by esl_dsqdata_Read()
63  * to read chunks of sequence data from the database.
64  */
65 typedef struct esl_dsqdata_s {
66   char         *basename;    // Basename of the four dsqdata data files
67   FILE         *stubfp;      // Open <basename> stub file
68   FILE         *ifp;         // Open basename.dsqi index file
69   FILE         *sfp;         // Open basename.dsqs sequence file
70   FILE         *mfp;         // Open basename.dsqm metadata file
71   ESL_ALPHABET *abc_r;       // Copy of ptr to the alphabet the caller told us to read in.
72 
73   /* Header information from dsqi index file
74    *  .. dsqm, dsqs have magic and uniquetag for integrity checking
75    *  .. and stub file has uniquetag as text.
76    */
77   uint32_t     magic;       // Binary magic format code, for detecting byteswapping
78   uint32_t     uniquetag;   // Random number tag that links the four files
79   uint32_t     flags;       // Currently unused (0); reserved for future bitflags
80   uint32_t     max_namelen; // Max name length in the dataset
81   uint32_t     max_acclen;  //  .. and max accession length
82   uint32_t     max_desclen; //  .. and max description length
83   uint64_t     max_seqlen;  //  .. and max seq length. 64b = bring on Paris japonica.
84   uint64_t     nseq;        // Total number of sequences in the dataset
85   uint64_t     nres;        //  .. and total number of residues
86 
87   /* Control parameters. */
88   int          chunk_maxseq;    // default = eslDSQDATA_CHUNK_MAXSEQ
89   int          chunk_maxpacket; // default = eslDSQDATA_CHUNK_MAXPACKET
90   int          do_byteswap;     // TRUE if we need to byteswap (bigendian <=> littleendian)
91   int          pack5;           // TRUE if we're using all 5bit packing; FALSE for mixed 2+5bit
92 
93   /* Managing the reader's threaded producer/consumer pipeline:
94    * consisting of 1 loader thread and <n_unpackers> unpacker threads
95    * that we manage, and <nconsumers> consumer threads that caller
96    * created to get successive chunks with esl_dsqdata_Read().
97    */
98   int                nconsumers;                     // caller told us the reader is being used by this many consumer threads
99   int                n_unpackers;                    // number of unpacker threads
100 
101   ESL_DSQDATA_CHUNK *inbox[eslDSQDATA_UMAX];         // unpacker input slots
102   pthread_mutex_t    inbox_mutex[eslDSQDATA_UMAX];   // mutexes protecting the inboxes
103   pthread_cond_t     inbox_cv[eslDSQDATA_UMAX];      // signal that state of inbox[u] has changed
104   int                inbox_eod[eslDSQDATA_UMAX];     // flag that inbox[u] is in EOD state
105 
106   ESL_DSQDATA_CHUNK *outbox[eslDSQDATA_UMAX];        // unpacker output slots
107   pthread_mutex_t    outbox_mutex[eslDSQDATA_UMAX];  // mutexes protecting the outboxes
108   pthread_cond_t     outbox_cv[eslDSQDATA_UMAX];     // signal that state of outbox[u] has changed
109   int                outbox_eod[eslDSQDATA_UMAX];    // flag that outbox[u] is in EOD state
110 
111   int64_t            nchunk;                         // # of chunks read so far; shared across consumers
112   pthread_mutex_t    nchunk_mutex;                   // mutex protecting access to <nchunk> from other consumers
113 
114   ESL_DSQDATA_CHUNK *recycling;                      // linked list of chunk memory for reuse
115   pthread_mutex_t    recycling_mutex;                // mutex protecting the recycling list
116   pthread_cond_t     recycling_cv;                   // signal to loader that a chunk is available
117 
118   /* _Open() starts threads while it's still initializing.
119    * To be sure that initialization is complete before threads start their work,
120    * we use a condition variable to send a signal.
121    */
122   int                go;            // TRUE when _Open() completes thread initialization.
123   pthread_mutex_t    go_mutex;      //
124   pthread_cond_t     go_cv;         // Used to signal worker threads that DSQDATA structure is ready.
125 
126   pthread_t          loader_t;                       // loader thread id
127   pthread_t          unpacker_t[eslDSQDATA_UMAX];    // unpacker thread ids
128 
129   char errbuf[eslERRBUFSIZE];   // User-directed error message in case of a failed open or read.
130 } ESL_DSQDATA;
131 
132 
133 
134 
135 
/* Reading the control bits on a packet v
 *
 * Packets are uint32 values. Bit 31 carries the EOD flag and bit 30 the
 * 5BIT flag. ESL_DSQDATA_EOD(v) and ESL_DSQDATA_5BIT(v) evaluate to
 * nonzero if the corresponding bit is set in <v>, and to 0 otherwise.
 *
 * The masks are built from an unsigned 32-bit constant on purpose.
 * With a plain int, (1 << 31) shifts a 1 into the sign bit, which is
 * undefined behavior in C; where it "works" it produces a negative
 * value that sign-extends (setting bits 32..63 too) if it is ever
 * combined with a 64-bit operand. UINT32_C(1) keeps both masks
 * well-defined and exactly 0x80000000 / 0x40000000.
 */
#define eslDSQDATA_EOD   (UINT32_C(1) << 31)
#define eslDSQDATA_5BIT  (UINT32_C(1) << 30)
#define ESL_DSQDATA_EOD(v)   ((v) & eslDSQDATA_EOD)
#define ESL_DSQDATA_5BIT(v)  ((v) & eslDSQDATA_5BIT)
142 
143 /* Functions in the API
144  */
145 extern int  esl_dsqdata_Open   (ESL_ALPHABET **byp_abc, char *basename, int nconsumers, ESL_DSQDATA **ret_dd);
146 extern int  esl_dsqdata_Read   (ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK **ret_chu);
147 extern int  esl_dsqdata_Recycle(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK *chu);
148 extern int  esl_dsqdata_Close  (ESL_DSQDATA *dd);
149 
150 extern int  esl_dsqdata_Write  (ESL_SQFILE *sqfp, char *basename, char *errbuf);
151 #ifdef __cplusplus // magic to make C++ compilers happy
152 }
153 #endif
154 #endif /*eslDSQDATA_INCLUDED*/
155