1 /* esl_dsqdata : faster sequence input 2 */ 3 #ifndef eslDSQDATA_INCLUDED 4 #define eslDSQDATA_INCLUDED 5 #include "esl_config.h" 6 7 #include <stdio.h> 8 #include <stdint.h> 9 #include <pthread.h> 10 11 #include "easel.h" 12 #include "esl_alphabet.h" 13 #include "esl_sqio.h" 14 #ifdef __cplusplus // magic to make C++ compilers happy 15 extern "C" { 16 #endif 17 /* Defaults for control parameters 18 */ 19 #define eslDSQDATA_CHUNK_MAXSEQ 4096 // max number of sequences in a chunk 20 #define eslDSQDATA_CHUNK_MAXPACKET 262144 // max number of uint32 sequence packets in a chunk (1MiB chunks) 21 #define eslDSQDATA_UNPACKERS 4 // default number of unpacker threads 22 #define eslDSQDATA_UMAX 4 // max number of unpacker threads (compile-time) 23 24 25 /* ESL_DSQDATA_CHUNK 26 * A data chunk returned by esl_dsqdata_Read(). 27 */ 28 typedef struct esl_dsqdata_chunk_s { 29 int64_t i0; // Chunk contains sequences i0..i0+N-1 from the database, 0-offset 30 int N; // Chunk contains N sequences 31 32 ESL_DSQ **dsq; // Pointers to each of the N sequences 33 char **name; // Names, \0 terminated. Ptr into <metadata> buffer. 34 char **acc; // Optional accessions, \0 terminated; "\0" if none. 35 char **desc; // Optional descriptions, \0 terminated; "\0" if none 36 int32_t *taxid; // NCBI taxonomy identifiers. (>=1 is a taxid; -1 means none) 37 int64_t *L; // Sequence lengths, in residues. The unpacker figures these out. 38 39 /* Memory management */ 40 unsigned char *smem; // Unpacked (dsq[]) and packed (psq) data ptrs share this allocation. [can't be void; we do arithmetic on it] 41 uint32_t *psq; // Pointer into smem; packed data fread()'s go here. 42 int pn; // how many uint32's are loaded in <psq> 43 char *metadata; // Raw fread() buffer of all name/acc/desc/taxid data. 44 int mdalloc; // Current allocation size for <metadata> in bytes 45 struct esl_dsqdata_chunk_s *nxt; // Chunks can be put in linked lists 46 } ESL_DSQDATA_CHUNK; 47 48 49 50 /* ESL_DSQDATA_RECORD 51 * The dsqi index file is composed of an array of these, aside from its header. 52 */ 53 typedef struct esl_dsqdata_record_s { 54 int64_t metadata_end; 55 int64_t psq_end; 56 } ESL_DSQDATA_RECORD; 57 58 59 60 61 /* ESL_DSQDATA 62 * The object created by esl_dsqdata_Open() and used by esl_dsqdata_Read() 63 * to read chunks of sequence data from the database. 64 */ 65 typedef struct esl_dsqdata_s { 66 char *basename; // Basename of the four dsqdata data files 67 FILE *stubfp; // Open <basename> stub file 68 FILE *ifp; // Open basename.dsqi index file 69 FILE *sfp; // Open basename.dsqs sequence file 70 FILE *mfp; // Open basename.dsqm metadata file 71 ESL_ALPHABET *abc_r; // Copy of ptr to the alphabet the caller told us to read in. 72 73 /* Header information from dsqi index file 74 * .. dsqm, dsqs have magic and uniquetag for integrity checking 75 * .. and stub file has uniquetag as text. 76 */ 77 uint32_t magic; // Binary magic format code, for detecting byteswapping 78 uint32_t uniquetag; // Random number tag that links the four files 79 uint32_t flags; // Currently unused (0); reserved for future bitflags 80 uint32_t max_namelen; // Max name length in the dataset 81 uint32_t max_acclen; // .. and max accession length 82 uint32_t max_desclen; // .. and max description length 83 uint64_t max_seqlen; // .. and max seq length. 64b = bring on Paris japonica. 84 uint64_t nseq; // Total number of sequences in the dataset 85 uint64_t nres; // .. and total number of residues 86 87 /* Control parameters. */ 88 int chunk_maxseq; // default = eslDSQDATA_CHUNK_MAXSEQ 89 int chunk_maxpacket; // default = eslDSQDATA_CHUNK_MAXPACKET 90 int do_byteswap; // TRUE if we need to byteswap (bigendian <=> littleendian) 91 int pack5; // TRUE if we're using all 5bit packing; FALSE for mixed 2+5bit 92 93 /* Managing the reader's threaded producer/consumer pipeline: 94 * consisting of 1 loader thread and <n_unpackers> unpacker threads 95 * that we manage, and <nconsumers> consumer threads that caller 96 * created to get successive chunks with esl_dsqdata_Read(). 97 */ 98 int nconsumers; // caller told us the reader is being used by this many consumer threads 99 int n_unpackers; // number of unpacker threads 100 101 ESL_DSQDATA_CHUNK *inbox[eslDSQDATA_UMAX]; // unpacker input slots 102 pthread_mutex_t inbox_mutex[eslDSQDATA_UMAX]; // mutexes protecting the inboxes 103 pthread_cond_t inbox_cv[eslDSQDATA_UMAX]; // signal that state of inbox[u] has changed 104 int inbox_eod[eslDSQDATA_UMAX]; // flag that inbox[u] is in EOD state 105 106 ESL_DSQDATA_CHUNK *outbox[eslDSQDATA_UMAX]; // unpacker output slots 107 pthread_mutex_t outbox_mutex[eslDSQDATA_UMAX]; // mutexes protecting the outboxes 108 pthread_cond_t outbox_cv[eslDSQDATA_UMAX]; // signal that state of outbox[u] has changed 109 int outbox_eod[eslDSQDATA_UMAX]; // flag that outbox[u] is in EOD state 110 111 int64_t nchunk; // # of chunks read so far; shared across consumers 112 pthread_mutex_t nchunk_mutex; // mutex protecting access to <nchunk> from other consumers 113 114 ESL_DSQDATA_CHUNK *recycling; // linked list of chunk memory for reuse 115 pthread_mutex_t recycling_mutex; // mutex protecting the recycling list 116 pthread_cond_t recycling_cv; // signal to loader that a chunk is available 117 118 /* _Open() starts threads while it's still initializing. 119 * To be sure that initialization is complete before threads start their work, 120 * we use a condition variable to send a signal. 121 */ 122 int go; // TRUE when _Open() completes thread initialization. 123 pthread_mutex_t go_mutex; // 124 pthread_cond_t go_cv; // Used to signal worker threads that DSQDATA structure is ready. 125 126 pthread_t loader_t; // loader thread id 127 pthread_t unpacker_t[eslDSQDATA_UMAX]; // unpacker thread ids 128 129 char errbuf[eslERRBUFSIZE]; // User-directed error message in case of a failed open or read. 130 } ESL_DSQDATA; 131 132 133 134 135 136 /* Reading the control bits on a packet v 137 */ 138 #define eslDSQDATA_EOD (1 << 31) 139 #define eslDSQDATA_5BIT (1 << 30) 140 #define ESL_DSQDATA_EOD(v) ((v) & eslDSQDATA_EOD) 141 #define ESL_DSQDATA_5BIT(v) ((v) & eslDSQDATA_5BIT) 142 143 /* Functions in the API 144 */ 145 extern int esl_dsqdata_Open (ESL_ALPHABET **byp_abc, char *basename, int nconsumers, ESL_DSQDATA **ret_dd); 146 extern int esl_dsqdata_Read (ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK **ret_chu); 147 extern int esl_dsqdata_Recycle(ESL_DSQDATA *dd, ESL_DSQDATA_CHUNK *chu); 148 extern int esl_dsqdata_Close (ESL_DSQDATA *dd); 149 150 extern int esl_dsqdata_Write (ESL_SQFILE *sqfp, char *basename, char *errbuf); 151 #ifdef __cplusplus // magic to make C++ compilers happy 152 } 153 #endif 154 #endif /*eslDSQDATA_INCLUDED*/ 155