1 #include "GCdbYank.h"
2 #include "GBase.h"
3 #include <ctype.h>
4
5 #define ERR_READ "GCdbYank: error reading from file.\n"
6 #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7 #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8 #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9 // 1MB memory buffer:
10 #define MAX_MEM_RECSIZE 1048576
11 #ifndef O_BINARY
12 #define O_BINARY 0x0000
13 #endif
14
15 //default size of the index record for records stored with 32bit offsets
16 uint32 irec_size32=8;
17
18 #ifdef ENABLE_COMPRESSION
19
GCdbZFasta(FILE * azf,int zrsize,char * r_delim)20 GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
21 zrecsize=-1;
22 zpos=0;
23 recdelim=Gstrdup(r_delim);
24 zf=azf;
25 decomp_start(zrsize);
26 chrhandler=new GFastaCharHandler(recdelim);
27 }
28
~GCdbZFasta()29 GCdbZFasta::~GCdbZFasta() {
30 //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
31 // FULL_FLUSH method instead of finish
32 delete chrhandler;
33 GFREE(recdelim);
34 decomp_end();
35 }
36
decomp_start(int zrsize)37 void GCdbZFasta::decomp_start(int zrsize) {
38 zstream.zalloc = (alloc_func)0;
39 zstream.zfree = (free_func)0;
40 zstream.opaque = (voidpf)0;
41 zstream.next_in = (Bytef*)sbuf;
42 zstream.avail_in = 0;
43 zstream.next_out = (Bytef*)lbuf;
44 int err = inflateInit(&zstream);
45 if (err!=Z_OK)
46 GMessage("Error at inflateInit()\n");
47 //-- now read and discard the first record, so we can use random access later
48 // (needed by zlib)
49 int bytes_read=fread(sbuf, 1, zrsize, zf);
50 if (bytes_read<zrsize)
51 GError("Error reading 1st record from zrec file\n");
52 zstream.next_in = (Bytef*)sbuf;
53 zstream.avail_in = bytes_read;
54 //decompress first chunk
55 zstream.next_out = (Bytef*)lbuf;
56 zstream.avail_out = GCDBZ_LBUF_LEN;
57 err = inflate(&zstream, Z_SYNC_FLUSH);
58 if (err !=Z_OK && err!=Z_STREAM_END)
59 GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
60 }
61
decomp_end()62 void GCdbZFasta::decomp_end() {
63 int err = inflateEnd(&zstream);
64 if (err!=Z_OK)
65 GError("Error at inflateEnd() (err=%d)\n", err);
66 }
67
68
69 //fasta record decompress
70 //returns: the number of bytes decompressed
decompress(FastaSeq & rec,int csize,int zfofs,charFunc * seqCallBack)71 int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
72 if (zfofs>=0) {
73 if (fseeko(zf, zfofs, 0))
74 GError("GCdbZFasta::decompress: error fseeko() to %d\n", zfofs);
75 }
76 else
77 if (feof(zf)) return 0;
78 bool in_rec=true;
79 int err=0;
80 int total_read=0;
81 int total_written=0;
82 chrhandler->init(&rec, seqCallBack);
83 while (in_rec) { // read loop
84 int to_read=0;
85 int bytes_read=0;
86 if (csize<=0) { //read one byte at a time
87 to_read=1;
88 int c;
89 if ((c =fgetc(zf))!=EOF) {
90 bytes_read = 1;
91 sbuf[0]=c;
92 }
93 else {
94 //bytes_read=0;
95 return 0; //eof
96 }
97 total_read+=bytes_read;
98 }
99 else {
100 to_read = csize-total_read>GCDBZ_SBUF_LEN ?
101 GCDBZ_SBUF_LEN : csize-total_read;
102 // check for csize vs bytes_read match:
103 if (to_read==0) return 0;
104 bytes_read=fread(sbuf, 1, to_read, zf);
105 if (bytes_read!=to_read)
106 GError("Error reading from zrec file\n");
107 total_read+=bytes_read;
108 in_rec=(total_read<csize);
109 }
110 if (bytes_read==0) {
111 //GMessage("bytes_read = 0\n");
112 return 0;
113 }
114 if (in_rec && bytes_read<to_read) in_rec=false;
115 zstream.next_in = (Bytef*)sbuf;
116 zstream.avail_in = bytes_read;
117
118 do { //decompression loop
119 zstream.next_out = (Bytef*)lbuf;
120 zstream.avail_out = GCDBZ_LBUF_LEN;
121 uLong t_out=zstream.total_out;
122 err = inflate(&zstream, Z_SYNC_FLUSH);
123 uLong toWrite=zstream.total_out-t_out;
124 if (toWrite>0) {
125 /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
126 GError("Error writing inflated chunk!\n");
127 } */
128 for (unsigned int i=0;i<toWrite;i++)
129 chrhandler->processChar(lbuf[i]);
130
131 total_written+=toWrite;
132 }
133 if (err==Z_STREAM_END) {
134 in_rec=false;
135 if (total_written==0) {
136 GMessage("Z_STREAM_END found but total_written=0!\n");
137 }
138 break;
139 }
140 else if (err !=Z_OK)
141 GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
142 } while (zstream.avail_in!=0); //decompression loop
143 } //read loop
144 chrhandler->done();
145 /*if (err!=Z_STREAM_END) {
146 GError("decompress: Z_STREAM_END not found!\n");
147 }*/
148 return total_written;
149 }
150
openCdbz(char * p)151 GCdbZFasta* GCdbYank::openCdbz(char* p) {
152 //in case this was not done before:
153 gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
154 FILE* zf=fopen(p, "rb");
155 if (zf==NULL) {
156 GMessage("Error: cannot open compressed file '%s'!\n",p);
157 return NULL;
158 }
159 //check if the file is valid and read the length of the first record
160 //
161 char ztag[5];
162 ztag[4]='\0';
163 if (fread(ztag, 1, 4, zf)<4) {
164 GMessage("Error reading header of compressed file '%s'\n",p);
165 return NULL;
166 }
167 if (strcmp(ztag, "CDBZ")!=0) {
168 GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
169 return NULL;
170 }
171 unsigned int zrecsize;
172 if (fread((void*) &zrecsize,1,4,zf)<4) {
173 GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
174 return NULL;
175 }
176 zrecsize=gcvt_uint(&zrecsize);
177 return new GCdbZFasta(zf, zrecsize, recdelim);
178 }
179 #else
180 static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
181 #endif
182 //decompression stuff
183
184
inplace_Lower(char * c)185 void inplace_Lower(char* c) {
186 char *p=c;
187 while (*p!='\0') { *p=tolower(*p);p++; }
188 }
189
buf_get(GCDBuffer * b,uint32 & pos,char * buf,unsigned int len)190 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
191 int r;
192 while (len > 0) {
193 r = b->get(buf,len);
194 if (r == -1) GError(ERR_READ);
195 if (r == 0)
196 GError(ERR_READFMT);
197 pos += r;
198 buf += r;
199 len -= r;
200 }
201 }
202
buf_getnum(GCDBuffer * b,uint32 & pos,uint32 * num)203 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
204 char buf[4];
205 buf_get(b, pos, buf, 4);
206 uint32_unpack(buf,num);
207 }
208
209
read_dbinfo(int fd,char ** fnameptr,cdbInfo & dbstat)210 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
211 //this is messy due to the need of compatibility with the
212 //old 32bit file-length
213 char* dbname=*fnameptr;
214 //read just the tag first: 4 bytes ID
215 lseek(fd, -cdbInfoSIZE, SEEK_END);
216 int r=read(fd, &dbstat, cdbInfoSIZE );
217 if (r!=cdbInfoSIZE) return 2;
218 //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
219 if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
220 //old dbstat structure -- convert it
221 dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
222 dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
223 dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
224 dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
225 //position on the dbnamelen entry
226 dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
227 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
228 lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
229 }
230 else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
231 GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
232 return 1;
233 }
234 else { // new CDBX type:
235 dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
236 dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
237 dbstat.num_records=gcvt_uint(&dbstat.num_records);
238 dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
239 //position on the dbnamelen entry
240 dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
241 //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
242 lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
243 }
244
245 GMALLOC(dbname, dbstat.dbnamelen+1);
246 dbname[dbstat.dbnamelen]='\0';
247 r=read(fd, dbname, dbstat.dbnamelen);
248 *fnameptr=dbname;
249 if (r!=dbstat.dbnamelen)
250 return 2;
251 return 0;
252 }
253
parse_int(char * buf,const char * key,int & e)254 int parse_int(char* buf, const char* key, int& e) {
255 char* p, *q;
256 while (e!=EOF && isspace(e)) { //skip any spaces
257 if (e=='\n') GError(ERR_RANGEFMT, key);
258 e=fgetc(stdin);
259 }
260 if (e==EOF) GError(ERR_RANGEFMT, key);
261 //now e is the first non-space
262 p=buf;
263 q=p;
264 while (e!=EOF && !isspace(e)) {
265 *q=e;
266 q++;
267 e=fgetc(stdin);
268 }
269 *q='\0'; //now p is the starting coordinate string
270 return atoi(p);
271 //now the file pointer should be on the first space after the parsed value
272 }
273
parse_int(char * & f,const char * key,int & e)274 int parse_int(char*& f, const char* key, int& e) {
275 char* p, *q;
276 char buf[16];
277 while (e!='\0' && isspace(e)) { //skip any spaces
278 if (e=='\n') GError(ERR_RANGEFMT, key);
279 f++;
280 e=*f;
281 }
282 if (e=='\0') GError(ERR_RANGEFMT, key);
283 //now e is the first non-space char
284 p=buf;
285 q=p;
286 while (e!='\0' && !isspace(e)) {
287 *q=e;
288 q++;
289 f++;
290 e=*f;
291 }
292 *q='\0';
293 return atoi(p);
294 //now f and e should be on the first space after the parsed value (or '\0')
295 }
296
GCdbYank(const char * fidx,const char * recsep)297 GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
298 is_compressed=false;
299 fd=-1;
300 cdb=NULL;
301 warnings=0;
302 #ifdef ENABLE_COMPRESSION
303 cdbz=NULL;
304 #endif
305 fdb=-1;
306 fz=NULL;
307 dbname=NULL;
308 recdelim=Gstrdup(recsep);
309 if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
310 idxfile=Gstrdup(fidx);
311 cdb=new GCdbRead(idxfile);
312 fd=cdb->getfd();
313 db_size=0;
314 dbstat.dbsize=0;
315 info_dbname=NULL;
316 int r=read_dbinfo(fd, &info_dbname, dbstat);
317 lseek(fd, 0, SEEK_SET);
318 if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
319 else if (r==2)
320 GError("Error reading info chunk!\n");
321 /*try to find the database file
322 rules: if given, only the -d given filename is used
323 otherwise:
324 1) the same directory with the given index file(stripping the suffix)
325 2) the dbstat filepath/name stored by cdbfasta
326 */
327 if (dbname==NULL) {
328 char* p = rstrchr(idxfile, '.');
329 if (p!=NULL) {
330 /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
331 given, so the database file can be located in the same directory \n\
332 by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
333 int nlen=p-idxfile;
334 char* namebuf=NULL;
335 GMALLOC(namebuf, nlen+1);
336 strncpy(namebuf, idxfile, nlen);
337 namebuf[nlen]='\0';
338 if (fileExists(namebuf))
339 dbname=namebuf;
340
341 } // strip the index file extenstion
342 // 2) try the stored dbstat name
343 if (dbname==NULL) {
344 if (fileExists(info_dbname)) dbname=info_dbname;
345 else GError("Cannot locate the database file for this index\n");
346 }
347 }// database name not given
348 is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
349 if (is_compressed)
350 //try to open the dbname as a compressed file
351 fz=fopen(dbname, "rb");
352 else fdb=open(dbname, O_RDONLY|O_BINARY);
353 if (fdb==-1 && fz==NULL)
354 GError("Error: cannot open database file %s\n",dbname);
355 if (is_compressed) {
356 #ifndef ENABLE_COMPRESSION
357 GError(err_COMPRESSED);
358 #else
359 fclose(fz);//just to start fresh here
360 //determine size:
361 int ftmp = open(dbname, O_RDONLY|O_BINARY);
362 if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
363 struct stat fdbstat;
364 fstat(ftmp, &fdbstat);
365 db_size=fdbstat.st_size;
366 close(ftmp);
367 //-------- reopen here
368 cdbz=openCdbz(dbname);
369 if (cdbz==NULL)
370 GError("Error opening the cdbz file '%s'\n");
371 fz=cdbz->getZFile();
372 #endif
373 }
374 else {
375 struct stat fdbstat;
376 if (stat(dbname, &fdbstat)!=0) {
377 perror("stat()");
378 exit(1);
379 }
380 db_size=fdbstat.st_size;
381 }
382 //abort if the database size was read and it doesn't match the cdbfasta stored size
383 if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
384 GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
385 fdb, dbstat.dbsize, db_size, dbname);
386 fastahandler=new GFastaCharHandler(recdelim);
387 } //* GCdbYank constructor *//
388
~GCdbYank()389 GCdbYank::~GCdbYank() {
390 if (is_compressed) {
391 fclose(fz);
392 #ifdef ENABLE_COMPRESSION
393 delete cdbz;
394 #endif
395 }
396 else close(fdb);
397 GFREE(info_dbname);
398 delete fastahandler;
399 GFREE(recdelim);
400 GFREE(dbname);
401 GFREE(idxfile);
402 delete cdb;
403 close(fd);
404 }
405
406
getRecord(const char * key,FastaSeq & rec,charFunc * seqCallBack)407 int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
408 //assumes fdb is open, cdb was created on the index file
409 int r=cdb->find(key);
410 if (r==0) return 0;
411 if (r==-1)
412 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
413 /* while (r>0) { */
414 off_t pos = cdb->datapos(); //position of this key's record in the index file
415 unsigned int len=cdb->datalen(); // length of this key's record
416 char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
417 if (cdb->read(bytes,len,pos) == -1)
418 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
419
420 off_t fpos; //this will be the fastadb offset
421 uint32 reclen; //this will be the fasta record offset
422 if (len>irec_size32) { //64 bit file offset was used
423 fpos=gcvt_offt(bytes);
424 reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
425 }
426 else { //32bit offset used
427 fpos=gcvt_uint(bytes);
428 reclen=gcvt_uint(&bytes[sizeof(uint32)]);
429 }
430 //GMessage("reclen=%d\n", reclen);
431
432 /* if (showQuery)
433 fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
434 /*========= FETCHING RECORD CONTENT ======= */
435 if (is_compressed) {
436 //for now: ignore special retrievals, just print the whole record
437 #ifdef ENABLE_COMPRESSION
438 return cdbz->decompress(rec, reclen, fpos, seqCallBack);
439 #else
440 GError(err_COMPRESSED);
441 #endif
442 }
443 else { // not compressed -- position into the file and build an ad hoc GFastaFile
444 lseek(fdb, fpos, SEEK_SET);
445 // read it char by char and return it as output
446 char c='\0';
447 int charsread=0;
448 fastahandler->init(&rec, seqCallBack);
449 while (reclen-- && read(fdb, &c, 1)==1) {
450 fastahandler->processChar(c);
451 charsread++;
452 }
453 fastahandler->done();
454 return charsread;
455 } // not compressed
456 /* if (many) r=cdb->findnext(key, strlen(key));
457 else r=0;
458 } */
459 return 0;
460 }
461
getRecordPos(const char * key,uint32 * record_len)462 off_t GCdbYank::getRecordPos(const char* key, uint32* record_len) {
463 //assumes fdb is open, cdb was created on the index file
464 int r=cdb->find(key);
465 if (r==0 && warnings) {
466 GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
467 return -1;
468 }
469 if (r==-1)
470 GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
471 off_t pos = cdb->datapos(); //position of this key's record in the index file
472 unsigned int len=cdb->datalen(); // length of this key's record
473 char bytes[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
474 if (cdb->read(bytes,len,pos) == -1)
475 GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
476
477 off_t fpos; //this will be the fastadb offset
478 uint32 rlen; //this will be the fasta record length
479 if (len>irec_size32) { //64 bit file offset was used
480 fpos=gcvt_offt(bytes);
481 rlen=gcvt_uint(&bytes[offsetof(CIdxData, reclen)]);
482 }
483 else { //32bit offset used
484 fpos=gcvt_uint(bytes);
485 rlen=gcvt_uint(&bytes[offsetof(CIdxData32, reclen)]);
486 }
487 if (record_len!=NULL) *record_len=rlen;
488 return fpos;
489 }
490