1 #include "GCdbYank.h"
2 #include "GBase.h"
3 #include <ctype.h>
4 
5 #define ERR_READ "GCdbYank: error reading from file.\n"
6 #define ERR_READFMT "GCdbYank read error: incorrect file format.\n"
7 #define ERR_RANGEFMT "Sequence range parsing error for key '%s'\n"
8 #define ERR_RANGE_INVALID "Invalid range (%d-%d) specified for sequence '%s' of length %d\n"
9 // 1MB memory buffer:
10 #define MAX_MEM_RECSIZE 1048576
11 #ifndef O_BINARY
12  #define O_BINARY 0x0000
13 #endif
14 
15 //default size of the index record for records stored with 32bit offsets
16 uint32 irec_size32=8;
17 
18 #ifdef ENABLE_COMPRESSION
19 
GCdbZFasta(FILE * azf,int zrsize,char * r_delim)20 GCdbZFasta::GCdbZFasta(FILE* azf, int zrsize, char* r_delim) {
21  zrecsize=-1;
22  zpos=0;
23  recdelim=Gstrdup(r_delim);
24  zf=azf;
25  decomp_start(zrsize);
26  chrhandler=new GFastaCharHandler(recdelim);
27 }
28 
~GCdbZFasta()29 GCdbZFasta::~GCdbZFasta() {
30  //if (zf!=NULL && zf!=stdout && zf!=stdin) fclose(zf);
31  // FULL_FLUSH method instead of finish
32  delete chrhandler;
33  GFREE(recdelim);
34  decomp_end();
35 }
36 
decomp_start(int zrsize)37 void GCdbZFasta::decomp_start(int zrsize) {
38  zstream.zalloc = (alloc_func)0;
39  zstream.zfree = (free_func)0;
40  zstream.opaque = (voidpf)0;
41  zstream.next_in  = (Bytef*)sbuf;
42  zstream.avail_in = 0;
43  zstream.next_out = (Bytef*)lbuf;
44  int err = inflateInit(&zstream);
45  if (err!=Z_OK)
46      GMessage("Error at inflateInit()\n");
47  //-- now read and discard the first record, so we can use random access later
48  // (needed by zlib)
49  int bytes_read=fread(sbuf, 1, zrsize, zf);
50  if (bytes_read<zrsize)
51      GError("Error reading 1st record from zrec file\n");
52  zstream.next_in = (Bytef*)sbuf;
53  zstream.avail_in = bytes_read;
54 //decompress first chunk
55  zstream.next_out = (Bytef*)lbuf;
56  zstream.avail_out = GCDBZ_LBUF_LEN;
57  err = inflate(&zstream, Z_SYNC_FLUSH);
58  if (err !=Z_OK && err!=Z_STREAM_END)
59      GError("GCdbZFasta error: 1st record inflate failed! (err=%d)\n",err);
60 }
61 
decomp_end()62 void GCdbZFasta::decomp_end() {
63   int err = inflateEnd(&zstream);
64   if (err!=Z_OK)
65      GError("Error at inflateEnd() (err=%d)\n", err);
66 }
67 
68 
69 //fasta record decompress
70 //returns: the number of bytes decompressed
decompress(FastaSeq & rec,int csize,int zfofs,charFunc * seqCallBack)71 int GCdbZFasta::decompress(FastaSeq& rec, int csize, int zfofs, charFunc* seqCallBack) {
72  if (zfofs>=0) {
73     if (fseeko(zf, zfofs, 0))
74       GError("GCdbZFasta::decompress: error fseeko() to %d\n", zfofs);
75     }
76   else
77      if (feof(zf)) return 0;
78  bool in_rec=true;
79  int err=0;
80  int total_read=0;
81  int total_written=0;
82  chrhandler->init(&rec, seqCallBack);
83  while (in_rec) { // read loop
84      int to_read=0;
85      int bytes_read=0;
86      if (csize<=0) { //read one byte at a time
87         to_read=1;
88         int c;
89         if ((c =fgetc(zf))!=EOF) {
90            bytes_read = 1;
91            sbuf[0]=c;
92            }
93           else {
94             //bytes_read=0;
95             return 0; //eof
96             }
97         total_read+=bytes_read;
98         }
99       else {
100         to_read = csize-total_read>GCDBZ_SBUF_LEN ?
101                                  GCDBZ_SBUF_LEN : csize-total_read;
102        // check for csize vs bytes_read match:
103         if (to_read==0) return 0;
104         bytes_read=fread(sbuf, 1, to_read, zf);
105         if (bytes_read!=to_read)
106             GError("Error reading from zrec file\n");
107         total_read+=bytes_read;
108         in_rec=(total_read<csize);
109         }
110      if (bytes_read==0) {
111         //GMessage("bytes_read = 0\n");
112         return 0;
113         }
114      if (in_rec && bytes_read<to_read) in_rec=false;
115      zstream.next_in = (Bytef*)sbuf;
116      zstream.avail_in = bytes_read;
117 
118      do { //decompression loop
119         zstream.next_out = (Bytef*)lbuf;
120         zstream.avail_out = GCDBZ_LBUF_LEN;
121         uLong t_out=zstream.total_out;
122         err = inflate(&zstream, Z_SYNC_FLUSH);
123         uLong toWrite=zstream.total_out-t_out;
124         if (toWrite>0) {
125              /* if (fwrite(lbuf, 1, toWrite, outf)<toWrite) {
126                GError("Error writing inflated chunk!\n");
127                } */
128              for (unsigned int i=0;i<toWrite;i++)
129                chrhandler->processChar(lbuf[i]);
130 
131              total_written+=toWrite;
132              }
133         if (err==Z_STREAM_END) {
134               in_rec=false;
135               if (total_written==0) {
136                 GMessage("Z_STREAM_END found but total_written=0!\n");
137                 }
138               break;
139               }
140          else if (err !=Z_OK)
141                 GError("GCdbZFasta error: inflate failed! (err=%d)\n",err);
142         } while (zstream.avail_in!=0); //decompression loop
143    } //read loop
144   chrhandler->done();
145  /*if (err!=Z_STREAM_END) {
146    GError("decompress: Z_STREAM_END not found!\n");
147    }*/
148   return total_written;
149 }
150 
openCdbz(char * p)151 GCdbZFasta* GCdbYank::openCdbz(char* p) {
152     //in case this was not done before:
153     gcvt_uint=(endian_test())? &uint32_sun : &uint32_x86;
154     FILE* zf=fopen(p, "rb");
155     if (zf==NULL) {
156           GMessage("Error: cannot open compressed file '%s'!\n",p);
157           return NULL;
158           }
159     //check if the file is valid and read the length of the first record
160     //
161     char ztag[5];
162     ztag[4]='\0';
163     if (fread(ztag, 1, 4, zf)<4) {
164        GMessage("Error reading header of compressed file '%s'\n",p);
165        return NULL;
166        }
167     if (strcmp(ztag, "CDBZ")!=0) {
168        GMessage("Error: file '%s' doesn't appear to be a zlib compressed cdb?\n",p);
169        return NULL;
170        }
171     unsigned int zrecsize;
172     if (fread((void*) &zrecsize,1,4,zf)<4) {
173        GMessage("Error reading 1st compressed record size for file '%s'!\n",p);
174        return NULL;
175        }
176    zrecsize=gcvt_uint(&zrecsize);
177    return new GCdbZFasta(zf, zrecsize, recdelim);
178 }
179 #else
180 static const char err_COMPRESSED[]="Error: compression detected but not compiled in!\n";
181 #endif
182 //decompression stuff
183 
184 
inplace_Lower(char * c)185 void inplace_Lower(char* c) {
186  char *p=c;
187  while (*p!='\0') { *p=tolower(*p);p++; }
188 }
189 
buf_get(GCDBuffer * b,uint32 & pos,char * buf,unsigned int len)190 void buf_get(GCDBuffer* b, uint32& pos, char *buf, unsigned int len) {
191   int r;
192   while (len > 0) {
193     r = b->get(buf,len);
194     if (r == -1) GError(ERR_READ);
195     if (r == 0)
196        GError(ERR_READFMT);
197     pos += r;
198     buf += r;
199     len -= r;
200     }
201 }
202 
buf_getnum(GCDBuffer * b,uint32 & pos,uint32 * num)203 void buf_getnum(GCDBuffer* b, uint32& pos, uint32 *num) {
204   char buf[4];
205   buf_get(b, pos, buf, 4);
206   uint32_unpack(buf,num);
207 }
208 
209 
read_dbinfo(int fd,char ** fnameptr,cdbInfo & dbstat)210 int read_dbinfo(int fd, char** fnameptr, cdbInfo& dbstat) {
211 //this is messy due to the need of compatibility with the
212 //old 32bit file-length
213        char* dbname=*fnameptr;
214        //read just the tag first: 4 bytes ID
215        lseek(fd, -cdbInfoSIZE, SEEK_END);
216        int r=read(fd, &dbstat, cdbInfoSIZE );
217        if (r!=cdbInfoSIZE) return 2;
218        //GMessage("Size of dbstat=%d\n", cdbInfoSIZE);
219        if (strncmp(dbstat.oldtag, "CIDX", 4)==0) {
220             //old dbstat structure -- convert it
221             dbstat.num_keys=gcvt_uint(&dbstat.oldnum[0]);
222             dbstat.num_records=gcvt_uint(&dbstat.oldnum[1]);
223             dbstat.dbsize=gcvt_uint(&dbstat.old_dbsize);
224             dbstat.idxflags = gcvt_uint(&dbstat.old_idxflags);
225              //position on the dbnamelen entry
226             dbstat.dbnamelen = gcvt_uint(&dbstat.old_dbnamelen);
227             //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
228             lseek(fd, -(off_t)(cdbInfoSIZE-4+dbstat.dbnamelen), SEEK_END);
229             }
230           else if (strncmp(dbstat.tag, "CDBX", 4)!=0) {
231             GMessage("Error: this doesn't appear to be a cdbfasta created file!\n");
232             return 1;
233             }
234            else { // new CDBX type:
235             dbstat.dbsize = gcvt_offt(&dbstat.dbsize);
236             dbstat.num_keys=gcvt_uint(&dbstat.num_keys);
237             dbstat.num_records=gcvt_uint(&dbstat.num_records);
238             dbstat.idxflags = gcvt_uint(&dbstat.idxflags);
239             //position on the dbnamelen entry
240             dbstat.dbnamelen = gcvt_uint(&dbstat.dbnamelen);
241             //GMessage("dbnamelen=%d\n", dbstat.dbnamelen);
242             lseek(fd, -(off_t)(cdbInfoSIZE+dbstat.dbnamelen), SEEK_END);
243             }
244 
245        GMALLOC(dbname, dbstat.dbnamelen+1);
246        dbname[dbstat.dbnamelen]='\0';
247        r=read(fd, dbname, dbstat.dbnamelen);
248        *fnameptr=dbname;
249        if (r!=dbstat.dbnamelen)
250          return 2;
251        return 0;
252        }
253 
parse_int(char * buf,const char * key,int & e)254 int parse_int(char* buf, const char* key, int& e) {
255    char* p, *q;
256    while (e!=EOF && isspace(e)) { //skip any spaces
257      if (e=='\n') GError(ERR_RANGEFMT, key);
258      e=fgetc(stdin);
259      }
260    if (e==EOF) GError(ERR_RANGEFMT, key);
261    //now e is the first non-space
262    p=buf;
263    q=p;
264    while (e!=EOF && !isspace(e)) {
265     *q=e;
266     q++;
267     e=fgetc(stdin);
268     }
269    *q='\0'; //now p is the starting coordinate string
270    return atoi(p);
271    //now the file pointer should be on the first space after the parsed value
272 }
273 
parse_int(char * & f,const char * key,int & e)274 int parse_int(char*& f, const char* key, int& e) {
275    char* p, *q;
276    char buf[16];
277    while (e!='\0' && isspace(e)) { //skip any spaces
278      if (e=='\n') GError(ERR_RANGEFMT, key);
279      f++;
280      e=*f;
281      }
282    if (e=='\0') GError(ERR_RANGEFMT, key);
283    //now e is the first non-space char
284    p=buf;
285    q=p;
286    while (e!='\0' && !isspace(e)) {
287      *q=e;
288      q++;
289      f++;
290      e=*f;
291      }
292    *q='\0';
293    return atoi(p);
294    //now f and e should be on the first space after the parsed value (or '\0')
295 }
296 
GCdbYank(const char * fidx,const char * recsep)297 GCdbYank::GCdbYank(const char* fidx, const char* recsep) {
298  is_compressed=false;
299  fd=-1;
300  cdb=NULL;
301  warnings=0;
302 #ifdef ENABLE_COMPRESSION
303  cdbz=NULL;
304 #endif
305  fdb=-1;
306  fz=NULL;
307  dbname=NULL;
308  recdelim=Gstrdup(recsep);
309  if (fidx==NULL) GError("GCdbYank Error: NULL index file name!");
310  idxfile=Gstrdup(fidx);
311  cdb=new GCdbRead(idxfile);
312  fd=cdb->getfd();
313  db_size=0;
314  dbstat.dbsize=0;
315  info_dbname=NULL;
316  int r=read_dbinfo(fd, &info_dbname, dbstat);
317  lseek(fd, 0, SEEK_SET);
318  if (r==1) GError("This file does not seem to be a cdbfasta generated file.\n");
319           else if (r==2)
320                  GError("Error reading info chunk!\n");
321   /*try to find the database file
322      rules: if given, only the -d given filename is used
323        otherwise:
324         1) the same directory with the given index file(stripping the suffix)
325         2) the dbstat filepath/name stored by cdbfasta
326    */
327  if (dbname==NULL) {
328    char* p = rstrchr(idxfile, '.');
329    if (p!=NULL) {
330       /*GError("%s\ncdbyank error: cannot use %s as an index file. When no -d is\n\
331       given, so the database file can be located in the same directory \n\
332       by removing the index file suffix (.cidx)\n", USAGE, idxfile);*/
333       int nlen=p-idxfile;
334       char* namebuf=NULL;
335       GMALLOC(namebuf, nlen+1);
336       strncpy(namebuf, idxfile, nlen);
337       namebuf[nlen]='\0';
338       if (fileExists(namebuf))
339          dbname=namebuf;
340 
341       }  // strip the index file extenstion
342     // 2) try the stored dbstat name
343    if (dbname==NULL) {
344        if (fileExists(info_dbname)) dbname=info_dbname;
345          else GError("Cannot locate the database file for this index\n");
346       }
347   }// database name not given
348   is_compressed=(dbstat.idxflags & CDBMSK_OPT_COMPRESS);
349        if (is_compressed)
350             //try to open the dbname as a compressed file
351              fz=fopen(dbname, "rb");
352        else  fdb=open(dbname, O_RDONLY|O_BINARY);
353      if (fdb==-1 && fz==NULL)
354            GError("Error: cannot open database file %s\n",dbname);
355      if (is_compressed) {
356     #ifndef ENABLE_COMPRESSION
357         GError(err_COMPRESSED);
358     #else
359         fclose(fz);//just to start fresh here
360         //determine size:
361         int ftmp = open(dbname, O_RDONLY|O_BINARY);
362         if (ftmp == -1) GError("Error reopening db '%s'?\n",dbname);
363         struct stat fdbstat;
364         fstat(ftmp, &fdbstat);
365         db_size=fdbstat.st_size;
366         close(ftmp);
367         //-------- reopen here
368         cdbz=openCdbz(dbname);
369         if (cdbz==NULL)
370            GError("Error opening the cdbz file '%s'\n");
371         fz=cdbz->getZFile();
372      #endif
373         }
374        else {
375         struct stat fdbstat;
376         if (stat(dbname, &fdbstat)!=0) {
377           perror("stat()");
378           exit(1);
379           }
380         db_size=fdbstat.st_size;
381         }
382      //abort if the database size was read and it doesn't match the cdbfasta stored size
383      if (dbstat.dbsize>0 && dbstat.dbsize!=db_size)
384        GError("Error: invalid %d database size - (%lld vs %lld) please rerun cdbfasta for '%s'\n",
385           fdb, dbstat.dbsize, db_size, dbname);
386    fastahandler=new GFastaCharHandler(recdelim);
387 } //* GCdbYank constructor *//
388 
~GCdbYank()389 GCdbYank::~GCdbYank() {
390  if (is_compressed) {
391      fclose(fz);
392     #ifdef ENABLE_COMPRESSION
393      delete cdbz;
394     #endif
395      }
396      else close(fdb);
397  GFREE(info_dbname);
398  delete fastahandler;
399  GFREE(recdelim);
400  GFREE(dbname);
401  GFREE(idxfile);
402  delete cdb;
403  close(fd);
404 }
405 
406 
getRecord(const char * key,FastaSeq & rec,charFunc * seqCallBack)407 int GCdbYank::getRecord(const char* key, FastaSeq& rec, charFunc* seqCallBack) {
408 //assumes fdb is open, cdb was created on the index file
409  int r=cdb->find(key);
410  if (r==0) return 0;
411  if (r==-1)
412    GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
413  /* while (r>0) { */
414  off_t pos = cdb->datapos(); //position of this key's record in the index file
415  unsigned int len=cdb->datalen(); // length of this key's record
416  char bytes[32]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
417  if (cdb->read(bytes,len,pos) == -1)
418        GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
419 
420  off_t fpos; //this will be the fastadb offset
421  uint32 reclen;  //this will be the fasta record offset
422  if (len>irec_size32) { //64 bit file offset was used
423   fpos=gcvt_offt(bytes);
424   reclen=gcvt_uint(&bytes[sizeof(uint32)<<1]);
425   }
426   else { //32bit offset used
427   fpos=gcvt_uint(bytes);
428   reclen=gcvt_uint(&bytes[sizeof(uint32)]);
429   }
430   //GMessage("reclen=%d\n", reclen);
431 
432  /* if (showQuery)
433   fprintf(fout, "%c%s%c\t", delimQuery, key, delimQuery);*/
434   /*========= FETCHING RECORD CONTENT ======= */
435    if (is_compressed) {
436      //for now: ignore special retrievals, just print the whole record
437      #ifdef ENABLE_COMPRESSION
438       return cdbz->decompress(rec, reclen, fpos, seqCallBack);
439      #else
440        GError(err_COMPRESSED);
441      #endif
442      }
443     else { // not compressed -- position into the file and build an ad hoc GFastaFile
444      lseek(fdb, fpos, SEEK_SET);
445      // read it char by char and return it as output
446      char c='\0';
447      int charsread=0;
448      fastahandler->init(&rec, seqCallBack);
449      while (reclen-- && read(fdb, &c, 1)==1) {
450        fastahandler->processChar(c);
451        charsread++;
452        }
453      fastahandler->done();
454      return charsread;
455      } // not compressed
456   /* if (many) r=cdb->findnext(key, strlen(key));
457         else r=0;
458    } */
459  return 0;
460 }
461 
getRecordPos(const char * key,uint32 * record_len)462 off_t GCdbYank::getRecordPos(const char* key, uint32* record_len) {
463 //assumes fdb is open, cdb was created on the index file
464  int r=cdb->find(key);
465  if (r==0 && warnings) {
466    GMessage("cdbyank: key \"%s\" not found in %s\n", key, idxfile);
467    return -1;
468    }
469  if (r==-1)
470    GError("cdbyank: error searching for key %s in %s\n", key, idxfile);
471  off_t pos = cdb->datapos(); //position of this key's record in the index file
472  unsigned int len=cdb->datalen(); // length of this key's record
473  char bytes[64]; // data buffer -- should just accomodate fastarec_pos, fastarec_length
474  if (cdb->read(bytes,len,pos) == -1)
475        GError("cdbyank: error at GCbd::read (%s)!\n", idxfile);
476 
477  off_t fpos; //this will be the fastadb offset
478  uint32 rlen;  //this will be the fasta record length
479  if (len>irec_size32) { //64 bit file offset was used
480   fpos=gcvt_offt(bytes);
481   rlen=gcvt_uint(&bytes[offsetof(CIdxData, reclen)]);
482   }
483   else { //32bit offset used
484   fpos=gcvt_uint(bytes);
485   rlen=gcvt_uint(&bytes[offsetof(CIdxData32, reclen)]);
486   }
487   if (record_len!=NULL) *record_len=rlen;
488   return fpos;
489 }
490