1 #include "GFaSeqGet.h"
2 #include "gdna.h"
3 #include <ctype.h>
4 
setup(uint sstart,int slen,int sovl,int qfrom,int qto,uint maxseqlen)5 void GSubSeq::setup(uint sstart, int slen, int sovl, int qfrom, int qto, uint maxseqlen) {
6      if (sovl==0) {
7        GFREE(sq);
8        sqstart=sstart;
9        uint max_len=(maxseqlen>0) ? maxseqlen : MAX_FASUBSEQ;
10        sqlen = (slen==0 ? max_len : slen);
11        GMALLOC(sq, sqlen);
12        return;
13        }
14   //overlap -- copy the overlapping region
15   char* newsq=NULL;
16   GMALLOC(newsq, slen);
17   memcpy((void*)&newsq[qto], (void*)&sq[qfrom], sovl);
18   GFREE(sq);
19   sq=newsq;
20   sqstart=sstart;
21   sqlen=slen;
22 }
23 
finit(const char * fn,off_t fofs,bool validate)24 void GFaSeqGet::finit(const char* fn, off_t fofs, bool validate) {
25  fh=fopen(fn,"rb");
26  if (fh==NULL) {
27    GError("Error (GFaSeqGet) opening file '%s'\n",fn);
28    }
29  fname=Gstrdup(fn);
30  initialParse(fofs, validate);
31  lastsub=new GSubSeq();
32 }
33 
GFaSeqGet(const char * faname,uint seqlen,off_t fseqofs,int l_len,int l_blen)34 GFaSeqGet::GFaSeqGet(const char* faname, uint seqlen, off_t fseqofs, int l_len, int l_blen) {
35 //for GFastaIndex use mostly -- the important difference is that
36 //the file offset is to the sequence, not to the defline
37   fh=fopen(faname,"rb");
38   if (fh==NULL) {
39     GError("Error (GFaSeqGet) opening file '%s'\n",faname);
40     }
41   fname=Gstrdup(faname);
42   line_len=l_len;
43   line_blen=l_blen;
44   seq_len=seqlen;
45   if (line_blen<line_len)
46        GError("Error (GFaSeqGet): invalid line length info (len=%d, blen=%d)\n",
47               line_len, line_blen);
48   fseqstart=fseqofs;
49   lastsub=new GSubSeq();
50 }
51 
GFaSeqGet(FILE * f,off_t fofs,bool validate)52 GFaSeqGet::GFaSeqGet(FILE* f, off_t fofs, bool validate) {
53   fname=NULL;
54   fseqstart=0;
55   if (f==NULL) GError("Error (GFaSeqGet) : null file handle!\n");
56   seq_len=0;
57   fh=f;
58   initialParse(fofs, validate);
59   lastsub=new GSubSeq();
60 }
61 
initialParse(off_t fofs,bool checkall)62 void GFaSeqGet::initialParse(off_t fofs, bool checkall) {
63  static const char gfa_ERRPARSE[]="Error (GFaSeqGet): invalid FASTA file format.\n";
64  if (fofs!=0) { fseeko(fh,fofs,SEEK_SET); } //e.g. for offsets provided by cdbyank
65  //read the first two lines to determine fasta parameters
66  fseqstart=fofs;
67  int c=getc(fh);
68  fseqstart++;
69  if (c!='>') GError("Error (GFaSeqGet): not a fasta header?\n");
70  while ((c=getc(fh))!=EOF) {
71    fseqstart++;
72    if (c=='\n' || c=='\r') { break; } //end of defline
73    }
74 
75  if (c==EOF) GError(gfa_ERRPARSE);
76  line_len=0;
77  int lendlen=0;
78  while ((c=getc(fh))!=EOF) {
79   if (c=='\n' || c=='\r') { //end of line encountered
80      if (line_len>0) { //end of the first "sequence" line
81         lendlen++;
82         break;
83         }
84       else {// another EoL char at the end of defline
85         fseqstart++;
86         continue;
87         }
88      }// end-of-line characters
89   line_len++;
90   }
91  //we are at the end of first sequence line
92  while ((c=getc(fh))!=EOF) {
93    if (c=='\n' || c=='\r') lendlen++;
94       else {
95        ungetc(c,fh);
96        break;
97        }
98    }
99  line_blen=line_len+lendlen;
100  if (c==EOF) return;
101  // -- you don't need to check it all if you're sure it's safe
102  if (checkall) { //validate the rest of the FASTA record
103    int llen=0; //last line length
104    int elen=0; //length of last line ending
105    bool waseol=true;
106    while ((c=getc(fh))!=EOF) {
107      if (c=='>' && waseol) { ungetc(c,fh); break; }
108      if (c=='\n' ||  c=='\r') {
109         // eol char
110         elen++;
111         if (waseol) continue; //2nd eol char
112         waseol=true;
113         elen=1;
114         continue;
115         }
116      if (c<=32) GError(gfa_ERRPARSE); //invalid character encountered
117      //--- on a seq char here:
118      if (waseol) {//beginning of a seq line
119        if (elen && (llen!=line_len || elen!=lendlen))
120            //GError(gfa_ERRPARSE);
121          GError("Error: invalid FASTA format for GFaSeqGet; make sure that\n\
122   the sequence lines have the same length (except for the last line)");
123        waseol=false;
124        llen=0;
125        elen=0;
126        }
127      llen++;
128      } //while reading chars
129    }// FASTA checking was requested
130  fseeko(fh,fseqstart,SEEK_SET);
131 }
132 
subseq(uint cstart,int & clen)133 const char* GFaSeqGet::subseq(uint cstart, int& clen) {
134   //cstart is 1-based genomic coordinate within current fasta sequence
135    int maxlen=(seq_len>0)?seq_len : MAX_FASUBSEQ;
136    //GMessage("--> call: subseq(%u, %d)\n", cstart, clen);
137   if (clen>maxlen) {
138     GMessage("Error (GFaSeqGet): subsequence cannot be larger than %d\n", maxlen);
139     return NULL;
140     }
141   if (seq_len>0 && clen+cstart-1>seq_len) {
142      GMessage("Error (GFaSeqGet): end coordinate (%d) cannot be larger than sequence length %d\n", clen+cstart-1, seq_len);
143      }
144   if (lastsub->sq==NULL || lastsub->sqlen==0) {
145     lastsub->setup(cstart, clen, 0,0,0,seq_len);
146     loadsubseq(cstart, clen);
147     lastsub->sqlen=clen;
148     return (const char*)lastsub->sq;
149     }
150   //allow extension up to MAX_FASUBSEQ
151   uint bstart=lastsub->sqstart;
152   uint bend=lastsub->sqstart+lastsub->sqlen-1;
153   uint cend=cstart+clen-1;
154   int qlen=0; //only the extra len to be allocated/appended/prepended
155   uint qstart=cstart; //start coordinate of the new seq block of length qlen to be read from file
156   int newlen=0; //the new total length of the buffered sequence lastsub->sq
157   int kovl=0;
158   int czfrom=0;//0-based offsets for copying a previously read sequence chunk
159   int czto=0;
160   uint newstart=cstart;
161   if (cstart>=bstart && cend<=bend) { //new reg contained within existing buffer
162      return (const char*) &(lastsub->sq[cstart-bstart]) ;
163     }
164   //extend downward
165   uint newend=GMAX(cend, bend);
166   if (cstart<bstart) { //requested start < old buffer start
167     newstart=cstart;
168     newlen=(newend-newstart+1);
169     if (newlen>MAX_FASUBSEQ) {
170        newlen=MAX_FASUBSEQ;
171        newend=cstart+newlen-1; //keep newstart, set newend
172        }
173     qlen=bstart-cstart;
174     if (newend>bstart) { //overlap
175        if (newend>bend) {// new region is larger & around the old one - so we have two regions to update
176          kovl=bend-bstart+1;
177          czfrom=0;
178          czto=bstart-cstart;
179          lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc and copy the kovl subseq
180          qlen=bstart-cstart;
181          loadsubseq(newstart, qlen);
182          qlen=newend-bend;
183          int toread=qlen;
184          loadsubseq(bend+1, qlen);
185          clen-=(toread-qlen);
186          lastsub->sqlen=clen;
187          return (const char*)lastsub->sq;
188          }
189         //newend<=bend
190        kovl=newend-bstart+1;
191        }
192      else { //no overlap with previous buffer
193        if (newend>bend) kovl=bend-bstart+1;
194                    else kovl=newend-bstart+1;
195        }
196      qlen=bstart-cstart;
197      czfrom=0;
198      czto=qlen;
199     } //cstart<bstart
200    else { //cstart>=bstart, possibly extend upwards
201     newstart=bstart;
202     newlen=(newend-newstart+1);
203     if (newlen>MAX_FASUBSEQ) {
204        newstart=bstart+(newlen-MAX_FASUBSEQ);//keep newend, assign newstart
205        newlen=MAX_FASUBSEQ;
206        if (newstart<=bend) { //overlap with old buffer
207           kovl=bend-newstart+1;
208           czfrom=newstart-bstart;
209           czto=0;
210           }
211        else { //not overlapping old buffer
212          kovl=0;
213          }
214        } //newstart reassigned
215     else { //we can extend the buffer to include the old one
216       qlen=newend-bend; //how much to read from file
217       qstart=bend+1;
218       kovl=bend-bstart+1;
219       czfrom=0;
220       czto=0;
221       }
222     }
223   lastsub->setup(newstart, newlen, kovl, czfrom, czto, seq_len); //this should realloc but copy any overlapping region
224   lastsub->sqlen-=qlen; //appending may result in a premature eof
225   int toread=qlen;
226   loadsubseq(qstart, qlen); //read the missing chunk, if any
227   clen-=(toread-qlen);
228   lastsub->sqlen+=qlen;
229   return (const char*)(lastsub->sq+(cstart-newstart));
230 }
231 
copyRange(uint cstart,uint cend,bool revCmpl,bool upCase)232 char* GFaSeqGet::copyRange(uint cstart, uint cend, bool revCmpl, bool upCase) {
233   if (cstart>cend) { Gswap(cstart, cend); }
234   int clen=cend-cstart+1;
235   const char* gs=subseq(cstart, clen);
236   if (gs==NULL) return NULL;
237   char* r=NULL;
238   GMALLOC(r,clen+1);
239   r[clen]=0;
240   memcpy((void*)r,(void*)gs, clen);
241   if (revCmpl) reverseComplement(r,clen);
242   if (upCase) {
243        for (int i=0;i<clen;i++)
244             r[i]=toupper(r[i]);
245        }
246   return r;
247  }
248 
loadsubseq(uint cstart,int & clen)249 const char* GFaSeqGet::loadsubseq(uint cstart, int& clen) {
250   //assumes enough lastsub->sq space allocated previously
251   //only loads the requested clen chars from file, at offset &lastsub->sq[cstart-lastsub->sqstart]
252   int sofs=cstart-lastsub->sqstart;
253   int lendlen=line_blen-line_len;
254   char* seqp=lastsub->sq+sofs;
255   //find the proper file offset and read the appropriate lines
256   uint seqofs=cstart-1;
257   uint startlno = seqofs/line_len;
258   int lineofs = seqofs % line_len;
259   off_t fstart=fseqstart + (startlno*line_blen);
260   fstart+=lineofs;
261 
262   fseeko(fh, fstart, SEEK_SET);
263   int toread=clen;
264   int maxlen=(seq_len>0)? seq_len-cstart+1 : MAX_FASUBSEQ ;
265   if (toread==0) toread=maxlen; //read max allowed, or to the end of file
266   int actualrlen=0;
267   int sublen=0;
268   if (lineofs>0) { //read the partial first line
269     int reqrlen=line_len-lineofs;
270     if (reqrlen>toread) reqrlen=toread; //in case we need to read just a few chars
271     actualrlen=fread((void*)seqp, 1, reqrlen, fh);
272     if (actualrlen<reqrlen) { //eof reached prematurely
273       while (seqp[actualrlen-1]=='\n' || seqp[actualrlen-1]=='\r') actualrlen--;
274       //check for new sequences in between
275       clen=actualrlen;
276       sublen+=actualrlen;
277       return (const char*)seqp;
278       }
279     toread-=reqrlen;
280     sublen+=reqrlen;
281     fseeko(fh, lendlen, SEEK_CUR);
282     }
283   //read the rest of the lines
284   while (toread>=line_len) {
285     char* rseqp=&(seqp[sublen]);
286     actualrlen=fread((void*)rseqp, 1, line_len, fh);
287     /*
288     char dbuf[256];dbuf[255]=0;
289     strncpy(dbuf,rseqp, actualrlen);
290     dbuf[actualrlen]=0;
291     GMessage("<<<read line: %s\n",dbuf);
292     */
293     if (actualrlen<line_len) {
294       while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r') actualrlen--;
295       sublen+=actualrlen;
296       clen=sublen;
297       return (const char*)seqp;
298       }
299     toread-=actualrlen;
300     sublen+=actualrlen;
301     fseeko(fh, lendlen, SEEK_CUR);
302     }
303   // read the last partial line, if any
304   if (toread>0) {
305     char* rseqp=&(seqp[sublen]);
306     actualrlen=fread((void*)rseqp, 1, toread, fh);
307     if (actualrlen<toread) {
308       while (rseqp[actualrlen-1]=='\n' || rseqp[actualrlen-1]=='\r')
309           actualrlen--;
310       }
311     sublen+=actualrlen;
312     }
313   //lastsub->sqlen+=sublen;
314   clen=sublen;
315 
316   return (const char*)seqp;
317   }
318 
319 
320