1 /* @source dbxflat application
2 **
** Index flat file format sequence databases
4 **
5 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 
25 
26 static AjPRegexp dbxflat_wrdexp = NULL;
27 
28 static AjPStr dbxflatRdline = NULL;
29 static AjPStr dbxflatSumline = NULL;
30 
31 static AjPStr swissAccstr = NULL;
32 
33 static AjBool dbxflat_ParseFastq(EmbPBtreeEntry entry, AjPFile inf);
34 static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf);
35 static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf);
36 static AjBool dbxflat_ParseSwiss(EmbPBtreeEntry entry, AjPFile inf);
37 static AjBool dbxflat_ParseIguspto(EmbPBtreeEntry entry, AjPFile inf);
38 
39 static AjBool dbxflat_NextEntry(EmbPBtreeEntry entry, AjPFile inf);
40 
41 int global = 0;
42 
43 EmbPBtreeField accfield = NULL;
44 EmbPBtreeField svfield = NULL;
45 EmbPBtreeField orgfield = NULL;
46 EmbPBtreeField desfield = NULL;
47 EmbPBtreeField keyfield = NULL;
48 
49 ajuint idtot = 0;
50 ajuint acctot = 0;
51 ajuint svtot = 0;
52 ajuint orgtot = 0;
53 ajuint destot = 0;
54 ajuint keytot = 0;
55 
56 
57 /* @datastatic DbxflatPParser *************************************************
58 **
59 ** Parser definition structure
60 **
61 ** @alias DbxflatSParser
62 ** @alias DbxflatOParser
63 **
64 ** @attr Name [const char*] Parser name
65 ** @attr Parser [AjBool function] Parser function
66 ** @@
67 ******************************************************************************/
68 
69 typedef struct DbxflatSParser
70 {
71     const char* Name;
72     AjBool (*Parser) (EmbPBtreeEntry entry, AjPFile inf);
73 } DbxflatOParser;
74 #define DbxflatPParser DbxflatOParser*
75 
76 
77 
78 
79 static DbxflatOParser parser[] =
80 {
81     {"EMBL",   dbxflat_ParseEmbl},
82     {"SWISS",  dbxflat_ParseSwiss},
83     {"GB",     dbxflat_ParseGenbank},
84     {"REFSEQ", dbxflat_ParseGenbank},
85     {"FASTQ",  dbxflat_ParseFastq},
86     {"USPTO",  dbxflat_ParseIguspto},
87     {NULL,     NULL}
88 };
89 
90 
91 
92 
93 
94 /* @prog dbxflat **************************************************************
95 **
96 ** Index a flat file database
97 **
98 ******************************************************************************/
99 
main(int argc,char ** argv)100 int main(int argc, char **argv)
101 {
102     EmbPBtreeEntry entry = NULL;
103 
104     AjPStr dbname   = NULL;
105     AjPStr dbrs     = NULL;
106     AjPStr release  = NULL;
107     AjPStr datestr  = NULL;
108     AjBool statistics;
109     AjBool compressed;
110 
111     AjPStr directory;
112     AjPStr indexdir;
113     AjPStr filename;
114     AjPStr exclude;
115     AjPStr dbtype = NULL;
116     AjPFile outf = NULL;
117 
118     AjPStr *fieldarray = NULL;
119 
120     ajint nfields;
121     ajint nfiles;
122 
123     AjPStr tmpstr = NULL;
124     AjPStr thysfile = NULL;
125 
126     ajint i;
127     AjPFile inf = NULL;
128 
129     ajulong nentries = 0UL;
130     ajulong ientries = 0UL;
131     AjPTime starttime = NULL;
132     AjPTime begintime = NULL;
133     AjPTime nowtime = NULL;
134     ajlong startclock = 0UL;
135     ajlong beginclock = 0UL;
136     ajlong nowclock = 0UL;
137 
138     ajulong idpricache=0L, idpriread = 0L, idpriwrite = 0L, idprisize= 0L;
139     ajulong idseccache=0L, idsecread = 0L, idsecwrite = 0L, idsecsize= 0L;
140     ajulong acpricache=0L, acpriread = 0L, acpriwrite = 0L, acprisize= 0L;
141     ajulong acseccache=0L, acsecread = 0L, acsecwrite = 0L, acsecsize= 0L;
142     ajulong svpricache=0L, svpriread = 0L, svpriwrite = 0L, svprisize= 0L;
143     ajulong svseccache=0L, svsecread = 0L, svsecwrite = 0L, svsecsize= 0L;
144     ajulong kwpricache=0L, kwpriread = 0L, kwpriwrite = 0L, kwprisize= 0L;
145     ajulong kwseccache=0L, kwsecread = 0L, kwsecwrite = 0L, kwsecsize= 0L;
146     ajulong depricache=0L, depriread = 0L, depriwrite = 0L, deprisize= 0L;
147     ajulong deseccache=0L, desecread = 0L, desecwrite = 0L, desecsize= 0L;
148     ajulong txpricache=0L, txpriread = 0L, txpriwrite = 0L, txprisize= 0L;
149     ajulong txseccache=0L, txsecread = 0L, txsecwrite = 0L, txsecsize= 0L;
150 
151     ajulong splitrootid =0l, splitrootnum=0L;
152     ajulong splitrootkey=0L, splitrootsec=0L;
153     ajulong splitleafid =0L, splitleafnum=0L;
154     ajulong splitleafkey=0L, splitleafsec=0L;
155     ajulong reorderid   =0L, reordernum  =0L;
156     ajulong reorderkey  =0L, reordersec  =0L;
157 
158     double tdiff = 0.0;
159     ajint days = 0;
160     ajint hours = 0;
161     ajint mins = 0;
162 
163     embInit("dbxflat", argc, argv);
164 
165     dbtype     = ajAcdGetListSingle("idformat");
166     fieldarray = ajAcdGetList("fields");
167     directory  = ajAcdGetDirectoryName("directory");
168     outf       = ajAcdGetOutfile("outfile");
169     indexdir   = ajAcdGetOutdirName("indexoutdir");
170     filename   = ajAcdGetString("filenames");
171     exclude    = ajAcdGetString("exclude");
172     dbname     = ajAcdGetString("dbname");
173     dbrs       = ajAcdGetString("dbresource");
174     release    = ajAcdGetString("release");
175     datestr    = ajAcdGetString("date");
176     statistics = ajAcdGetBoolean("statistics");
177     compressed = ajAcdGetBoolean("compressed");
178 
179     entry = embBtreeEntryNew(0);
180     if(compressed)
181         embBtreeEntrySetCompressed(entry);
182 
183     tmpstr = ajStrNew();
184 
185     nfields = embBtreeSetFields(entry,fieldarray);
186     embBtreeSetDbInfo(entry,dbname,dbrs,datestr,release,dbtype,directory,
187 		      indexdir);
188 
189     for(i=0; i< nfields; i++)
190     {
191         if(ajStrMatchC(fieldarray[i], "acc"))
192         {
193             accfield = embBtreeGetFieldS(entry, fieldarray[i]);
194             if(compressed)
195                 embBtreeFieldSetCompressed(accfield);
196         }
197         else if(ajStrMatchC(fieldarray[i], "sv"))
198         {
199             svfield = embBtreeGetFieldS(entry, fieldarray[i]);
200             if(compressed)
201                 embBtreeFieldSetCompressed(svfield);
202         }
203         else if(ajStrMatchC(fieldarray[i], "des"))
204         {
205             desfield = embBtreeGetFieldS(entry, fieldarray[i]);
206             if(compressed)
207                 embBtreeFieldSetCompressed(desfield);
208         }
209         else if(ajStrMatchC(fieldarray[i], "key"))
210         {
211             keyfield = embBtreeGetFieldS(entry, fieldarray[i]);
212             if(compressed)
213                 embBtreeFieldSetCompressed(keyfield);
214         }
215         else if(ajStrMatchC(fieldarray[i], "org"))
216         {
217             orgfield = embBtreeGetFieldS(entry, fieldarray[i]);
218             if(compressed)
219                 embBtreeFieldSetCompressed(orgfield);
220         }
221         else if(!ajStrMatchC(fieldarray[i], "id"))
222             ajErr("Unknown field '%S' specified for indexing", fieldarray[i]);
223     }
224 
225     embBtreeGetRsInfo(entry);
226 
227     nfiles = embBtreeGetFiles(entry,directory,filename,exclude);
228     if(!nfiles)
229         ajDie("No input files in '%S' matched filename '%S'",
230               directory, filename);
231 
232     embBtreeWriteEntryFile(entry);
233 
234     embBtreeOpenCaches(entry);
235 
236     starttime = ajTimeNewToday();
237 
238     ajFmtPrintF(outf, "Processing directory: %S\n", directory);
239 
240     for(i=0;i<nfiles;++i)
241     {
242         begintime = ajTimeNewToday();
243         beginclock = ajClockNow();
244 
245 	ajListPop(entry->files,(void **)&thysfile);
246 	ajListPushAppend(entry->files,(void *)thysfile);
247 	ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
248 	if(!(inf=ajFileNewInNameS(tmpstr)))
249 	    ajFatal("Cannot open input file %S\n",tmpstr);
250 	ajFilenameTrimPath(&tmpstr);
251 	ajFmtPrintF(outf,"Processing file: %S\n",tmpstr);
252 
253 	ientries = 0L;
254 
255 	while(dbxflat_NextEntry(entry,inf))
256 	{
257 	    ++ientries;
258 
259 	    if(entry->do_id)
260 	    {
261 		embBtreeIndexEntry(entry, i);
262                 ++idtot;
263 	    }
264 
265 	    if(accfield)
266 	    {
267                 acctot += embBtreeIndexPrimary(accfield, entry, i);
268 	    }
269 
270 	    if(svfield)
271 	    {
272                 svtot += embBtreeIndexPrimary(svfield, entry, i);
273 	    }
274 
275 	    if(keyfield)
276 	    {
277                 keytot += embBtreeIndexSecondary(keyfield, entry);
278 	    }
279 
280 	    if(desfield)
281 	    {
282                 destot += embBtreeIndexSecondary(desfield, entry);
283 	    }
284 
285 	    if(orgfield)
286 	    {
287                 orgtot += embBtreeIndexSecondary(orgfield, entry);
288             }
289 	}
290 
291 	ajFileClose(&inf);
292 	nentries += ientries;
293 	nowtime = ajTimeNewToday();
294         nowclock = ajClockNow();
295 	ajFmtPrintF(outf, "entries: %Lu (%Lu) time: %.1f/%.1fs (%.1f/%.1fs)\n",
296 		    nentries, ientries,
297 		    ajClockDiff(startclock,nowclock),
298                     ajTimeDiff(starttime, nowtime),
299 		    ajClockDiff(beginclock,nowclock),
300                     ajTimeDiff(begintime, nowtime));
301 
302         if(statistics)
303         {
304             ajBtreeStatsOut(outf,
305                             &splitrootid, &splitrootnum,
306                             &splitrootkey, &splitrootsec,
307                             &splitleafid, &splitleafnum,
308                             &splitleafkey, &splitleafsec,
309                             &reorderid, &reordernum,
310                             &reorderkey, &reordersec);
311 
312             if(entry->do_id)
313                 ajBtreeCacheStatsOut(outf, entry->idcache,
314                                      &idpricache, &idseccache,
315                                      &idpriread, &idsecread,
316                                      &idpriwrite, &idsecwrite,
317                                      &idprisize, &idsecsize);
318             if(accfield)
319                 ajBtreeCacheStatsOut(outf, accfield->cache,
320                                      &acpricache, &acseccache,
321                                      &acpriread,  &acsecread,
322                                      &acpriwrite, &acsecwrite,
323                                      &acprisize, &acsecsize);
324             if(svfield)
325                 ajBtreeCacheStatsOut(outf, svfield->cache,
326                                      &svpricache, &svseccache,
327                                      &svpriread, &svsecread,
328                                      &svpriwrite, &svsecwrite,
329                                      &svprisize, &svsecsize);
330             if(keyfield)
331                 ajBtreeCacheStatsOut(outf, keyfield->cache,
332                                      &kwpricache, &kwseccache,
333                                      &kwpriread, &kwsecread,
334                                      &kwpriwrite, &kwsecwrite,
335                                      &kwprisize, &kwsecsize);
336             if(desfield)
337                 ajBtreeCacheStatsOut(outf, desfield->cache,
338                                      &depricache, &deseccache,
339                                      &depriread, &desecread,
340                                      &depriwrite, &desecwrite,
341                                      &deprisize, &desecsize);
342             if(orgfield)
343                 ajBtreeCacheStatsOut(outf, orgfield->cache,
344                                      &txpricache, &txseccache,
345                                      &txpriread, &txsecread,
346                                      &txpriwrite, &txsecwrite,
347                                      &txprisize, &txsecsize);
348         }
349 
350 	ajTimeDel(&begintime);
351 	ajTimeDel(&nowtime);
352     }
353 
354 
355 
356     embBtreeDumpParameters(entry);
357     embBtreeCloseCaches(entry);
358 
359     nowtime = ajTimeNewToday();
360     tdiff = ajTimeDiff(starttime, nowtime);
361     days = (ajint) (tdiff/(24.0*3600.0));
362     tdiff -= (24.0*3600.0)*(double)days;
363     hours = (ajint) (tdiff/3600.0);
364     tdiff -= 3600.0*(double)hours;
365     mins = (ajint) (tdiff/60.0);
366     tdiff -= 60.0 * (double) mins;
367     if(days)
368         ajFmtPrintF(outf, "Total time: %d %02d:%02d:%04.1f\n",
369                     days, hours, mins, tdiff);
370     else if (hours)
371         ajFmtPrintF(outf, "Total time: %d:%02d:%04.1f\n",
372                     hours, mins, tdiff);
373     else
374         ajFmtPrintF(outf, "Total time: %d:%04.1f\n",
375                     mins, tdiff);
376 
377     ajTimeDel(&nowtime);
378     ajTimeDel(&starttime);
379 
380     embBtreeReportEntry(outf, entry);
381 
382     if(accfield)
383         embBtreeReportField(outf, accfield);
384     if(svfield)
385         embBtreeReportField(outf, svfield);
386     if(orgfield)
387         embBtreeReportField(outf, orgfield);
388     if(desfield)
389         embBtreeReportField(outf, desfield);
390     if(keyfield)
391         embBtreeReportField(outf, keyfield);
392 
393     ajFileClose(&outf);
394     embBtreeEntryDel(&entry);
395     ajStrDel(&tmpstr);
396     ajStrDel(&filename);
397     ajStrDel(&exclude);
398     ajStrDel(&dbname);
399     ajStrDel(&dbrs);
400     ajStrDel(&release);
401     ajStrDel(&datestr);
402     ajStrDel(&directory);
403     ajStrDel(&indexdir);
404     ajStrDel(&dbtype);
405 
406     nfields = 0;
407     while(fieldarray[nfields])
408 	ajStrDel(&fieldarray[nfields++]);
409     AJFREE(fieldarray);
410 
411     ajRegFree(&dbxflat_wrdexp);
412     ajStrDel(&dbxflatRdline);
413     ajStrDel(&dbxflatSumline);
414     ajStrDel(&swissAccstr);
415 
416     embExit();
417 
418     return 0;
419 }
420 
421 
422 
423 
424 /* @funcstatic dbxflat_ParseEmbl **********************************************
425 **
426 ** Parse the ID, accession from an EMBL entry.
427 **
428 ** Reads to the end of the entry and then returns.
429 **
430 ** @param [w] entry [EmbPBtreeEntry] entry
431 ** @param [u] inf [AjPFile] Input file
432 **
433 ** @return [AjBool] ajTrue on success.
434 ** @@
435 ******************************************************************************/
436 
dbxflat_ParseEmbl(EmbPBtreeEntry entry,AjPFile inf)437 static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf)
438 {
439     ajlong pos  = 0L;
440 
441     ajStrAssignC(&dbxflatRdline, "");
442 
443     while(!ajStrPrefixC(dbxflatRdline, "//"))
444     {
445 	pos = ajFileResetPos(inf);
446 
447 	if(!ajReadlineTrim(inf,&dbxflatRdline))
448 	{
449 	    ajStrDel(&dbxflatRdline);
450 	    return ajFalse;
451 	}
452 
453 	if(ajStrPrefixC(dbxflatRdline,"ID"))
454 	{
455 	    entry->fpos = pos;
456 	    ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
457 	    ajStrTrimEndC(&entry->id, ";");
458 	    if(svfield)
459 		embBtreeParseEmblSv(dbxflatRdline,svfield);
460 	}
461 
462 
463 	if(svfield)
464 	    if(ajStrPrefixC(dbxflatRdline,"SV") ||
465 	       ajStrPrefixC(dbxflatRdline,"IV"))  /* emblcds database format */
466 		embBtreeParseEmblAc(dbxflatRdline,svfield);
467 
468 	if(accfield)
469 	    if(ajStrPrefixC(dbxflatRdline,"AC") ||
470 	       ajStrPrefixC(dbxflatRdline,"PA"))  /* emblcds database format */
471 		embBtreeParseEmblAc(dbxflatRdline,accfield);
472 
473 	if(keyfield)
474 	    if(ajStrPrefixC(dbxflatRdline,"KW"))
475 		embBtreeParseEmblKw(dbxflatRdline,keyfield);
476 
477 	if(desfield)
478 	    if(ajStrPrefixC(dbxflatRdline,"DE"))
479 		embBtreeParseEmblDe(dbxflatRdline,desfield);
480 
481 	if(orgfield)
482 	    if(ajStrPrefixC(dbxflatRdline,"OC") ||
483                ajStrPrefixC(dbxflatRdline,"OS"))
484 		embBtreeParseEmblTx(dbxflatRdline,orgfield);
485     }
486 
487     return ajTrue;
488 }
489 
490 
491 
492 
493 /* @funcstatic dbxflat_ParseGenbank *******************************************
494 **
495 ** Parse the ID, accession from a Genbank entry
496 **
497 ** @param [w] entry [EmbPBtreeEntry] entry
498 ** @param [u] inf [AjPFile] Input file
499 **
500 ** @return [AjBool] ajTrue on success.
501 ** @@
502 ******************************************************************************/
503 
dbxflat_ParseGenbank(EmbPBtreeEntry entry,AjPFile inf)504 static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf)
505 {
506     ajlong pos  = 0L;
507     AjBool ret = ajTrue;
508 
509     ajStrAssignC(&dbxflatRdline, "");
510     ajStrAssignC(&dbxflatSumline, "");
511 
512     while(!ajStrPrefixC(dbxflatRdline,"//") && ret)
513     {
514 	if(ajStrPrefixC(dbxflatRdline,"LOCUS"))
515 	{
516 	    entry->fpos = pos;
517 	    ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
518 	}
519 
520 	if(svfield)
521 	    if(ajStrPrefixC(dbxflatRdline,"VERSION"))
522 		embBtreeParseGenbankAc(dbxflatRdline,svfield);
523 
524 	if(accfield)
525 	    if(ajStrPrefixC(dbxflatRdline,"ACCESSION"))
526 		embBtreeParseGenbankAc(dbxflatRdline,accfield);
527 
528 	if(keyfield)
529 	    if(ajStrPrefixC(dbxflatRdline,"KEYWORDS"))
530 	    {
531 		ajStrAssignS(&dbxflatSumline,dbxflatRdline);
532 		ret = ajReadlineTrim(inf,&dbxflatRdline);
533                 while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
534 		{
535 		    ajStrAppendS(&dbxflatSumline,dbxflatRdline);
536 		    ret = ajReadlineTrim(inf,&dbxflatRdline);
537 		}
538 		ajStrRemoveWhiteExcess(&dbxflatSumline);
539 		embBtreeParseGenbankKw(dbxflatSumline,keyfield);
540 		continue;
541 	    }
542 
543 	if(desfield)
544 	    if(ajStrPrefixC(dbxflatRdline,"DEFINITION"))
545 	    {
546 		ajStrAssignS(&dbxflatSumline,dbxflatRdline);
547 		ret = ajReadlineTrim(inf,&dbxflatRdline);
548 		while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
549 		{
550 		    ajStrAppendS(&dbxflatSumline,dbxflatRdline);
551 		    ret = ajReadlineTrim(inf,&dbxflatRdline);
552 		}
553 		ajStrRemoveWhiteExcess(&dbxflatSumline);
554 		embBtreeParseGenbankDe(dbxflatSumline,desfield);
555 		continue;
556 	    }
557 
558 
559 	if(orgfield)
560 	    if(ajStrPrefixC(dbxflatRdline,"SOURCE"))
561 	    {
562 		ajStrAssignC(&dbxflatSumline,"");
563 		ret = ajReadlineTrim(inf,&dbxflatRdline);
564                 ajStrAppendC(&dbxflatRdline, ";");
565 		while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
566 		{
567 		    ajStrAppendS(&dbxflatSumline,dbxflatRdline);
568 		    ret = ajReadlineTrim(inf,&dbxflatRdline);
569 		}
570 		ajStrRemoveWhiteExcess(&dbxflatSumline);
571 		embBtreeParseGenbankTx(dbxflatSumline,orgfield);
572 		continue;
573 	    }
574 
575 
576 	pos = ajFileResetPos(inf);
577 
578 	if(!ajReadlineTrim(inf,&dbxflatRdline))
579 	    ret = ajFalse;
580     }
581 
582     return ret;
583 }
584 
585 
586 
587 
588 /* @funcstatic dbxflat_ParseFastq *********************************************
589 **
590 ** Parse the ID, accession from a FASTQ format sequence entry.
591 **
592 ** Reads to the end of the entry and then returns.
593 **
594 ** @param [w] entry [EmbPBtreeEntry] entry
595 ** @param [u] inf [AjPFile] Input file
596 **
597 ** @return [AjBool] ajTrue on success.
598 ** @@
599 ******************************************************************************/
600 
dbxflat_ParseFastq(EmbPBtreeEntry entry,AjPFile inf)601 static AjBool dbxflat_ParseFastq(EmbPBtreeEntry entry, AjPFile inf)
602 {
603     ajlong pos  = 0L;
604     ajuint seqlen = 0;
605     ajuint qlen = 0;
606     AjPStr tmpfd  = NULL;
607     AjPStr str = NULL;
608     AjPStr de = NULL;
609     AjBool ok;
610 
611     if(!dbxflat_wrdexp)
612 	dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9.:=]+)");
613 
614     ajStrAssignC(&dbxflatRdline, "");
615 
616     pos = ajFileResetPos(inf);
617 
618     if(!ajReadlineTrim(inf,&dbxflatRdline))
619     {
620         ajStrDel(&dbxflatRdline);
621         return ajFalse;
622     }
623 
624     /* first line of entry */
625 
626     if(!ajStrPrefixC(dbxflatRdline,"@"))
627         return ajFalse;
628 
629     entry->fpos = pos;
630     ajStrCutStart(&dbxflatRdline, 1);
631     ajStrExtractFirst(dbxflatRdline, &de, &entry->id);
632 
633     if(desfield && ajStrGetLen(de))
634     {
635 	while(ajRegExec(dbxflat_wrdexp,de))
636 	{
637 	    ajRegSubI(dbxflat_wrdexp, 1, &tmpfd);
638 	    str = ajStrNew();
639 	    ajStrAssignS(&str,tmpfd);
640 	    ajListstrPushAppend(desfield->data, str);
641 	    ajRegPost(dbxflat_wrdexp, &de);
642 	}
643     }
644 
645 /* now read sequence */
646     ok = ajReadlineTrim(inf,&dbxflatRdline);
647     while(ok && !ajStrPrefixC(dbxflatRdline, "+"))
648     {
649         ajStrRemoveWhite(&dbxflatRdline);
650         seqlen += MAJSTRGETLEN(dbxflatRdline);
651         ok = ajReadlineTrim(inf,&dbxflatRdline);
652     }
653 
654     if(!ok)
655         return ajFalse;
656 
657     ok = ajReadlineTrim(inf,&dbxflatRdline);
658     while(ok)
659     {
660         qlen += MAJSTRGETLEN(dbxflatRdline);
661         if(qlen < seqlen)
662             ok = ajReadlineTrim(inf,&dbxflatRdline);
663         else
664             ok = ajFalse;
665     }
666 
667     ajStrDel(&de);
668     ajStrDel(&tmpfd);
669 
670     return ajTrue;
671 }
672 
673 
674 
675 
676 
677 /* @funcstatic dbxflat_ParseIguspto *******************************************
678 **
679 ** Parse the ID, accession from a USPTO format sequence entry.
680 **
681 ** Reads to the end of the entry and then returns.
682 **
683 ** @param [w] entry [EmbPBtreeEntry] entry
684 ** @param [u] inf [AjPFile] Input file
685 **
686 ** @return [AjBool] ajTrue on success.
687 ** @@
688 ******************************************************************************/
689 
dbxflat_ParseIguspto(EmbPBtreeEntry entry,AjPFile inf)690 static AjBool dbxflat_ParseIguspto(EmbPBtreeEntry entry, AjPFile inf)
691 {
692     ajlong pos  = 0L;
693     ajuint seqlen = 0;
694     AjPStr tmpfd  = NULL;
695     AjPStr str = NULL;
696     AjPStr de = NULL;
697     AjBool ok = ajTrue;
698 
699     if(!dbxflat_wrdexp)
700 	dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9.:=]+)");
701 
702     pos = ajFileResetPos(inf);
703 
704     if(!MAJSTRGETLEN(dbxflatRdline))
705         ok = ajReadlineTrim(inf,&dbxflatRdline);
706 
707     if(!ok)
708     {
709         ajStrDel(&dbxflatRdline);
710         return ajFalse;
711     }
712 
713     /* first line of entry */
714 
715     entry->fpos = pos;
716 
717     if(!ajStrPrefixC(dbxflatRdline,";"))
718         return ajFalse;
719 
720     while(ok && ajStrPrefixC(dbxflatRdline, ";"))
721     {
722         ajStrAssignSubS(&de, dbxflatRdline, 2, -1);
723 
724         if(desfield && ajStrGetLen(de))
725         {
726             while(ajRegExec(dbxflat_wrdexp,de))
727             {
728                 ajRegSubI(dbxflat_wrdexp, 1, &tmpfd);
729                 str = ajStrNew();
730                 ajStrAssignS(&str,tmpfd);
731                 ajListstrPushAppend(desfield->data, str);
732                 ajRegPost(dbxflat_wrdexp, &de);
733             }
734         }
735 
736         ok = ajReadlineTrim(inf,&dbxflatRdline);
737     }
738 
739     if(!ok)
740         return ajFalse;
741 
742     ajStrAssignS(&entry->id, dbxflatRdline);
743     ajStrRemoveWhite(&entry->id);
744 
745 /* now read sequence */
746     ok = ajReadlineTrim(inf,&dbxflatRdline);
747     while(ok && !ajStrPrefixC(dbxflatRdline, ";"))
748     {
749         ajStrRemoveWhite(&dbxflatRdline);
750         seqlen += MAJSTRGETLEN(dbxflatRdline);
751         ok = ajReadlineTrim(inf,&dbxflatRdline);
752     }
753 
754     ajStrDel(&de);
755     ajStrDel(&tmpfd);
756 
757     return ajTrue;
758 }
759 
760 
761 
762 
763 
764 /* @funcstatic dbxflat_ParseSwiss *********************************************
765 **
766 ** Parse the ID, accession from a SwissProt or UniProtKB entry.
767 **
768 ** Reads to the end of the entry and then returns.
769 **
770 ** @param [w] entry [EmbPBtreeEntry] entry
771 ** @param [u] inf [AjPFile] Input file
772 **
773 ** @return [AjBool] ajTrue on success.
774 ** @@
775 ******************************************************************************/
776 
dbxflat_ParseSwiss(EmbPBtreeEntry entry,AjPFile inf)777 static AjBool dbxflat_ParseSwiss(EmbPBtreeEntry entry, AjPFile inf)
778 {
779     ajlong pos  = 0L;
780     const char* swissprefix[] = {
781         "RecName: ", "AltName: ", "SubName: ",
782         "Includes:", "Contains:", "Flags: ",
783         "Full=", "Short=", "EC=",
784         "Allergen=", "Biotech=", "CD_antigen=", "INN=",
785         NULL
786     };
787     ajuint swisslen[] = {
788         9, 9, 9,
789         9, 9, 7,
790         5, 6, 3,
791         9, 8, 11, 4,
792         0
793     };
794 
795     ajuint i;
796 
797     if(!dbxflat_wrdexp)
798 	dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9_-]+)");
799 
800     ajStrAssignC(&dbxflatRdline, "");
801 
802     while(!ajStrPrefixC(dbxflatRdline,"//"))
803     {
804 	pos = ajFileResetPos(inf);
805 
806 	if(!ajReadlineTrim(inf,&dbxflatRdline))
807 	{
808             if(svfield)
809                 ajStrDel(&swissAccstr);
810 
811 	    return ajFalse;
812 	}
813 
814 	if(ajStrPrefixC(dbxflatRdline,"ID"))
815 	{
816 	    entry->fpos = pos;
817 	    ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
818 	    ajStrTrimEndC(&entry->id, ";");
819 	}
820 
821 
822 	if(svfield)
823         {
824 	    if(ajStrPrefixC(dbxflatRdline,"SV") ||
825 	       ajStrPrefixC(dbxflatRdline,"IV"))  /* emblcds database format */
826 		embBtreeParseEmblAc(dbxflatRdline,svfield);
827 
828             if(!MAJSTRGETLEN(swissAccstr) && ajStrPrefixC(dbxflatRdline,"AC"))
829                 embBtreeFindEmblAc(dbxflatRdline, svfield, &swissAccstr);
830 
831             if(MAJSTRGETLEN(swissAccstr) &&
832                ajStrMatchWildC(dbxflatRdline,
833                                "DT   \?\?-\?\?\?-\?\?\?\?, sequence version *"))
834             {
835                 ajStrAppendK(&swissAccstr, '.');
836                 ajStrAppendSubS(&swissAccstr, dbxflatRdline, 35, -3);
837                 ajStrTrimEndC(&swissAccstr, ".\n\r"); /* in case of \n\r */
838                 ajListstrPushAppend(svfield->data, swissAccstr);
839                 swissAccstr = NULL;
840             }
841         }
842 
843 	if(accfield)
844 	    if(ajStrPrefixC(dbxflatRdline,"AC") ||
845 	       ajStrPrefixC(dbxflatRdline,"PA"))  /* emblcds database format */
846 		embBtreeParseEmblAc(dbxflatRdline,accfield);
847 
848 	if(keyfield)
849 	    if(ajStrPrefixC(dbxflatRdline,"KW"))
850 		embBtreeParseEmblKw(dbxflatRdline,keyfield);
851 
852 	if(desfield)
853 	    if(ajStrPrefixC(dbxflatRdline,"DE"))
854             {
855                 ajStrCutStart(&dbxflatRdline, 5);
856                 ajStrTrimWhiteStart(&dbxflatRdline);
857 
858                 /*
859                 ** trim prefixes
860                 ** can be multiple
861                 ** e.g. SubName: Full=
862                 */
863 
864                 for(i=0; swissprefix[i]; i++)
865                 {
866                     if(ajStrPrefixC(dbxflatRdline, swissprefix[i]))
867                         ajStrCutStart(&dbxflatRdline, swisslen[i]);
868                 }
869 
870 		embBtreeParseField(dbxflatRdline,dbxflat_wrdexp, desfield);
871             }
872 
873 	if(orgfield)
874 	    if(ajStrPrefixC(dbxflatRdline,"OC") ||
875                ajStrPrefixC(dbxflatRdline,"OS"))
876 		embBtreeParseEmblTx(dbxflatRdline,orgfield);
877     }
878 
879     if(svfield)
880         ajStrDel(&swissAccstr);
881 
882     return ajTrue;
883 }
884 
885 
886 
887 
888 /* @funcstatic dbxflat_NextEntry ********************************************
889 **
890 ** Parse the next entry from a flatfile
891 **
892 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
893 ** @param [u] inf [AjPFile] file object ptr
894 **
895 ** @return [AjBool] ajTrue on success, ajFalse if EOF
896 ** @@
897 ******************************************************************************/
898 
dbxflat_NextEntry(EmbPBtreeEntry entry,AjPFile inf)899 static AjBool dbxflat_NextEntry(EmbPBtreeEntry entry, AjPFile inf)
900 {
901     static AjBool init = AJFALSE;
902     static ajint  nparser = -1;
903     ajint i;
904 
905     if(!init)
906     {
907 	entry->fpos = 0L;
908 	for(i=0; parser[i].Name && nparser == -1; ++i)
909 	    if(ajStrMatchC(entry->dbtype, parser[i].Name))
910 		nparser = i;
911 	if(nparser == -1)
912 	    ajFatal("Database format (%S) unknown",entry->dbtype);
913 	init = ajTrue;
914     }
915 
916 
917     if(!(*parser[nparser].Parser)(entry,inf))
918 	return ajFalse;
919 
920 
921 
922     return ajTrue;
923 }
924