/* @source dbxflat application
**
** Index flat file format sequence databases
**
** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
** @@
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the GNU General Public License
** as published by the Free Software Foundation; either version 2
** of the License, or (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
******************************************************************************/
22
23 #include "emboss.h"
24
25
26 static AjPRegexp dbxflat_wrdexp = NULL;
27
28 static AjPStr dbxflatRdline = NULL;
29 static AjPStr dbxflatSumline = NULL;
30
31 static AjPStr swissAccstr = NULL;
32
33 static AjBool dbxflat_ParseFastq(EmbPBtreeEntry entry, AjPFile inf);
34 static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf);
35 static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf);
36 static AjBool dbxflat_ParseSwiss(EmbPBtreeEntry entry, AjPFile inf);
37 static AjBool dbxflat_ParseIguspto(EmbPBtreeEntry entry, AjPFile inf);
38
39 static AjBool dbxflat_NextEntry(EmbPBtreeEntry entry, AjPFile inf);
40
41 int global = 0;
42
43 EmbPBtreeField accfield = NULL;
44 EmbPBtreeField svfield = NULL;
45 EmbPBtreeField orgfield = NULL;
46 EmbPBtreeField desfield = NULL;
47 EmbPBtreeField keyfield = NULL;
48
49 ajuint idtot = 0;
50 ajuint acctot = 0;
51 ajuint svtot = 0;
52 ajuint orgtot = 0;
53 ajuint destot = 0;
54 ajuint keytot = 0;
55
56
57 /* @datastatic DbxflatPParser *************************************************
58 **
59 ** Parser definition structure
60 **
61 ** @alias DbxflatSParser
62 ** @alias DbxflatOParser
63 **
64 ** @attr Name [const char*] Parser name
65 ** @attr Parser [AjBool function] Parser function
66 ** @@
67 ******************************************************************************/
68
69 typedef struct DbxflatSParser
70 {
71 const char* Name;
72 AjBool (*Parser) (EmbPBtreeEntry entry, AjPFile inf);
73 } DbxflatOParser;
74 #define DbxflatPParser DbxflatOParser*
75
76
77
78
79 static DbxflatOParser parser[] =
80 {
81 {"EMBL", dbxflat_ParseEmbl},
82 {"SWISS", dbxflat_ParseSwiss},
83 {"GB", dbxflat_ParseGenbank},
84 {"REFSEQ", dbxflat_ParseGenbank},
85 {"FASTQ", dbxflat_ParseFastq},
86 {"USPTO", dbxflat_ParseIguspto},
87 {NULL, NULL}
88 };
89
90
91
92
93
94 /* @prog dbxflat **************************************************************
95 **
96 ** Index a flat file database
97 **
98 ******************************************************************************/
99
main(int argc,char ** argv)100 int main(int argc, char **argv)
101 {
102 EmbPBtreeEntry entry = NULL;
103
104 AjPStr dbname = NULL;
105 AjPStr dbrs = NULL;
106 AjPStr release = NULL;
107 AjPStr datestr = NULL;
108 AjBool statistics;
109 AjBool compressed;
110
111 AjPStr directory;
112 AjPStr indexdir;
113 AjPStr filename;
114 AjPStr exclude;
115 AjPStr dbtype = NULL;
116 AjPFile outf = NULL;
117
118 AjPStr *fieldarray = NULL;
119
120 ajint nfields;
121 ajint nfiles;
122
123 AjPStr tmpstr = NULL;
124 AjPStr thysfile = NULL;
125
126 ajint i;
127 AjPFile inf = NULL;
128
129 ajulong nentries = 0UL;
130 ajulong ientries = 0UL;
131 AjPTime starttime = NULL;
132 AjPTime begintime = NULL;
133 AjPTime nowtime = NULL;
134 ajlong startclock = 0UL;
135 ajlong beginclock = 0UL;
136 ajlong nowclock = 0UL;
137
138 ajulong idpricache=0L, idpriread = 0L, idpriwrite = 0L, idprisize= 0L;
139 ajulong idseccache=0L, idsecread = 0L, idsecwrite = 0L, idsecsize= 0L;
140 ajulong acpricache=0L, acpriread = 0L, acpriwrite = 0L, acprisize= 0L;
141 ajulong acseccache=0L, acsecread = 0L, acsecwrite = 0L, acsecsize= 0L;
142 ajulong svpricache=0L, svpriread = 0L, svpriwrite = 0L, svprisize= 0L;
143 ajulong svseccache=0L, svsecread = 0L, svsecwrite = 0L, svsecsize= 0L;
144 ajulong kwpricache=0L, kwpriread = 0L, kwpriwrite = 0L, kwprisize= 0L;
145 ajulong kwseccache=0L, kwsecread = 0L, kwsecwrite = 0L, kwsecsize= 0L;
146 ajulong depricache=0L, depriread = 0L, depriwrite = 0L, deprisize= 0L;
147 ajulong deseccache=0L, desecread = 0L, desecwrite = 0L, desecsize= 0L;
148 ajulong txpricache=0L, txpriread = 0L, txpriwrite = 0L, txprisize= 0L;
149 ajulong txseccache=0L, txsecread = 0L, txsecwrite = 0L, txsecsize= 0L;
150
151 ajulong splitrootid =0l, splitrootnum=0L;
152 ajulong splitrootkey=0L, splitrootsec=0L;
153 ajulong splitleafid =0L, splitleafnum=0L;
154 ajulong splitleafkey=0L, splitleafsec=0L;
155 ajulong reorderid =0L, reordernum =0L;
156 ajulong reorderkey =0L, reordersec =0L;
157
158 double tdiff = 0.0;
159 ajint days = 0;
160 ajint hours = 0;
161 ajint mins = 0;
162
163 embInit("dbxflat", argc, argv);
164
165 dbtype = ajAcdGetListSingle("idformat");
166 fieldarray = ajAcdGetList("fields");
167 directory = ajAcdGetDirectoryName("directory");
168 outf = ajAcdGetOutfile("outfile");
169 indexdir = ajAcdGetOutdirName("indexoutdir");
170 filename = ajAcdGetString("filenames");
171 exclude = ajAcdGetString("exclude");
172 dbname = ajAcdGetString("dbname");
173 dbrs = ajAcdGetString("dbresource");
174 release = ajAcdGetString("release");
175 datestr = ajAcdGetString("date");
176 statistics = ajAcdGetBoolean("statistics");
177 compressed = ajAcdGetBoolean("compressed");
178
179 entry = embBtreeEntryNew(0);
180 if(compressed)
181 embBtreeEntrySetCompressed(entry);
182
183 tmpstr = ajStrNew();
184
185 nfields = embBtreeSetFields(entry,fieldarray);
186 embBtreeSetDbInfo(entry,dbname,dbrs,datestr,release,dbtype,directory,
187 indexdir);
188
189 for(i=0; i< nfields; i++)
190 {
191 if(ajStrMatchC(fieldarray[i], "acc"))
192 {
193 accfield = embBtreeGetFieldS(entry, fieldarray[i]);
194 if(compressed)
195 embBtreeFieldSetCompressed(accfield);
196 }
197 else if(ajStrMatchC(fieldarray[i], "sv"))
198 {
199 svfield = embBtreeGetFieldS(entry, fieldarray[i]);
200 if(compressed)
201 embBtreeFieldSetCompressed(svfield);
202 }
203 else if(ajStrMatchC(fieldarray[i], "des"))
204 {
205 desfield = embBtreeGetFieldS(entry, fieldarray[i]);
206 if(compressed)
207 embBtreeFieldSetCompressed(desfield);
208 }
209 else if(ajStrMatchC(fieldarray[i], "key"))
210 {
211 keyfield = embBtreeGetFieldS(entry, fieldarray[i]);
212 if(compressed)
213 embBtreeFieldSetCompressed(keyfield);
214 }
215 else if(ajStrMatchC(fieldarray[i], "org"))
216 {
217 orgfield = embBtreeGetFieldS(entry, fieldarray[i]);
218 if(compressed)
219 embBtreeFieldSetCompressed(orgfield);
220 }
221 else if(!ajStrMatchC(fieldarray[i], "id"))
222 ajErr("Unknown field '%S' specified for indexing", fieldarray[i]);
223 }
224
225 embBtreeGetRsInfo(entry);
226
227 nfiles = embBtreeGetFiles(entry,directory,filename,exclude);
228 if(!nfiles)
229 ajDie("No input files in '%S' matched filename '%S'",
230 directory, filename);
231
232 embBtreeWriteEntryFile(entry);
233
234 embBtreeOpenCaches(entry);
235
236 starttime = ajTimeNewToday();
237
238 ajFmtPrintF(outf, "Processing directory: %S\n", directory);
239
240 for(i=0;i<nfiles;++i)
241 {
242 begintime = ajTimeNewToday();
243 beginclock = ajClockNow();
244
245 ajListPop(entry->files,(void **)&thysfile);
246 ajListPushAppend(entry->files,(void *)thysfile);
247 ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
248 if(!(inf=ajFileNewInNameS(tmpstr)))
249 ajFatal("Cannot open input file %S\n",tmpstr);
250 ajFilenameTrimPath(&tmpstr);
251 ajFmtPrintF(outf,"Processing file: %S\n",tmpstr);
252
253 ientries = 0L;
254
255 while(dbxflat_NextEntry(entry,inf))
256 {
257 ++ientries;
258
259 if(entry->do_id)
260 {
261 embBtreeIndexEntry(entry, i);
262 ++idtot;
263 }
264
265 if(accfield)
266 {
267 acctot += embBtreeIndexPrimary(accfield, entry, i);
268 }
269
270 if(svfield)
271 {
272 svtot += embBtreeIndexPrimary(svfield, entry, i);
273 }
274
275 if(keyfield)
276 {
277 keytot += embBtreeIndexSecondary(keyfield, entry);
278 }
279
280 if(desfield)
281 {
282 destot += embBtreeIndexSecondary(desfield, entry);
283 }
284
285 if(orgfield)
286 {
287 orgtot += embBtreeIndexSecondary(orgfield, entry);
288 }
289 }
290
291 ajFileClose(&inf);
292 nentries += ientries;
293 nowtime = ajTimeNewToday();
294 nowclock = ajClockNow();
295 ajFmtPrintF(outf, "entries: %Lu (%Lu) time: %.1f/%.1fs (%.1f/%.1fs)\n",
296 nentries, ientries,
297 ajClockDiff(startclock,nowclock),
298 ajTimeDiff(starttime, nowtime),
299 ajClockDiff(beginclock,nowclock),
300 ajTimeDiff(begintime, nowtime));
301
302 if(statistics)
303 {
304 ajBtreeStatsOut(outf,
305 &splitrootid, &splitrootnum,
306 &splitrootkey, &splitrootsec,
307 &splitleafid, &splitleafnum,
308 &splitleafkey, &splitleafsec,
309 &reorderid, &reordernum,
310 &reorderkey, &reordersec);
311
312 if(entry->do_id)
313 ajBtreeCacheStatsOut(outf, entry->idcache,
314 &idpricache, &idseccache,
315 &idpriread, &idsecread,
316 &idpriwrite, &idsecwrite,
317 &idprisize, &idsecsize);
318 if(accfield)
319 ajBtreeCacheStatsOut(outf, accfield->cache,
320 &acpricache, &acseccache,
321 &acpriread, &acsecread,
322 &acpriwrite, &acsecwrite,
323 &acprisize, &acsecsize);
324 if(svfield)
325 ajBtreeCacheStatsOut(outf, svfield->cache,
326 &svpricache, &svseccache,
327 &svpriread, &svsecread,
328 &svpriwrite, &svsecwrite,
329 &svprisize, &svsecsize);
330 if(keyfield)
331 ajBtreeCacheStatsOut(outf, keyfield->cache,
332 &kwpricache, &kwseccache,
333 &kwpriread, &kwsecread,
334 &kwpriwrite, &kwsecwrite,
335 &kwprisize, &kwsecsize);
336 if(desfield)
337 ajBtreeCacheStatsOut(outf, desfield->cache,
338 &depricache, &deseccache,
339 &depriread, &desecread,
340 &depriwrite, &desecwrite,
341 &deprisize, &desecsize);
342 if(orgfield)
343 ajBtreeCacheStatsOut(outf, orgfield->cache,
344 &txpricache, &txseccache,
345 &txpriread, &txsecread,
346 &txpriwrite, &txsecwrite,
347 &txprisize, &txsecsize);
348 }
349
350 ajTimeDel(&begintime);
351 ajTimeDel(&nowtime);
352 }
353
354
355
356 embBtreeDumpParameters(entry);
357 embBtreeCloseCaches(entry);
358
359 nowtime = ajTimeNewToday();
360 tdiff = ajTimeDiff(starttime, nowtime);
361 days = (ajint) (tdiff/(24.0*3600.0));
362 tdiff -= (24.0*3600.0)*(double)days;
363 hours = (ajint) (tdiff/3600.0);
364 tdiff -= 3600.0*(double)hours;
365 mins = (ajint) (tdiff/60.0);
366 tdiff -= 60.0 * (double) mins;
367 if(days)
368 ajFmtPrintF(outf, "Total time: %d %02d:%02d:%04.1f\n",
369 days, hours, mins, tdiff);
370 else if (hours)
371 ajFmtPrintF(outf, "Total time: %d:%02d:%04.1f\n",
372 hours, mins, tdiff);
373 else
374 ajFmtPrintF(outf, "Total time: %d:%04.1f\n",
375 mins, tdiff);
376
377 ajTimeDel(&nowtime);
378 ajTimeDel(&starttime);
379
380 embBtreeReportEntry(outf, entry);
381
382 if(accfield)
383 embBtreeReportField(outf, accfield);
384 if(svfield)
385 embBtreeReportField(outf, svfield);
386 if(orgfield)
387 embBtreeReportField(outf, orgfield);
388 if(desfield)
389 embBtreeReportField(outf, desfield);
390 if(keyfield)
391 embBtreeReportField(outf, keyfield);
392
393 ajFileClose(&outf);
394 embBtreeEntryDel(&entry);
395 ajStrDel(&tmpstr);
396 ajStrDel(&filename);
397 ajStrDel(&exclude);
398 ajStrDel(&dbname);
399 ajStrDel(&dbrs);
400 ajStrDel(&release);
401 ajStrDel(&datestr);
402 ajStrDel(&directory);
403 ajStrDel(&indexdir);
404 ajStrDel(&dbtype);
405
406 nfields = 0;
407 while(fieldarray[nfields])
408 ajStrDel(&fieldarray[nfields++]);
409 AJFREE(fieldarray);
410
411 ajRegFree(&dbxflat_wrdexp);
412 ajStrDel(&dbxflatRdline);
413 ajStrDel(&dbxflatSumline);
414 ajStrDel(&swissAccstr);
415
416 embExit();
417
418 return 0;
419 }
420
421
422
423
424 /* @funcstatic dbxflat_ParseEmbl **********************************************
425 **
426 ** Parse the ID, accession from an EMBL entry.
427 **
428 ** Reads to the end of the entry and then returns.
429 **
430 ** @param [w] entry [EmbPBtreeEntry] entry
431 ** @param [u] inf [AjPFile] Input file
432 **
433 ** @return [AjBool] ajTrue on success.
434 ** @@
435 ******************************************************************************/
436
dbxflat_ParseEmbl(EmbPBtreeEntry entry,AjPFile inf)437 static AjBool dbxflat_ParseEmbl(EmbPBtreeEntry entry, AjPFile inf)
438 {
439 ajlong pos = 0L;
440
441 ajStrAssignC(&dbxflatRdline, "");
442
443 while(!ajStrPrefixC(dbxflatRdline, "//"))
444 {
445 pos = ajFileResetPos(inf);
446
447 if(!ajReadlineTrim(inf,&dbxflatRdline))
448 {
449 ajStrDel(&dbxflatRdline);
450 return ajFalse;
451 }
452
453 if(ajStrPrefixC(dbxflatRdline,"ID"))
454 {
455 entry->fpos = pos;
456 ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
457 ajStrTrimEndC(&entry->id, ";");
458 if(svfield)
459 embBtreeParseEmblSv(dbxflatRdline,svfield);
460 }
461
462
463 if(svfield)
464 if(ajStrPrefixC(dbxflatRdline,"SV") ||
465 ajStrPrefixC(dbxflatRdline,"IV")) /* emblcds database format */
466 embBtreeParseEmblAc(dbxflatRdline,svfield);
467
468 if(accfield)
469 if(ajStrPrefixC(dbxflatRdline,"AC") ||
470 ajStrPrefixC(dbxflatRdline,"PA")) /* emblcds database format */
471 embBtreeParseEmblAc(dbxflatRdline,accfield);
472
473 if(keyfield)
474 if(ajStrPrefixC(dbxflatRdline,"KW"))
475 embBtreeParseEmblKw(dbxflatRdline,keyfield);
476
477 if(desfield)
478 if(ajStrPrefixC(dbxflatRdline,"DE"))
479 embBtreeParseEmblDe(dbxflatRdline,desfield);
480
481 if(orgfield)
482 if(ajStrPrefixC(dbxflatRdline,"OC") ||
483 ajStrPrefixC(dbxflatRdline,"OS"))
484 embBtreeParseEmblTx(dbxflatRdline,orgfield);
485 }
486
487 return ajTrue;
488 }
489
490
491
492
493 /* @funcstatic dbxflat_ParseGenbank *******************************************
494 **
495 ** Parse the ID, accession from a Genbank entry
496 **
497 ** @param [w] entry [EmbPBtreeEntry] entry
498 ** @param [u] inf [AjPFile] Input file
499 **
500 ** @return [AjBool] ajTrue on success.
501 ** @@
502 ******************************************************************************/
503
dbxflat_ParseGenbank(EmbPBtreeEntry entry,AjPFile inf)504 static AjBool dbxflat_ParseGenbank(EmbPBtreeEntry entry, AjPFile inf)
505 {
506 ajlong pos = 0L;
507 AjBool ret = ajTrue;
508
509 ajStrAssignC(&dbxflatRdline, "");
510 ajStrAssignC(&dbxflatSumline, "");
511
512 while(!ajStrPrefixC(dbxflatRdline,"//") && ret)
513 {
514 if(ajStrPrefixC(dbxflatRdline,"LOCUS"))
515 {
516 entry->fpos = pos;
517 ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
518 }
519
520 if(svfield)
521 if(ajStrPrefixC(dbxflatRdline,"VERSION"))
522 embBtreeParseGenbankAc(dbxflatRdline,svfield);
523
524 if(accfield)
525 if(ajStrPrefixC(dbxflatRdline,"ACCESSION"))
526 embBtreeParseGenbankAc(dbxflatRdline,accfield);
527
528 if(keyfield)
529 if(ajStrPrefixC(dbxflatRdline,"KEYWORDS"))
530 {
531 ajStrAssignS(&dbxflatSumline,dbxflatRdline);
532 ret = ajReadlineTrim(inf,&dbxflatRdline);
533 while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
534 {
535 ajStrAppendS(&dbxflatSumline,dbxflatRdline);
536 ret = ajReadlineTrim(inf,&dbxflatRdline);
537 }
538 ajStrRemoveWhiteExcess(&dbxflatSumline);
539 embBtreeParseGenbankKw(dbxflatSumline,keyfield);
540 continue;
541 }
542
543 if(desfield)
544 if(ajStrPrefixC(dbxflatRdline,"DEFINITION"))
545 {
546 ajStrAssignS(&dbxflatSumline,dbxflatRdline);
547 ret = ajReadlineTrim(inf,&dbxflatRdline);
548 while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
549 {
550 ajStrAppendS(&dbxflatSumline,dbxflatRdline);
551 ret = ajReadlineTrim(inf,&dbxflatRdline);
552 }
553 ajStrRemoveWhiteExcess(&dbxflatSumline);
554 embBtreeParseGenbankDe(dbxflatSumline,desfield);
555 continue;
556 }
557
558
559 if(orgfield)
560 if(ajStrPrefixC(dbxflatRdline,"SOURCE"))
561 {
562 ajStrAssignC(&dbxflatSumline,"");
563 ret = ajReadlineTrim(inf,&dbxflatRdline);
564 ajStrAppendC(&dbxflatRdline, ";");
565 while(ret && *MAJSTRGETPTR(dbxflatRdline)==' ')
566 {
567 ajStrAppendS(&dbxflatSumline,dbxflatRdline);
568 ret = ajReadlineTrim(inf,&dbxflatRdline);
569 }
570 ajStrRemoveWhiteExcess(&dbxflatSumline);
571 embBtreeParseGenbankTx(dbxflatSumline,orgfield);
572 continue;
573 }
574
575
576 pos = ajFileResetPos(inf);
577
578 if(!ajReadlineTrim(inf,&dbxflatRdline))
579 ret = ajFalse;
580 }
581
582 return ret;
583 }
584
585
586
587
588 /* @funcstatic dbxflat_ParseFastq *********************************************
589 **
590 ** Parse the ID, accession from a FASTQ format sequence entry.
591 **
592 ** Reads to the end of the entry and then returns.
593 **
594 ** @param [w] entry [EmbPBtreeEntry] entry
595 ** @param [u] inf [AjPFile] Input file
596 **
597 ** @return [AjBool] ajTrue on success.
598 ** @@
599 ******************************************************************************/
600
dbxflat_ParseFastq(EmbPBtreeEntry entry,AjPFile inf)601 static AjBool dbxflat_ParseFastq(EmbPBtreeEntry entry, AjPFile inf)
602 {
603 ajlong pos = 0L;
604 ajuint seqlen = 0;
605 ajuint qlen = 0;
606 AjPStr tmpfd = NULL;
607 AjPStr str = NULL;
608 AjPStr de = NULL;
609 AjBool ok;
610
611 if(!dbxflat_wrdexp)
612 dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9.:=]+)");
613
614 ajStrAssignC(&dbxflatRdline, "");
615
616 pos = ajFileResetPos(inf);
617
618 if(!ajReadlineTrim(inf,&dbxflatRdline))
619 {
620 ajStrDel(&dbxflatRdline);
621 return ajFalse;
622 }
623
624 /* first line of entry */
625
626 if(!ajStrPrefixC(dbxflatRdline,"@"))
627 return ajFalse;
628
629 entry->fpos = pos;
630 ajStrCutStart(&dbxflatRdline, 1);
631 ajStrExtractFirst(dbxflatRdline, &de, &entry->id);
632
633 if(desfield && ajStrGetLen(de))
634 {
635 while(ajRegExec(dbxflat_wrdexp,de))
636 {
637 ajRegSubI(dbxflat_wrdexp, 1, &tmpfd);
638 str = ajStrNew();
639 ajStrAssignS(&str,tmpfd);
640 ajListstrPushAppend(desfield->data, str);
641 ajRegPost(dbxflat_wrdexp, &de);
642 }
643 }
644
645 /* now read sequence */
646 ok = ajReadlineTrim(inf,&dbxflatRdline);
647 while(ok && !ajStrPrefixC(dbxflatRdline, "+"))
648 {
649 ajStrRemoveWhite(&dbxflatRdline);
650 seqlen += MAJSTRGETLEN(dbxflatRdline);
651 ok = ajReadlineTrim(inf,&dbxflatRdline);
652 }
653
654 if(!ok)
655 return ajFalse;
656
657 ok = ajReadlineTrim(inf,&dbxflatRdline);
658 while(ok)
659 {
660 qlen += MAJSTRGETLEN(dbxflatRdline);
661 if(qlen < seqlen)
662 ok = ajReadlineTrim(inf,&dbxflatRdline);
663 else
664 ok = ajFalse;
665 }
666
667 ajStrDel(&de);
668 ajStrDel(&tmpfd);
669
670 return ajTrue;
671 }
672
673
674
675
676
677 /* @funcstatic dbxflat_ParseIguspto *******************************************
678 **
679 ** Parse the ID, accession from a USPTO format sequence entry.
680 **
681 ** Reads to the end of the entry and then returns.
682 **
683 ** @param [w] entry [EmbPBtreeEntry] entry
684 ** @param [u] inf [AjPFile] Input file
685 **
686 ** @return [AjBool] ajTrue on success.
687 ** @@
688 ******************************************************************************/
689
dbxflat_ParseIguspto(EmbPBtreeEntry entry,AjPFile inf)690 static AjBool dbxflat_ParseIguspto(EmbPBtreeEntry entry, AjPFile inf)
691 {
692 ajlong pos = 0L;
693 ajuint seqlen = 0;
694 AjPStr tmpfd = NULL;
695 AjPStr str = NULL;
696 AjPStr de = NULL;
697 AjBool ok = ajTrue;
698
699 if(!dbxflat_wrdexp)
700 dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9.:=]+)");
701
702 pos = ajFileResetPos(inf);
703
704 if(!MAJSTRGETLEN(dbxflatRdline))
705 ok = ajReadlineTrim(inf,&dbxflatRdline);
706
707 if(!ok)
708 {
709 ajStrDel(&dbxflatRdline);
710 return ajFalse;
711 }
712
713 /* first line of entry */
714
715 entry->fpos = pos;
716
717 if(!ajStrPrefixC(dbxflatRdline,";"))
718 return ajFalse;
719
720 while(ok && ajStrPrefixC(dbxflatRdline, ";"))
721 {
722 ajStrAssignSubS(&de, dbxflatRdline, 2, -1);
723
724 if(desfield && ajStrGetLen(de))
725 {
726 while(ajRegExec(dbxflat_wrdexp,de))
727 {
728 ajRegSubI(dbxflat_wrdexp, 1, &tmpfd);
729 str = ajStrNew();
730 ajStrAssignS(&str,tmpfd);
731 ajListstrPushAppend(desfield->data, str);
732 ajRegPost(dbxflat_wrdexp, &de);
733 }
734 }
735
736 ok = ajReadlineTrim(inf,&dbxflatRdline);
737 }
738
739 if(!ok)
740 return ajFalse;
741
742 ajStrAssignS(&entry->id, dbxflatRdline);
743 ajStrRemoveWhite(&entry->id);
744
745 /* now read sequence */
746 ok = ajReadlineTrim(inf,&dbxflatRdline);
747 while(ok && !ajStrPrefixC(dbxflatRdline, ";"))
748 {
749 ajStrRemoveWhite(&dbxflatRdline);
750 seqlen += MAJSTRGETLEN(dbxflatRdline);
751 ok = ajReadlineTrim(inf,&dbxflatRdline);
752 }
753
754 ajStrDel(&de);
755 ajStrDel(&tmpfd);
756
757 return ajTrue;
758 }
759
760
761
762
763
764 /* @funcstatic dbxflat_ParseSwiss *********************************************
765 **
766 ** Parse the ID, accession from a SwissProt or UniProtKB entry.
767 **
768 ** Reads to the end of the entry and then returns.
769 **
770 ** @param [w] entry [EmbPBtreeEntry] entry
771 ** @param [u] inf [AjPFile] Input file
772 **
773 ** @return [AjBool] ajTrue on success.
774 ** @@
775 ******************************************************************************/
776
dbxflat_ParseSwiss(EmbPBtreeEntry entry,AjPFile inf)777 static AjBool dbxflat_ParseSwiss(EmbPBtreeEntry entry, AjPFile inf)
778 {
779 ajlong pos = 0L;
780 const char* swissprefix[] = {
781 "RecName: ", "AltName: ", "SubName: ",
782 "Includes:", "Contains:", "Flags: ",
783 "Full=", "Short=", "EC=",
784 "Allergen=", "Biotech=", "CD_antigen=", "INN=",
785 NULL
786 };
787 ajuint swisslen[] = {
788 9, 9, 9,
789 9, 9, 7,
790 5, 6, 3,
791 9, 8, 11, 4,
792 0
793 };
794
795 ajuint i;
796
797 if(!dbxflat_wrdexp)
798 dbxflat_wrdexp = ajRegCompC("([A-Za-z0-9_-]+)");
799
800 ajStrAssignC(&dbxflatRdline, "");
801
802 while(!ajStrPrefixC(dbxflatRdline,"//"))
803 {
804 pos = ajFileResetPos(inf);
805
806 if(!ajReadlineTrim(inf,&dbxflatRdline))
807 {
808 if(svfield)
809 ajStrDel(&swissAccstr);
810
811 return ajFalse;
812 }
813
814 if(ajStrPrefixC(dbxflatRdline,"ID"))
815 {
816 entry->fpos = pos;
817 ajFmtScanS(dbxflatRdline,"%*S%S",&entry->id);
818 ajStrTrimEndC(&entry->id, ";");
819 }
820
821
822 if(svfield)
823 {
824 if(ajStrPrefixC(dbxflatRdline,"SV") ||
825 ajStrPrefixC(dbxflatRdline,"IV")) /* emblcds database format */
826 embBtreeParseEmblAc(dbxflatRdline,svfield);
827
828 if(!MAJSTRGETLEN(swissAccstr) && ajStrPrefixC(dbxflatRdline,"AC"))
829 embBtreeFindEmblAc(dbxflatRdline, svfield, &swissAccstr);
830
831 if(MAJSTRGETLEN(swissAccstr) &&
832 ajStrMatchWildC(dbxflatRdline,
833 "DT \?\?-\?\?\?-\?\?\?\?, sequence version *"))
834 {
835 ajStrAppendK(&swissAccstr, '.');
836 ajStrAppendSubS(&swissAccstr, dbxflatRdline, 35, -3);
837 ajStrTrimEndC(&swissAccstr, ".\n\r"); /* in case of \n\r */
838 ajListstrPushAppend(svfield->data, swissAccstr);
839 swissAccstr = NULL;
840 }
841 }
842
843 if(accfield)
844 if(ajStrPrefixC(dbxflatRdline,"AC") ||
845 ajStrPrefixC(dbxflatRdline,"PA")) /* emblcds database format */
846 embBtreeParseEmblAc(dbxflatRdline,accfield);
847
848 if(keyfield)
849 if(ajStrPrefixC(dbxflatRdline,"KW"))
850 embBtreeParseEmblKw(dbxflatRdline,keyfield);
851
852 if(desfield)
853 if(ajStrPrefixC(dbxflatRdline,"DE"))
854 {
855 ajStrCutStart(&dbxflatRdline, 5);
856 ajStrTrimWhiteStart(&dbxflatRdline);
857
858 /*
859 ** trim prefixes
860 ** can be multiple
861 ** e.g. SubName: Full=
862 */
863
864 for(i=0; swissprefix[i]; i++)
865 {
866 if(ajStrPrefixC(dbxflatRdline, swissprefix[i]))
867 ajStrCutStart(&dbxflatRdline, swisslen[i]);
868 }
869
870 embBtreeParseField(dbxflatRdline,dbxflat_wrdexp, desfield);
871 }
872
873 if(orgfield)
874 if(ajStrPrefixC(dbxflatRdline,"OC") ||
875 ajStrPrefixC(dbxflatRdline,"OS"))
876 embBtreeParseEmblTx(dbxflatRdline,orgfield);
877 }
878
879 if(svfield)
880 ajStrDel(&swissAccstr);
881
882 return ajTrue;
883 }
884
885
886
887
888 /* @funcstatic dbxflat_NextEntry ********************************************
889 **
890 ** Parse the next entry from a flatfile
891 **
892 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
893 ** @param [u] inf [AjPFile] file object ptr
894 **
895 ** @return [AjBool] ajTrue on success, ajFalse if EOF
896 ** @@
897 ******************************************************************************/
898
dbxflat_NextEntry(EmbPBtreeEntry entry,AjPFile inf)899 static AjBool dbxflat_NextEntry(EmbPBtreeEntry entry, AjPFile inf)
900 {
901 static AjBool init = AJFALSE;
902 static ajint nparser = -1;
903 ajint i;
904
905 if(!init)
906 {
907 entry->fpos = 0L;
908 for(i=0; parser[i].Name && nparser == -1; ++i)
909 if(ajStrMatchC(entry->dbtype, parser[i].Name))
910 nparser = i;
911 if(nparser == -1)
912 ajFatal("Database format (%S) unknown",entry->dbtype);
913 init = ajTrue;
914 }
915
916
917 if(!(*parser[nparser].Parser)(entry,inf))
918 return ajFalse;
919
920
921
922 return ajTrue;
923 }
924