1 /* @source dbxtax application
2 **
3 ** Index NCBI's taxonomy database
4 **
5 ** @author Copyright (C) Peter Rice (pmr@ebi.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25 static AjPRegexp dbxtax_wrdexp = NULL;
26
27 static AjPStr nameline = NULL;
28
29 static AjPTable mergeTable = NULL;
30
31 static ajlong namePos = 0L;
32
33 static AjBool dbxtax_NextEntry(EmbPBtreeEntry entry, AjPFile infnode,
34 AjPFile infname);
35 static AjBool dbxtax_ParseName(EmbPBtreeEntry entry, const AjPStr line);
36 static AjBool dbxtax_ParseNode(EmbPBtreeEntry entry, const AjPStr line);
37 static void dbxtax_ParseMerged(AjPFile mergefile);
38
39 EmbPBtreeField accfield = NULL;
40 EmbPBtreeField taxfield = NULL;
41 EmbPBtreeField rnkfield = NULL;
42 EmbPBtreeField upfield = NULL;
43 EmbPBtreeField gcfield = NULL;
44 EmbPBtreeField mgcfield = NULL;
45
46
47
48
49 /* @prog dbxtax ************************************************************
50 **
51 ** Index the NCBI taxonomy database
52 **
53 ******************************************************************************/
54
main(int argc,char ** argv)55 int main(int argc, char **argv)
56 {
57 EmbPBtreeEntry entry = NULL;
58
59 AjPStr dbname = NULL;
60 AjPStr dbrs = NULL;
61 AjPStr release = NULL;
62 AjPStr datestr = NULL;
63
64 AjPStr directory;
65 AjPStr indexdir;
66 AjPStr filename = NULL;
67 AjPStr exclude = NULL;
68 AjPStr dbtype = NULL;
69 AjPFile outf = NULL;
70
71 AjBool compressed = ajTrue;
72
73 AjPFile mergefile = NULL;
74 AjPStr *fieldarray = NULL;
75
76 ajint nfields;
77 ajint nfiles;
78
79 AjPStr tmpstr = NULL;
80 AjPStr thysfile = NULL;
81
82 ajint i;
83 AjPFile infnode = NULL;
84 AjPFile infname = NULL;
85
86 ajulong nentries = 0L;
87 ajulong ientries = 0L;
88 AjPTime starttime = NULL;
89 AjPTime begintime = NULL;
90 AjPTime nowtime = NULL;
91
92 embInit("dbxtax", argc, argv);
93
94 fieldarray = ajAcdGetList("fields");
95 directory = ajAcdGetDirectoryName("directory");
96 outf = ajAcdGetOutfile("outfile");
97 indexdir = ajAcdGetOutdirName("indexoutdir");
98 dbname = ajAcdGetString("dbname");
99 dbrs = ajAcdGetString("dbresource");
100 release = ajAcdGetString("release");
101 datestr = ajAcdGetString("date");
102 compressed = ajAcdGetBoolean("compressed");
103
104 entry = embBtreeEntryNew(1);
105 tmpstr = ajStrNew();
106
107 dbtype = ajStrNewC("taxonomy");
108
109 nfields = embBtreeSetFields(entry,fieldarray);
110 embBtreeSetDbInfo(entry,dbname,dbrs,datestr,release,dbtype,directory,
111 indexdir);
112 entry->compressed = compressed;
113
114 for(i=0; i< nfields; i++)
115 {
116 if(ajStrMatchC(fieldarray[i], "acc"))
117 {
118 accfield = embBtreeGetFieldS(entry, fieldarray[i]);
119 if(compressed)
120 embBtreeFieldSetCompressed(accfield);
121 }
122 else if(ajStrMatchC(fieldarray[i], "up"))
123 {
124 upfield = embBtreeGetFieldS(entry, fieldarray[i]);
125 if(compressed)
126 embBtreeFieldSetCompressed(upfield);
127 embBtreeFieldSetIdtype(upfield);
128 }
129 else if(ajStrMatchC(fieldarray[i], "tax"))
130 {
131 taxfield = embBtreeGetFieldS(entry, fieldarray[i]);
132 if(compressed)
133 embBtreeFieldSetCompressed(taxfield);
134 embBtreeFieldSetIdtype(taxfield);
135 }
136 else if(ajStrMatchC(fieldarray[i], "rnk"))
137 {
138 rnkfield = embBtreeGetFieldS(entry, fieldarray[i]);
139 if(compressed)
140 embBtreeFieldSetCompressed(rnkfield);
141 }
142 else if(ajStrMatchC(fieldarray[i], "gc"))
143 {
144 gcfield = embBtreeGetFieldS(entry, fieldarray[i]);
145 if(compressed)
146 embBtreeFieldSetCompressed(gcfield);
147 }
148 else if(ajStrMatchC(fieldarray[i], "mgc"))
149 {
150 mgcfield = embBtreeGetFieldS(entry, fieldarray[i]);
151 if(compressed)
152 embBtreeFieldSetCompressed(mgcfield);
153 }
154 else if(!ajStrMatchC(fieldarray[i], "id"))
155 ajErr("Unknown field '%S' specified for indexing", fieldarray[i]);
156 }
157
158 embBtreeGetRsInfo(entry);
159
160 ajStrAssignC(&exclude, "");
161 ajStrAssignC(&filename, "nodes.dmp");
162 nfiles = embBtreeGetFiles(entry,directory,filename,exclude);
163 if(!nfiles)
164 ajDie("No input files in '%S' matched filename '%S'",
165 directory, filename);
166
167 ajStrAssignC(&filename, "names.dmp");
168 ajListPushAppend(entry->reffiles[0],(void *)filename);
169 filename = NULL;
170
171 embBtreeWriteEntryFile(entry);
172
173 embBtreeOpenCaches(entry);
174
175 starttime = ajTimeNewToday();
176
177 ajFmtPrintF(outf, "Processing directory: %S\n", directory);
178
179 ajFmtPrintS(&tmpstr,"%S%s",entry->directory,"merged.dmp");
180 mergefile = ajFileNewInNameS(tmpstr);
181
182 dbxtax_ParseMerged(mergefile);
183 ajFileClose(&mergefile);
184
185 for(i=0;i<nfiles;++i)
186 {
187 begintime = ajTimeNewToday();
188
189 ajListPop(entry->reffiles[0],(void **)&thysfile);
190 ajListPushAppend(entry->files,(void *)thysfile);
191 ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
192 if(!(infname=ajFileNewInNameS(tmpstr)))
193 ajFatal("Cannot open input file %S\n",tmpstr);
194
195 ajListPop(entry->files,(void **)&thysfile);
196 ajListPushAppend(entry->files,(void *)thysfile);
197 ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
198 if(!(infnode=ajFileNewInNameS(tmpstr)))
199 ajFatal("Cannot open input file %S\n",tmpstr);
200
201 ajFilenameTrimPath(&tmpstr);
202 ajFmtPrintF(outf,"Processing file: %S\n",tmpstr);
203
204 ientries = 0L;
205
206 while(dbxtax_NextEntry(entry, infnode, infname))
207 {
208 ++ientries;
209
210 if(entry->do_id)
211 embBtreeIndexEntry(entry, i);
212
213 if(accfield)
214 embBtreeIndexField(accfield, entry, i);
215
216 if(taxfield)
217 embBtreeIndexField(taxfield, entry, i);
218
219 if(rnkfield)
220 embBtreeIndexField(rnkfield, entry, i);
221
222 if(upfield)
223 embBtreeIndexField(upfield, entry, i);
224
225 if(gcfield)
226 embBtreeIndexField(gcfield, entry, i);
227
228 if(mgcfield)
229 embBtreeIndexField(mgcfield, entry, i);
230 }
231
232 ajFileClose(&infnode);
233 ajFileClose(&infname);
234 nentries += ientries;
235 nowtime = ajTimeNewToday();
236 ajFmtPrintF(outf, "entries: %Lu (%Lu) time: %.1fs (%.1fs)\n",
237 nentries, ientries,
238 ajTimeDiff(starttime, nowtime),
239 ajTimeDiff(begintime, nowtime));
240 ajTimeDel(&begintime);
241 ajTimeDel(&nowtime);
242 }
243
244 embBtreeDumpParameters(entry);
245 embBtreeCloseCaches(entry);
246
247 nowtime = ajTimeNewToday();
248 ajFmtPrintF(outf, "Total time: %.1fs\n", ajTimeDiff(starttime, nowtime));
249 ajTimeDel(&nowtime);
250 ajTimeDel(&starttime);
251
252
253 embBtreeReportEntry(outf, entry);
254
255 if(accfield)
256 embBtreeReportField(outf, accfield);
257 if(taxfield)
258 embBtreeReportField(outf, taxfield);
259 if(rnkfield)
260 embBtreeReportField(outf, rnkfield);
261 if(upfield)
262 embBtreeReportField(outf, upfield);
263 if(gcfield)
264 embBtreeReportField(outf, gcfield);
265 if(mgcfield)
266 embBtreeReportField(outf, mgcfield);
267
268 ajFileClose(&outf);
269 embBtreeEntryDel(&entry);
270 ajStrDel(&tmpstr);
271 ajStrDel(&filename);
272 ajStrDel(&exclude);
273 ajStrDel(&dbname);
274 ajStrDel(&dbrs);
275 ajStrDel(&release);
276 ajStrDel(&datestr);
277 ajStrDel(&directory);
278 ajStrDel(&indexdir);
279 ajStrDel(&dbtype);
280
281
282 nfields = 0;
283 while(fieldarray[nfields])
284 ajStrDel(&fieldarray[nfields++]);
285 AJFREE(fieldarray);
286
287 ajRegFree(&dbxtax_wrdexp);
288 ajStrDel(&nameline);
289
290 ajTablestrFree(&mergeTable);
291
292 embExit();
293
294 return 0;
295 }
296
297
298
299
300 /* @funcstatic dbxtax_NextEntry ********************************************
301 **
302 ** Parse the next entry from nodes and names files
303 **
304 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
305 ** @param [u] infnode [AjPFile] nodes file object ptr
306 ** @param [u] infname [AjPFile] names file object ptr
307 **
308 ** @return [AjBool] ajTrue on success, ajFalse if EOF
309 ** @@
310 ******************************************************************************/
311
dbxtax_NextEntry(EmbPBtreeEntry entry,AjPFile infnode,AjPFile infname)312 static AjBool dbxtax_NextEntry(EmbPBtreeEntry entry, AjPFile infnode,
313 AjPFile infname)
314 {
315 AjPStr line = NULL;
316 AjBool ok = ajTrue;
317
318 ajStrAssignC(&line,"");
319
320 entry->fpos = ajFileResetPos(infnode);
321
322 if(!ajReadlineTrim(infnode,&line))
323 {
324 ajStrDel(&line);
325 return ajFalse;
326 }
327
328 dbxtax_ParseNode(entry, line);
329
330 entry->reffpos[0] = namePos;
331
332 if(!nameline)
333 ok = ajReadlineTrim(infname,&nameline);
334
335 while(ok && dbxtax_ParseName(entry, nameline))
336 {
337 namePos = ajFileResetPos(infname);
338 ok = ajReadlineTrim(infname,&nameline);
339 }
340
341 ajStrDel(&line);
342
343 return ajTrue;
344 }
345
346
347
348
349 /* @funcstatic dbxtax_ParseNode ************************************************
350 **
351 ** Parse the node record
352 **
353 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
354 ** @param [r] line [const AjPStr] nodes.dmp record
355 ** @return [AjBool] ajTrue on success.
356 ** @@
357 ******************************************************************************/
358
dbxtax_ParseNode(EmbPBtreeEntry entry,const AjPStr line)359 static AjBool dbxtax_ParseNode(EmbPBtreeEntry entry,
360 const AjPStr line)
361 {
362 AjPStr tmpstr = NULL;
363 AjPStr tmpfd = NULL;
364
365 AjPStrTok handle = NULL;
366 const AjPStr oldids = NULL;
367
368 if(!dbxtax_wrdexp)
369 dbxtax_wrdexp = ajRegCompC("([A-Za-z0-9]+)");
370
371 handle = ajStrTokenNewC(line, "|");
372
373 if(!ajStrTokenNextParse(handle, &tmpstr)) /* taxid */
374 return ajFalse;
375 ajStrTrimWhite(&tmpstr);
376
377 ajStrAssignS(&entry->id, tmpstr);
378 oldids = ajTableFetchS(mergeTable, tmpstr);
379 if(oldids && accfield)
380 embBtreeParseField(oldids, dbxtax_wrdexp, accfield);
381
382 if(!ajStrTokenNextParse(handle, &tmpstr)) /* parent taxid */
383 return ajFalse;
384 ajStrTrimWhite(&tmpstr);
385 if(upfield && ajStrGetLen(tmpstr))
386 ajListPush(upfield->data,ajStrNewS(tmpstr));
387
388 if(!ajStrTokenNextParse(handle, &tmpstr)) /* rank */
389 return ajFalse;
390
391 ajStrTrimWhite(&tmpstr);
392
393 ajStrTrimWhite(&tmpstr);
394 if(rnkfield && ajStrGetLen(tmpstr))
395 ajListPush(rnkfield->data,ajStrNewS(tmpstr));
396
397 if(!ajStrTokenNextParse(handle, &tmpstr)) /* embl code */
398 return ajFalse;
399
400 if(!ajStrTokenNextParse(handle, &tmpstr)) /* division */
401 return ajFalse;
402
403 if(!ajStrTokenNextParse(handle, &tmpstr)) /* division flag */
404 return ajFalse;
405
406 if(!ajStrTokenNextParse(handle, &tmpstr)) /* gencode */
407 return ajFalse;
408 ajStrTrimWhite(&tmpstr);
409 if(gcfield && ajStrGetLen(tmpstr))
410 ajListPush(gcfield->data,ajStrNewS(tmpstr));
411
412 if(!ajStrTokenNextParse(handle, &tmpstr)) /* gencode flag */
413 return ajFalse;
414
415 if(!ajStrTokenNextParse(handle, &tmpstr)) /* mitocode */
416 return ajFalse;
417 ajStrTrimWhite(&tmpstr);
418 if(mgcfield && ajStrGetLen(tmpstr) && !ajStrMatchC(tmpstr, "0"))
419 ajListPush(mgcfield->data,ajStrNewS(tmpstr));
420
421 if(!ajStrTokenNextParse(handle, &tmpstr)) /* mitocode flag */
422 return ajFalse;
423
424 if(!ajStrTokenNextParse(handle, &tmpstr)) /* genbank hidden flag */
425 return ajFalse;
426
427 if(!ajStrTokenNextParse(handle, &tmpstr)) /* nosequence flag */
428 return ajFalse;
429
430 if(!ajStrTokenNextParse(handle, &tmpstr)) /* comments */
431 return ajFalse;
432
433 ajStrDel(&tmpstr);
434 ajStrDel(&tmpfd);
435 ajStrTokenDel(&handle);
436
437 return ajTrue;
438 }
439
440
441
442
443 /* @funcstatic dbxtax_ParseName ************************************************
444 **
445 ** Parse the next name record until the name does not match the entry
446 **
447 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
448 ** @param [r] line [const AjPStr] names.dmp record
449 ** @return [AjBool] ajTrue on success.
450 ** @@
451 ******************************************************************************/
452
dbxtax_ParseName(EmbPBtreeEntry entry,const AjPStr line)453 static AjBool dbxtax_ParseName(EmbPBtreeEntry entry, const AjPStr line)
454 {
455 AjPStr tmpstr = NULL;
456 AjPStr tmpname = NULL;
457
458 AjPStrTok handle = NULL;
459 ajlong pos;
460
461 if(!dbxtax_wrdexp)
462 dbxtax_wrdexp = ajRegCompC("([A-Za-z0-9]+)");
463
464 handle = ajStrTokenNewC(line, "|");
465
466 if(!ajStrTokenNextParse(handle, &tmpstr)) /* taxid */
467 return ajFalse;
468 ajStrTrimWhite(&tmpstr);
469 if(!ajStrMatchS(entry->id, tmpstr))
470 {
471 ajStrTokenDel(&handle);
472 ajStrDel(&tmpstr);
473 return ajFalse;
474 }
475
476 if(!ajStrTokenNextParse(handle, &tmpname)) /* name */
477 {
478 ajStrTokenDel(&handle);
479 ajStrDel(&tmpstr);
480 ajStrDel(&tmpname);
481 return ajFalse;
482 }
483
484 ajStrTrimWhite(&tmpname);
485
486 if(!ajStrTokenNextParse(handle, &tmpstr)) /* uniquename */
487 {
488 ajStrTokenDel(&handle);
489 ajStrDel(&tmpstr);
490 ajStrDel(&tmpname);
491 return ajFalse;
492 }
493
494 ajStrTrimWhite(&tmpstr);
495 /* if(ajStrGetLen(tmpstr))
496 ajStrAssignS(&tmpname, tmpstr);*/
497
498 if(!ajStrTokenNextParse(handle, &tmpstr)) /* nameclass */
499 {
500 ajStrTokenDel(&handle);
501 ajStrDel(&tmpstr);
502 ajStrDel(&tmpname);
503 return ajFalse;
504 }
505
506 ajStrTrimWhite(&tmpstr);
507
508 if(taxfield &&
509 ajStrGetLen(tmpname))
510 {
511 pos = ajStrFindAnyK(tmpname, '<');
512 if(pos > 0)
513 ajStrKeepRange(&tmpname, 0, pos-1);
514
515 ajStrRemoveWhiteExcess(&tmpname);
516 if(ajStrMatchC(tmpstr, "scientific name"))
517 ajListPush(taxfield->data,ajStrNewS(tmpname));
518 if(ajStrSuffixC(tmpstr, "common name"))
519 ajListPush(taxfield->data,ajStrNewS(tmpname));
520 if(ajStrMatchC(tmpstr, "synonym"))
521 {
522 pos = ajStrFindAnyK(tmpname, '(');
523 if(pos > 0)
524 ajStrKeepRange(&tmpname, 0, pos-1);
525 ajListPush(taxfield->data,ajStrNewS(tmpname));
526 }
527 }
528
529 ajStrDel(&tmpname);
530 ajStrDel(&tmpstr);
531 ajStrTokenDel(&handle);
532
533 return ajTrue;
534 }
535
536
537
538
539 /* @funcstatic dbxtax_ParseMerged *********************************************
540 **
541 ** Parse the merged taxid file
542 **
543 ** @param [u] mergefile [AjPFile] Input file (merged.dmp)
544 **
545 ** @return [void]
546 ** @@
547 ******************************************************************************/
548
dbxtax_ParseMerged(AjPFile mergefile)549 static void dbxtax_ParseMerged(AjPFile mergefile)
550 {
551 AjPStr line = NULL;
552 AjPStr oldid = NULL;
553 AjPStr newid = NULL;
554 AjPStr record = NULL;
555 AjPStrTok handle = NULL;
556
557 mergeTable = ajTablestrNew(20000);
558
559 while(ajReadlineTrim(mergefile, &line))
560 {
561 ajStrTokenAssignC(&handle, line, "\t|");
562 ajStrTokenNextParse(handle, &oldid);
563 ajStrTokenNextParse(handle, &newid);
564 record = ajTableFetchmodS(mergeTable, newid);
565 if(!record)
566 {
567 ajTablePut(mergeTable, ajStrNewS(newid), ajStrNewS(oldid));
568 }
569 else
570 {
571 ajStrAppendK(&record, ' ');
572 ajStrAppendS(&record, oldid);
573 }
574 }
575
576 ajStrTokenDel(&handle);
577 ajStrDel(&newid);
578 ajStrDel(&oldid);
579 ajStrDel(&line);
580
581 return;
582 }
583
584
585
586
587