1 /* @source dbxtax application
2 **
3 ** Index NCBI's taxonomy database
4 **
5 ** @author Copyright (C) Peter Rice (pmr@ebi.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 
25 static AjPRegexp dbxtax_wrdexp = NULL;
26 
27 static AjPStr nameline = NULL;
28 
29 static AjPTable mergeTable = NULL;
30 
31 static ajlong namePos = 0L;
32 
33 static AjBool dbxtax_NextEntry(EmbPBtreeEntry entry, AjPFile infnode,
34                                AjPFile infname);
35 static AjBool dbxtax_ParseName(EmbPBtreeEntry entry, const AjPStr line);
36 static AjBool dbxtax_ParseNode(EmbPBtreeEntry entry, const AjPStr line);
37 static void   dbxtax_ParseMerged(AjPFile mergefile);
38 
39 EmbPBtreeField accfield = NULL;
40 EmbPBtreeField taxfield = NULL;
41 EmbPBtreeField rnkfield = NULL;
42 EmbPBtreeField upfield = NULL;
43 EmbPBtreeField gcfield = NULL;
44 EmbPBtreeField mgcfield = NULL;
45 
46 
47 
48 
49 /* @prog dbxtax ************************************************************
50 **
51 ** Index the NCBI taxonomy database
52 **
53 ******************************************************************************/
54 
main(int argc,char ** argv)55 int main(int argc, char **argv)
56 {
57     EmbPBtreeEntry entry = NULL;
58 
59     AjPStr dbname   = NULL;
60     AjPStr dbrs     = NULL;
61     AjPStr release  = NULL;
62     AjPStr datestr  = NULL;
63 
64     AjPStr directory;
65     AjPStr indexdir;
66     AjPStr filename = NULL;
67     AjPStr exclude = NULL;
68     AjPStr dbtype = NULL;
69     AjPFile outf = NULL;
70 
71     AjBool compressed = ajTrue;
72 
73     AjPFile mergefile = NULL;
74     AjPStr *fieldarray = NULL;
75 
76     ajint nfields;
77     ajint nfiles;
78 
79     AjPStr tmpstr = NULL;
80     AjPStr thysfile = NULL;
81 
82     ajint i;
83     AjPFile infnode = NULL;
84     AjPFile infname = NULL;
85 
86     ajulong nentries = 0L;
87     ajulong ientries = 0L;
88     AjPTime starttime = NULL;
89     AjPTime begintime = NULL;
90     AjPTime nowtime = NULL;
91 
92     embInit("dbxtax", argc, argv);
93 
94     fieldarray = ajAcdGetList("fields");
95     directory  = ajAcdGetDirectoryName("directory");
96     outf       = ajAcdGetOutfile("outfile");
97     indexdir   = ajAcdGetOutdirName("indexoutdir");
98     dbname     = ajAcdGetString("dbname");
99     dbrs       = ajAcdGetString("dbresource");
100     release    = ajAcdGetString("release");
101     datestr    = ajAcdGetString("date");
102     compressed = ajAcdGetBoolean("compressed");
103 
104     entry = embBtreeEntryNew(1);
105     tmpstr = ajStrNew();
106 
107     dbtype = ajStrNewC("taxonomy");
108 
109     nfields = embBtreeSetFields(entry,fieldarray);
110     embBtreeSetDbInfo(entry,dbname,dbrs,datestr,release,dbtype,directory,
111 		      indexdir);
112     entry->compressed = compressed;
113 
114     for(i=0; i< nfields; i++)
115     {
116         if(ajStrMatchC(fieldarray[i], "acc"))
117         {
118             accfield = embBtreeGetFieldS(entry, fieldarray[i]);
119             if(compressed)
120                 embBtreeFieldSetCompressed(accfield);
121         }
122         else if(ajStrMatchC(fieldarray[i], "up"))
123         {
124             upfield = embBtreeGetFieldS(entry, fieldarray[i]);
125             if(compressed)
126                 embBtreeFieldSetCompressed(upfield);
127             embBtreeFieldSetIdtype(upfield);
128         }
129         else if(ajStrMatchC(fieldarray[i], "tax"))
130         {
131             taxfield = embBtreeGetFieldS(entry, fieldarray[i]);
132             if(compressed)
133                 embBtreeFieldSetCompressed(taxfield);
134             embBtreeFieldSetIdtype(taxfield);
135         }
136         else if(ajStrMatchC(fieldarray[i], "rnk"))
137         {
138             rnkfield = embBtreeGetFieldS(entry, fieldarray[i]);
139             if(compressed)
140                 embBtreeFieldSetCompressed(rnkfield);
141         }
142         else if(ajStrMatchC(fieldarray[i], "gc"))
143         {
144             gcfield = embBtreeGetFieldS(entry, fieldarray[i]);
145             if(compressed)
146                 embBtreeFieldSetCompressed(gcfield);
147         }
148         else if(ajStrMatchC(fieldarray[i], "mgc"))
149         {
150             mgcfield = embBtreeGetFieldS(entry, fieldarray[i]);
151             if(compressed)
152                 embBtreeFieldSetCompressed(mgcfield);
153         }
154         else if(!ajStrMatchC(fieldarray[i], "id"))
155             ajErr("Unknown field '%S' specified for indexing", fieldarray[i]);
156     }
157 
158     embBtreeGetRsInfo(entry);
159 
160     ajStrAssignC(&exclude, "");
161     ajStrAssignC(&filename, "nodes.dmp");
162     nfiles = embBtreeGetFiles(entry,directory,filename,exclude);
163     if(!nfiles)
164         ajDie("No input files in '%S' matched filename '%S'",
165               directory, filename);
166 
167     ajStrAssignC(&filename, "names.dmp");
168     ajListPushAppend(entry->reffiles[0],(void *)filename);
169     filename = NULL;
170 
171     embBtreeWriteEntryFile(entry);
172 
173     embBtreeOpenCaches(entry);
174 
175     starttime = ajTimeNewToday();
176 
177     ajFmtPrintF(outf, "Processing directory: %S\n", directory);
178 
179     ajFmtPrintS(&tmpstr,"%S%s",entry->directory,"merged.dmp");
180     mergefile = ajFileNewInNameS(tmpstr);
181 
182     dbxtax_ParseMerged(mergefile);
183     ajFileClose(&mergefile);
184 
185     for(i=0;i<nfiles;++i)
186     {
187         begintime = ajTimeNewToday();
188 
189 	ajListPop(entry->reffiles[0],(void **)&thysfile);
190 	ajListPushAppend(entry->files,(void *)thysfile);
191 	ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
192 	if(!(infname=ajFileNewInNameS(tmpstr)))
193 	    ajFatal("Cannot open input file %S\n",tmpstr);
194 
195 	ajListPop(entry->files,(void **)&thysfile);
196 	ajListPushAppend(entry->files,(void *)thysfile);
197 	ajFmtPrintS(&tmpstr,"%S%S",entry->directory,thysfile);
198 	if(!(infnode=ajFileNewInNameS(tmpstr)))
199 	    ajFatal("Cannot open input file %S\n",tmpstr);
200 
201 	ajFilenameTrimPath(&tmpstr);
202 	ajFmtPrintF(outf,"Processing file: %S\n",tmpstr);
203 
204 	ientries = 0L;
205 
206 	while(dbxtax_NextEntry(entry, infnode, infname))
207 	{
208 	    ++ientries;
209 
210 	    if(entry->do_id)
211                 embBtreeIndexEntry(entry, i);
212 
213             if(accfield)
214                 embBtreeIndexField(accfield, entry, i);
215 
216 	    if(taxfield)
217                 embBtreeIndexField(taxfield, entry, i);
218 
219 	    if(rnkfield)
220                 embBtreeIndexField(rnkfield, entry, i);
221 
222 	    if(upfield)
223                 embBtreeIndexField(upfield, entry, i);
224 
225 	    if(gcfield)
226                 embBtreeIndexField(gcfield, entry, i);
227 
228 	    if(mgcfield)
229                 embBtreeIndexField(mgcfield, entry, i);
230 	}
231 
232 	ajFileClose(&infnode);
233 	ajFileClose(&infname);
234 	nentries += ientries;
235 	nowtime = ajTimeNewToday();
236 	ajFmtPrintF(outf, "entries: %Lu (%Lu) time: %.1fs (%.1fs)\n",
237 		    nentries, ientries,
238 		    ajTimeDiff(starttime, nowtime),
239 		    ajTimeDiff(begintime, nowtime));
240 	ajTimeDel(&begintime);
241 	ajTimeDel(&nowtime);
242     }
243 
244     embBtreeDumpParameters(entry);
245     embBtreeCloseCaches(entry);
246 
247     nowtime = ajTimeNewToday();
248     ajFmtPrintF(outf, "Total time: %.1fs\n", ajTimeDiff(starttime, nowtime));
249     ajTimeDel(&nowtime);
250     ajTimeDel(&starttime);
251 
252 
253     embBtreeReportEntry(outf, entry);
254 
255     if(accfield)
256         embBtreeReportField(outf, accfield);
257     if(taxfield)
258         embBtreeReportField(outf, taxfield);
259     if(rnkfield)
260         embBtreeReportField(outf, rnkfield);
261     if(upfield)
262         embBtreeReportField(outf, upfield);
263     if(gcfield)
264         embBtreeReportField(outf, gcfield);
265     if(mgcfield)
266         embBtreeReportField(outf, mgcfield);
267 
268     ajFileClose(&outf);
269     embBtreeEntryDel(&entry);
270     ajStrDel(&tmpstr);
271     ajStrDel(&filename);
272     ajStrDel(&exclude);
273     ajStrDel(&dbname);
274     ajStrDel(&dbrs);
275     ajStrDel(&release);
276     ajStrDel(&datestr);
277     ajStrDel(&directory);
278     ajStrDel(&indexdir);
279     ajStrDel(&dbtype);
280 
281 
282     nfields = 0;
283     while(fieldarray[nfields])
284 	ajStrDel(&fieldarray[nfields++]);
285     AJFREE(fieldarray);
286 
287     ajRegFree(&dbxtax_wrdexp);
288     ajStrDel(&nameline);
289 
290     ajTablestrFree(&mergeTable);
291 
292     embExit();
293 
294     return 0;
295 }
296 
297 
298 
299 
300 /* @funcstatic dbxtax_NextEntry ********************************************
301 **
302 ** Parse the next entry from nodes and names files
303 **
304 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
305 ** @param [u] infnode [AjPFile] nodes file object ptr
306 ** @param [u] infname [AjPFile] names file object ptr
307 **
308 ** @return [AjBool] ajTrue on success, ajFalse if EOF
309 ** @@
310 ******************************************************************************/
311 
dbxtax_NextEntry(EmbPBtreeEntry entry,AjPFile infnode,AjPFile infname)312 static AjBool dbxtax_NextEntry(EmbPBtreeEntry entry, AjPFile infnode,
313                                AjPFile infname)
314 {
315     AjPStr line = NULL;
316     AjBool ok = ajTrue;
317 
318     ajStrAssignC(&line,"");
319 
320     entry->fpos = ajFileResetPos(infnode);
321 
322     if(!ajReadlineTrim(infnode,&line))
323     {
324         ajStrDel(&line);
325         return ajFalse;
326     }
327 
328     dbxtax_ParseNode(entry, line);
329 
330     entry->reffpos[0] = namePos;
331 
332     if(!nameline)
333         ok = ajReadlineTrim(infname,&nameline);
334 
335     while(ok && dbxtax_ParseName(entry, nameline))
336     {
337         namePos = ajFileResetPos(infname);
338         ok = ajReadlineTrim(infname,&nameline);
339     }
340 
341     ajStrDel(&line);
342 
343     return ajTrue;
344 }
345 
346 
347 
348 
349 /* @funcstatic dbxtax_ParseNode ************************************************
350 **
351 ** Parse the node record
352 **
353 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
354 ** @param [r] line [const AjPStr] nodes.dmp record
355 ** @return [AjBool] ajTrue on success.
356 ** @@
357 ******************************************************************************/
358 
dbxtax_ParseNode(EmbPBtreeEntry entry,const AjPStr line)359 static AjBool dbxtax_ParseNode(EmbPBtreeEntry entry,
360                                const AjPStr line)
361 {
362     AjPStr tmpstr  = NULL;
363     AjPStr tmpfd  = NULL;
364 
365     AjPStrTok handle = NULL;
366     const AjPStr oldids = NULL;
367 
368     if(!dbxtax_wrdexp)
369 	dbxtax_wrdexp = ajRegCompC("([A-Za-z0-9]+)");
370 
371     handle = ajStrTokenNewC(line, "|");
372 
373     if(!ajStrTokenNextParse(handle, &tmpstr)) /* taxid */
374         return ajFalse;
375     ajStrTrimWhite(&tmpstr);
376 
377     ajStrAssignS(&entry->id, tmpstr);
378     oldids = ajTableFetchS(mergeTable, tmpstr);
379     if(oldids && accfield)
380         embBtreeParseField(oldids, dbxtax_wrdexp, accfield);
381 
382     if(!ajStrTokenNextParse(handle, &tmpstr)) /* parent taxid */
383         return ajFalse;
384     ajStrTrimWhite(&tmpstr);
385     if(upfield && ajStrGetLen(tmpstr))
386 	ajListPush(upfield->data,ajStrNewS(tmpstr));
387 
388     if(!ajStrTokenNextParse(handle, &tmpstr)) /* rank */
389         return ajFalse;
390 
391     ajStrTrimWhite(&tmpstr);
392 
393     ajStrTrimWhite(&tmpstr);
394     if(rnkfield && ajStrGetLen(tmpstr))
395 	ajListPush(rnkfield->data,ajStrNewS(tmpstr));
396 
397     if(!ajStrTokenNextParse(handle, &tmpstr)) /* embl code */
398         return ajFalse;
399 
400     if(!ajStrTokenNextParse(handle, &tmpstr)) /* division */
401         return ajFalse;
402 
403     if(!ajStrTokenNextParse(handle, &tmpstr)) /* division flag */
404         return ajFalse;
405 
406     if(!ajStrTokenNextParse(handle, &tmpstr)) /* gencode */
407         return ajFalse;
408     ajStrTrimWhite(&tmpstr);
409     if(gcfield && ajStrGetLen(tmpstr))
410 	ajListPush(gcfield->data,ajStrNewS(tmpstr));
411 
412     if(!ajStrTokenNextParse(handle, &tmpstr)) /* gencode flag */
413         return ajFalse;
414 
415     if(!ajStrTokenNextParse(handle, &tmpstr)) /* mitocode */
416         return ajFalse;
417     ajStrTrimWhite(&tmpstr);
418     if(mgcfield && ajStrGetLen(tmpstr) && !ajStrMatchC(tmpstr, "0"))
419 	ajListPush(mgcfield->data,ajStrNewS(tmpstr));
420 
421     if(!ajStrTokenNextParse(handle, &tmpstr)) /* mitocode flag */
422         return ajFalse;
423 
424     if(!ajStrTokenNextParse(handle, &tmpstr)) /* genbank hidden flag */
425         return ajFalse;
426 
427     if(!ajStrTokenNextParse(handle, &tmpstr)) /* nosequence flag */
428         return ajFalse;
429 
430     if(!ajStrTokenNextParse(handle, &tmpstr)) /* comments */
431         return ajFalse;
432 
433     ajStrDel(&tmpstr);
434     ajStrDel(&tmpfd);
435     ajStrTokenDel(&handle);
436 
437     return ajTrue;
438 }
439 
440 
441 
442 
443 /* @funcstatic dbxtax_ParseName ************************************************
444 **
445 ** Parse the next name record until the name does not match the entry
446 **
447 ** @param [u] entry [EmbPBtreeEntry] entry object ptr
448 ** @param [r] line [const AjPStr] names.dmp record
449 ** @return [AjBool] ajTrue on success.
450 ** @@
451 ******************************************************************************/
452 
dbxtax_ParseName(EmbPBtreeEntry entry,const AjPStr line)453 static AjBool dbxtax_ParseName(EmbPBtreeEntry entry, const AjPStr line)
454 {
455     AjPStr tmpstr = NULL;
456     AjPStr tmpname = NULL;
457 
458     AjPStrTok handle = NULL;
459     ajlong pos;
460 
461     if(!dbxtax_wrdexp)
462 	dbxtax_wrdexp = ajRegCompC("([A-Za-z0-9]+)");
463 
464     handle = ajStrTokenNewC(line, "|");
465 
466     if(!ajStrTokenNextParse(handle, &tmpstr)) /* taxid */
467         return ajFalse;
468     ajStrTrimWhite(&tmpstr);
469     if(!ajStrMatchS(entry->id, tmpstr))
470     {
471         ajStrTokenDel(&handle);
472         ajStrDel(&tmpstr);
473         return ajFalse;
474     }
475 
476     if(!ajStrTokenNextParse(handle, &tmpname)) /* name */
477     {
478         ajStrTokenDel(&handle);
479         ajStrDel(&tmpstr);
480         ajStrDel(&tmpname);
481         return ajFalse;
482     }
483 
484     ajStrTrimWhite(&tmpname);
485 
486     if(!ajStrTokenNextParse(handle, &tmpstr)) /* uniquename */
487     {
488         ajStrTokenDel(&handle);
489         ajStrDel(&tmpstr);
490         ajStrDel(&tmpname);
491         return ajFalse;
492     }
493 
494     ajStrTrimWhite(&tmpstr);
495 /*    if(ajStrGetLen(tmpstr))
496       ajStrAssignS(&tmpname, tmpstr);*/
497 
498     if(!ajStrTokenNextParse(handle, &tmpstr)) /* nameclass */
499     {
500         ajStrTokenDel(&handle);
501         ajStrDel(&tmpstr);
502         ajStrDel(&tmpname);
503         return ajFalse;
504     }
505 
506     ajStrTrimWhite(&tmpstr);
507 
508     if(taxfield &&
509        ajStrGetLen(tmpname))
510     {
511         pos = ajStrFindAnyK(tmpname, '<');
512         if(pos > 0)
513             ajStrKeepRange(&tmpname, 0, pos-1);
514 
515         ajStrRemoveWhiteExcess(&tmpname);
516         if(ajStrMatchC(tmpstr, "scientific name"))
517             ajListPush(taxfield->data,ajStrNewS(tmpname));
518         if(ajStrSuffixC(tmpstr, "common name"))
519             ajListPush(taxfield->data,ajStrNewS(tmpname));
520         if(ajStrMatchC(tmpstr, "synonym"))
521         {
522             pos = ajStrFindAnyK(tmpname, '(');
523             if(pos > 0)
524                 ajStrKeepRange(&tmpname, 0, pos-1);
525             ajListPush(taxfield->data,ajStrNewS(tmpname));
526         }
527     }
528 
529     ajStrDel(&tmpname);
530     ajStrDel(&tmpstr);
531     ajStrTokenDel(&handle);
532 
533     return ajTrue;
534 }
535 
536 
537 
538 
539 /* @funcstatic dbxtax_ParseMerged *********************************************
540 **
541 ** Parse the merged taxid file
542 **
543 ** @param [u] mergefile [AjPFile] Input file (merged.dmp)
544 **
545 ** @return [void]
546 ** @@
547 ******************************************************************************/
548 
dbxtax_ParseMerged(AjPFile mergefile)549 static void dbxtax_ParseMerged(AjPFile mergefile)
550 {
551     AjPStr line = NULL;
552     AjPStr oldid = NULL;
553     AjPStr newid = NULL;
554     AjPStr record = NULL;
555     AjPStrTok handle = NULL;
556 
557     mergeTable = ajTablestrNew(20000);
558 
559     while(ajReadlineTrim(mergefile, &line))
560     {
561         ajStrTokenAssignC(&handle, line, "\t|");
562         ajStrTokenNextParse(handle, &oldid);
563         ajStrTokenNextParse(handle, &newid);
564         record = ajTableFetchmodS(mergeTable, newid);
565         if(!record)
566         {
567             ajTablePut(mergeTable, ajStrNewS(newid), ajStrNewS(oldid));
568         }
569         else
570         {
571             ajStrAppendK(&record, ' ');
572             ajStrAppendS(&record, oldid);
573         }
574     }
575 
576     ajStrTokenDel(&handle);
577     ajStrDel(&newid);
578     ajStrDel(&oldid);
579     ajStrDel(&line);
580 
581     return;
582 }
583 
584 
585 
586 
587