1 /* @source dbiblast application
2 **
3 ** Index blast databases
4 **
5 ** @author Copyright (C) Peter Rice, Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22 /******************************************************************************
23 **
24 ** EMBOSS/Staden/EMBLCD indexing
25 **
26 ** This version reads a BLAST formatted database,
27 ** and writes entryname and accession index files.
28 **
29 ** It helps to know the format in order to
30 ** parse the entryname and accession number,
31 ** but it can guess if necessary.
32 **
33 ** It also helps to know the type (blast1 or blast2)
34 ** and the sequence type (protein or nucleic) but again
35 ** it can guess by looking at the file extensions.
36 **
37 ** To save memory, it is also helpful to know the maximum number of
38 ** entries in the database and the maximum entryname length so that
39 ** space can be preallocated for storage.
40 **
41 ** Entry names and accession numbers are held in list structures,
42 ** then converted to arrays and sorted.
43 **
44 ** Multiple input files are allowed.
45 **
46 ** EMBLCD and Staden index files use different names but have essentially
47 ** the same contents.
48 **
49 ******************************************************************************/
50
51 #include "emboss.h"
52 #ifndef WIN32
53 #include <dirent.h>
54 #include <sys/wait.h>
55 #include <sys/stat.h>
56 #include <sys/mman.h>
57 #ifndef _AIX
58 #include <sys/fcntl.h>
59 #else
60 #include <fcntl.h>
61 #endif
62 #endif
63 #include <string.h>
64
65 #ifndef MAP_FILE /* Solaris does not have MAP_FILE */
66 #define MAP_FILE 0
67 #endif
68
69 #define BLASTIDUNKNOWN 0
70 #define BLASTIDANY 1
71 #define BLASTIDNCBI 2
72 #define BLASTIDGCG 3
73 #define BLASTIDSANGER 4
74 #define BLASTPREFNCBI 1
75
76 #ifdef WIN32
77 typedef char* caddr_t;
78 #endif
79
80 #define TABLESIZE 10000
81 #define HDRSIZE 1000
82
83 /* Definition of global variables */
84
85 static AjPList* dbiblastGFdl = NULL;
86
87 static AjPRegexp dbiblastGWrdexp = NULL;
88
89 static AjPStr dbiblastGId = NULL;
90 static AjPStr dbiblastGHline = NULL;
91 static AjPStr dbiblastGAcc = NULL;
92 static AjPStr dbiblastGTmpT = NULL;
93 static AjPStr dbiblastGTmpDes = NULL;
94 static AjPStr dbiblastGTmpFd = NULL;
95 static AjPStr dbiblastGTmpAc = NULL;
96 static AjPStr dbiblastGTmpSv = NULL;
97 static AjPStr dbiblastGTmpGi = NULL;
98 static AjPStr dbiblastGTmpDb = NULL;
99
100 static EmbPEntry dbiblastGEntry = NULL;
101
102
103
104
105 /* @datastatic PMemFile *******************************************************
106 **
107 ** DbiBlast in-memory file
108 **
109 ** @attr File [AjPFile] Ajax file
110 ** @attr IsMem [AjBool] True if in memory mapped
111 ** @attr Fd [ajint] Unix file descriptor (integer)
112 ** @attr Pos [ajlong] Position in file/memory
113 ** @attr Size [ajlong] Size of file/memory
114 ** @attr Name [AjPStr] Name of file
115 ** @attr Mem [caddr_t] Memory map
116 ******************************************************************************/
117
118 typedef struct SMemFile
119 {
120 AjPFile File;
121 AjBool IsMem;
122 ajint Fd;
123 ajlong Pos;
124 ajlong Size;
125 AjPStr Name;
126 caddr_t Mem;
127 } OMemFile;
128
129 #define PMemFile OMemFile*
130
131
132
133
134 /* @datastatic PBlastDb *******************************************************
135 **
136 ** DbiBlast database
137 **
138 ** @attr DbType [ajint] database type indicator
139 ** @attr DbFormat [ajint] database format (version) indicator
140 ** @attr IsProtein [ajint] 1 for protein
141 ** @attr IsBlast2 [ajint] 1 for blast2, 0 for blast1
142 ** @attr TitleLen [ajint] length of database title
143 ** @attr DateLen [ajint] length of database date string
144 ** @attr LineLen [ajint] length of database lines
145 ** @attr HeaderLen [ajint] bytes before tables start
146 ** @attr CompLen [ajint] length of compressed seq file
147 ** @attr MaxSeqLen [ajint] max. entry length
148 ** @attr TotLen [ajint] number of bases or residues in database
149 ** @attr CleanCount [ajint] count of cleaned 8mers
150 ** @attr TopCmp [ajint] bytes before compressed table starts
151 ** @attr TopSrc [ajint] bytes before source table starts
152 ** @attr TopHdr [ajint] bytes before headers table starts
153 ** @attr TopAmb [ajint] bytes before ambiguity table starts
154 ** @attr IdType [ajint] ID type
155 ** @attr IdPrefix [ajint] ID prefix type
156 ** @attr TFile [PMemFile] table of offsets, also DB info
157 ** @attr HFile [PMemFile] description lines
158 ** @attr SFile [PMemFile] binary sequence data
159 ** @attr FFile [PMemFile] source sequence data
160 ** @attr Title [AjPStr] database title
161 ** @attr Date [AjPStr] database date
162 ** @attr Name [AjPStr] database base file name
163 ** @attr Size [ajint] number of database entries
164 ** @attr Padding [char[4]] Padding to alignment boundary
165 ******************************************************************************/
166
167 typedef struct SBlastDb
168 {
169 ajint DbType;
170 ajint DbFormat;
171 ajint IsProtein;
172 ajint IsBlast2;
173 ajint TitleLen;
174 ajint DateLen;
175 ajint LineLen;
176 ajint HeaderLen;
177 ajint CompLen;
178 ajint MaxSeqLen;
179 ajint TotLen;
180 ajint CleanCount;
181 ajint TopCmp;
182 ajint TopSrc;
183 ajint TopHdr;
184 ajint TopAmb;
185 ajint IdType;
186 ajint IdPrefix;
187 PMemFile TFile;
188 PMemFile HFile;
189 PMemFile SFile;
190 PMemFile FFile;
191 AjPStr Title;
192 AjPStr Date;
193 AjPStr Name;
194 ajint Size;
195 char Padding[4];
196 } OBlastDb;
197
198 #define PBlastDb OBlastDb*
199
200
201
202
203 /* @datastatic PBlastType *****************************************************
204 **
205 ** DbiBlast types
206 **
207 ** @attr ExtT [const char*] Table filename extension
208 ** @attr ExtH [const char*] Header filename extension
209 ** @attr ExtS [const char*] Sequence filename extension
210 ** @attr IsProtein [AjBool] true for protein
211 ** @attr IsBlast2 [AjBool] blast2.x or blast 1.x
212 ** @attr Type [ajint] enumerated type
213 ** @attr Padding [char[4]] Padding to alignment boundary
214 ******************************************************************************/
215
216 typedef struct SBlastType
217 {
218 const char* ExtT;
219 const char* ExtH;
220 const char* ExtS;
221 AjBool IsProtein;
222 AjBool IsBlast2;
223 ajint Type;
224 char Padding[4];
225 } OBlastType;
226
227 #define PBlastType OBlastType*
228
229
230
231
232 enum blastdbtype {BLAST1P, BLAST1N, BLAST2P, BLAST2N};
233
234 static OBlastType blasttypes[] =
235 {
236 {"atb", "ahd", "bsq", AJTRUE, AJFALSE, BLAST1P, ""},
237 {"ntb", "nhd", "csq", AJFALSE, AJFALSE, BLAST1N, ""},
238 {"pin", "phr", "psq", AJTRUE, AJTRUE, BLAST2P, ""},
239 {"nin", "nhr", "nsq", AJFALSE, AJTRUE, BLAST2N, ""},
240 {NULL, NULL, NULL, 0, 0, 0, ""}
241 };
242
243
244
245
246 static AjBool dbiblast_parseNcbi(const AjPStr line, AjPFile * alistfile,
247 AjBool systemsort, AjPStr const * fields,
248 ajint* maxFieldLen,
249 ajuint* countfield,
250 AjPStr* myid, AjPList* myfdl);
251 static AjBool dbiblast_parseGcg(const AjPStr line, AjPFile * alistfile,
252 AjBool systemsort, AjPStr const * fields,
253 ajint* maxFieldLen,
254 ajuint* countfield,
255 AjPStr* myid, AjPList* myfdl);
256 static AjBool dbiblast_parseSimple(const AjPStr line,
257 AjPFile * alistfile,
258 AjBool systemsort, AjPStr const * fields,
259 ajint* maxFieldLen,
260 ajuint* countfield,
261 AjPStr* myid, AjPList* myfdl);
262 static AjBool dbiblast_parseId(const AjPStr line, AjPFile * alistfile,
263 AjBool systemsort, AjPStr const * fields,
264 ajint* maxFieldLen,
265 ajuint* countfield,
266 AjPStr* myid, AjPList* myfdl);
267 static AjBool dbiblast_parseUnknown(const AjPStr line,
268 AjPFile * alistfile,
269 AjBool systemsort, AjPStr const * fields,
270 ajint* maxFieldLen,
271 ajuint* countfield,
272 AjPStr* myid, AjPList* myfdl);
273
274
275
276
277 /* @datastatic OParser ********************************************************
278 **
279 ** Parser definition structure
280 **
281 ** @alias SParser
282 ** @alias OParser
283 **
284 ** @attr Name [const char*] Parser name
285 ** @attr Parser [AjBool function] Parser function
286 ** @@
287 ******************************************************************************/
288
289 typedef struct SParser
290 {
291 const char* Name;
292 AjBool (*Parser) (const AjPStr line, AjPFile * alistfile,
293 AjBool systemsort, AjPStr const * fields,
294 ajint* maxFieldLen, ajuint* countfield,
295 AjPStr* myid, AjPList* myfdl);
296 } OParser;
297
298 static OParser parser[] =
299 {
300 {"NCBI", dbiblast_parseNcbi},
301 {"GCG", dbiblast_parseGcg},
302 {"SIMPLE", dbiblast_parseSimple},
303 {"ID", dbiblast_parseId},
304 {"UNKNOWN", dbiblast_parseUnknown},
305 {NULL, NULL}
306 };
307
308 static EmbPEntry dbiblast_nextblastentry(PBlastDb db, ajint ifile,
309 const AjPStr idformat,
310 AjBool systemsort,
311 AjPStr const * fields,
312 ajint* maxFieldLen,
313 ajuint* maxidlen,
314 ajuint* countfield,
315 AjPFile elistfile,
316 AjPFile * alistfile);
317 static AjBool dbiblast_blastopenlib(const AjPStr lname, AjBool usesrc,
318 ajint blastv, char dbtype,
319 PBlastDb* pdb);
320
321 static void dbiblast_dbfree(PBlastDb* pdb);
322
323 static void dbiblast_dbname(AjPStr* dbname,
324 const AjPStr oname, const char *suff);
325 static void dbiblast_newname(AjPStr* nname,
326 const AjPStr oname, const char *suff);
327
328 static void dbiblast_memreadUInt4(PMemFile fd, ajuint *val);
329
330 static PMemFile dbiblast_memfopenfile(const AjPStr name);
331 static void dbiblast_memfclosefile(PMemFile* pfd);
332 static size_t dbiblast_memfseek(PMemFile mf, ajlong offset, ajint whence);
333 static size_t dbiblast_memfread(void* dest, size_t size, size_t num_items,
334 PMemFile mf);
335 static size_t dbiblast_memfreadS(AjPStr* dest, size_t size, size_t num_items,
336 PMemFile mf);
337
338 static ajint dbiblast_loadtable(ajuint* table, ajint isize, PBlastDb db,
339 ajint top, ajint pos);
340 static ajint dbiblast_ncblreadhdr(AjPStr* hdrline, PBlastDb db,
341 ajint start, ajint end);
342 static AjBool dbiblast_wrongtype(const AjPStr oname, const char *suff);
343
344 static AjBool readReverse = AJFALSE;
345
346
347
348
349 /* @prog dbiblast *************************************************************
350 **
351 ** Index a BLAST database
352 **
353 ******************************************************************************/
354
main(int argc,char ** argv)355 int main(int argc, char **argv)
356 {
357
358 AjPList idlist;
359 AjPList* fieldList = NULL;
360
361 AjBool systemsort;
362 AjBool cleanup;
363
364 ajint blastv = 0;
365 char dbtype = '\0';
366
367 ajuint maxindex;
368 ajuint maxidlen = 0;
369 ajuint maxlen;
370
371 AjPStr version = NULL;
372 AjPStr seqtype = NULL;
373
374 AjPFile elistfile = NULL;
375 AjPFile* alistfile = NULL;
376
377 AjPStr dbname = NULL;
378 AjPStr release = NULL;
379 AjPStr datestr = NULL;
380 AjPStr sortopt = NULL;
381 void **entryIds = NULL;
382
383 AjBool usesrc = AJTRUE;
384
385 AjPStr directory;
386 AjPStr indexdir;
387 AjPStr filename;
388 AjPStr exclude;
389 AjPStr curfilename = NULL;
390
391 AjPStr idformat = NULL;
392
393 EmbPEntry entry;
394
395 PBlastDb db = NULL;
396
397 ajuint idCount = 0;
398 ajuint idDone;
399 AjPList listTestFiles = NULL;
400 void ** testFiles = NULL;
401 ajuint nfiles;
402 ajuint ifile;
403 ajuint jfile;
404
405 ajuint filesize;
406 short recsize;
407 ajuint maxfilelen = 20;
408 char date[4] =
409 {
410 0,0,0,0
411 };
412
413 AjPStr tmpfname = NULL;
414 AjPStr* fields = NULL;
415
416 AjPFile entFile = NULL;
417
418 AjPStr* divfiles = NULL;
419 ajint* maxFieldLen = NULL;
420
421 ajuint ifield = 0;
422 ajuint nfields = 0;
423
424 AjPFile logfile = NULL;
425 ajuint* countField = NULL;
426 ajuint* fieldTot = NULL;
427 ajuint idCountFile = 0;
428 ajuint i = 0;
429
430 embInit("dbiblast", argc, argv);
431
432 idformat = ajStrNewC("NCBI");
433
434 fields = ajAcdGetList("fields");
435 directory = ajAcdGetDirectoryName("directory");
436 indexdir = ajAcdGetOutdirName("indexoutdir");
437 filename = ajAcdGetString("filenames");
438 exclude = ajAcdGetString("exclude");
439 dbname = ajAcdGetString("dbname");
440 release = ajAcdGetString("release");
441 datestr = ajAcdGetString("date");
442 systemsort = ajAcdGetBoolean("systemsort");
443 cleanup = ajAcdGetBoolean("cleanup");
444 sortopt = ajAcdGetString("sortoptions");
445 maxindex = ajAcdGetInt("maxindex");
446 version = ajAcdGetListSingle("blastversion");
447 seqtype = ajAcdGetListSingle("seqtype");
448 usesrc = ajAcdGetBoolean("sourcefile");
449 logfile = ajAcdGetOutfile("outfile");
450
451 while(fields[nfields]) /* array ends with a NULL */
452 nfields++;
453
454 if(nfields)
455 {
456 AJCNEW(maxFieldLen, nfields);
457 AJCNEW0(countField, nfields);
458 AJCNEW0(fieldTot, nfields);
459 for(ifield=0; ifield < nfields; ifield++)
460 maxFieldLen[ifield] = (ajint) maxindex * -1;
461
462 if(systemsort)
463 AJCNEW(alistfile, nfields);
464 else
465 {
466 AJCNEW(fieldList, nfields);
467 for(ifield=0; ifield < nfields; ifield++)
468 fieldList[ifield] = ajListNew();
469 }
470 }
471
472 if(ajStrMatchC(datestr, "00/00/00"))
473 ajFmtPrintS(&datestr, "%D", ajTimeRefTodayFmt("dbindex"));
474
475 ajStrRemoveWhite(&dbname); /* used for temp filenames */
476 embDbiDateSet(datestr, date);
477 idlist = ajListNew();
478
479 if(ajUtilGetBigendian())
480 readReverse = ajFalse;
481 else
482 readReverse = ajTrue;
483
484 ajStrToInt(version, &blastv);
485 dbtype = ajStrGetCharFirst(seqtype);
486
487 ajDebug("reading '%S/%S'\n", directory, filename);
488 ajDebug("writing '%S/'\n", indexdir);
489
490 listTestFiles = embDbiFileListExc(directory, filename, exclude);
491 ajListSort(listTestFiles, &ajStrVcmp);
492 nfiles = (ajuint) ajListToarray(listTestFiles, &testFiles);
493 if(!nfiles)
494 ajDie("No input files in '%S' matched filename '%S'",
495 directory, filename);
496
497 embDbiLogHeader(logfile, dbname, release, datestr,
498 indexdir, maxindex);
499
500 embDbiLogFields(logfile, fields, nfields);
501 embDbiLogSource(logfile, directory, filename, exclude,
502 (AjPStr*) testFiles, nfiles);
503 embDbiLogCmdline(logfile);
504
505 AJCNEW0(divfiles, nfiles);
506
507 /*
508 ** process each input file, one at a time
509 */
510
511 jfile = 0;
512 for(ifile=0; ifile < nfiles; ifile++)
513 {
514 curfilename = (AjPStr) testFiles[ifile];
515 if(!dbiblast_blastopenlib(curfilename,
516 usesrc, blastv, dbtype, &db))
517 continue; /* could be the wrong file type with "*.*" */
518
519 ajDebug("processing filename '%S' ...\n", curfilename);
520 ajDebug("processing file '%S' ...\n", db->TFile->Name);
521
522
523 ajStrAssignS(&divfiles[jfile], db->TFile->Name);
524 ajFilenameTrimPath(&divfiles[jfile]);
525 if(ajStrGetLen(divfiles[jfile]) >= maxfilelen)
526 maxfilelen = ajStrGetLen(divfiles[jfile]) + 1;
527
528 if(systemsort) /* elistfile for entries, alist for fields */
529 elistfile = embDbiSortOpen(alistfile, jfile,
530 dbname, fields, nfields);
531
532 idCountFile = 0;
533 for(i=0;i<nfields;i++)
534 countField[i] = 0;
535 while((entry=dbiblast_nextblastentry(db, jfile,
536 idformat, systemsort,
537 fields,
538 maxFieldLen,
539 &maxidlen, countField,
540 elistfile, alistfile)))
541 {
542 idCountFile++;
543 if(!systemsort) /* save the entry data in lists */
544 {
545 embDbiMemEntry(idlist, fieldList, nfields, entry, jfile);
546 }
547 }
548 idCount += idCountFile;
549 if(systemsort)
550 {
551 embDbiSortClose(&elistfile, alistfile, nfields);
552 /* lost the entry, so can't free it :-) */
553 }
554
555 embDbiLogFile(logfile, curfilename, idCountFile, fields,
556 countField, nfields);
557 dbiblast_dbfree(&db);
558 jfile++;
559 }
560 nfiles = jfile;
561
562 /*
563 ** write the division.lkp file
564 */
565
566 embDbiWriteDivision(indexdir, dbname, release, date,
567 maxfilelen, nfiles, divfiles, NULL);
568
569 /*
570 ** Write the entryname.idx index
571 */
572
573 ajStrAssignC(&tmpfname, "entrynam.idx");
574 entFile = ajFileNewOutNamePathS(tmpfname, indexdir);
575
576 recsize = maxidlen+10;
577 filesize = 300 + (idCount*(ajint)recsize);
578 embDbiHeader(entFile, filesize, idCount, recsize, dbname, release, date);
579
580 if(systemsort)
581 idDone = embDbiSortWriteEntry(entFile, maxidlen,
582 dbname, nfiles, cleanup, sortopt);
583 else /* save entries in entryIds array */
584 {
585 idDone = embDbiMemWriteEntry(entFile, maxidlen,
586 idlist, &entryIds);
587 if(idDone != idCount)
588 ajFatal("Duplicates not allowed for in-memory processing");
589 }
590
591 embDbiHeaderSize(entFile, 300+(idDone*(ajint)recsize), idDone);
592 ajFileClose(&entFile);
593
594 /*
595 ** Write the fields index files
596 */
597
598 for(ifield=0; ifield < nfields; ifield++)
599 {
600
601 if(maxindex)
602 maxlen = maxindex;
603 else
604 {
605 if(maxFieldLen[ifield] >= 0)
606 maxlen = maxFieldLen[ifield];
607 else
608 maxlen = - maxFieldLen[ifield];
609 }
610
611 if(systemsort)
612 fieldTot[ifield] = embDbiSortWriteFields(dbname, release,
613 date, indexdir,
614 fields[ifield], maxlen,
615 nfiles, idCount,
616 cleanup, sortopt);
617 else
618 fieldTot[ifield] = embDbiMemWriteFields(dbname, release,
619 date, indexdir,
620 fields[ifield], maxlen,
621 fieldList[ifield],
622 entryIds);
623 }
624
625 embDbiLogFinal(logfile,maxindex, maxFieldLen, fields, fieldTot,
626 nfields, nfiles, idDone, idCount);
627
628 if(systemsort)
629 embDbiRmEntryFile(dbname, cleanup);
630
631 ajListMap(idlist, &embDbiEntryDelMap, NULL);
632 ajListFree(&idlist);
633 AJFREE(entryIds);
634
635 ajStrDelarray(&fields);
636
637 for(i=0;i<nfields;i++)
638 {
639 if(systemsort)
640 {
641 ajFileClose(&alistfile[i]);
642 }
643 else
644 {
645 ajListMap(fieldList[i], &embDbiFieldDelMap, NULL);
646 ajListFree(&fieldList[i]);
647 }
648 }
649 AJFREE(alistfile);
650 AJFREE(fieldList);
651 ajStrDel(&version);
652 ajStrDel(&seqtype);
653 ajFileClose(&elistfile);
654 for(i=0;i<nfiles;i++)
655 {
656 ajStrDel(&divfiles[i]);
657 }
658 AJFREE(countField);
659 AJFREE(fieldTot);
660
661 ajStrDel(&dbname);
662 ajStrDel(&release);
663 ajStrDel(&datestr);
664 ajStrDel(&sortopt);
665 ajStrDel(&directory);
666 ajStrDel(&indexdir);
667 ajStrDel(&filename);
668 ajStrDel(&exclude);
669 ajStrDel(&idformat);
670 ajStrDel(&tmpfname);
671
672 AJFREE(maxFieldLen);
673
674 ajFileClose(&logfile);
675
676 ajListstrFreeData(&listTestFiles);
677
678 ajStrDel(&dbiblastGTmpT);
679 ajStrDel(&dbiblastGId);
680 ajStrDel(&dbiblastGAcc);
681 ajStrDel(&dbiblastGHline);
682 ajStrDel(&dbiblastGTmpDes);
683 ajStrDel(&dbiblastGTmpFd);
684 ajStrDel(&dbiblastGTmpGi);
685 ajStrDel(&dbiblastGTmpDb);
686 ajStrDel(&dbiblastGTmpAc);
687 ajStrDel(&dbiblastGTmpSv);
688 ajRegFree(&dbiblastGWrdexp);
689
690 embDbiEntryDel(&dbiblastGEntry);
691
692 if(dbiblastGFdl)
693 {
694 for(i=0; i < nfields; i++)
695 ajListFree(&dbiblastGFdl[i]);
696 AJFREE(dbiblastGFdl);
697 }
698
699 for(i=0;i<nfiles;i++)
700 {
701 ajStrDel(&divfiles[i]);
702 }
703 AJFREE(divfiles);
704 AJFREE(testFiles);
705
706 embExit();
707
708 return 0;
709 }
710
711
712
713
714 /* @funcstatic dbiblast_nextblastentry ****************************************
715 **
716 ** Returns next database entry as an EmbPEntry object
717 **
718 ** @param [u] db [PBlastDb] Blast database object
719 ** @param [r] ifile [ajint] File number.
720 ** @param [r] idformat [const AjPStr] Id format in FASTA file
721 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
722 ** @param [r] fields [AjPStr const *] Field names to be indexed
723 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
724 ** @param [w] maxidlen [ajuint*] Maximum entry ID length
725 ** @param [w] countfield [ajuint*] Number of tokens for each field
726 ** @param [u] elistfile [AjPFile] entry file
727 ** @param [u] alistfile [AjPFile *] field data files array
728 ** @return [EmbPEntry] Entry data object.
729 ** @@
730 ******************************************************************************/
731
dbiblast_nextblastentry(PBlastDb db,ajint ifile,const AjPStr idformat,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * maxidlen,ajuint * countfield,AjPFile elistfile,AjPFile * alistfile)732 static EmbPEntry dbiblast_nextblastentry(PBlastDb db, ajint ifile,
733 const AjPStr idformat,
734 AjBool systemsort,
735 AjPStr const * fields,
736 ajint * maxFieldLen,
737 ajuint* maxidlen, ajuint* countfield,
738 AjPFile elistfile,
739 AjPFile * alistfile)
740 {
741 ajint i;
742 static ajint lastfile = -1;
743 static ajint iparser = -1;
744 static ajint called = 0;
745 static ajuint tabhdr[TABLESIZE];
746 static ajint iload = TABLESIZE-1;
747 static ajint irest = 0;
748 static ajint ipos = 0;
749
750 static ajint jpos = 0;
751 ajint ir;
752 ajint j;
753 static ajint is = 0;
754 char* token;
755 static ajint nfields;
756 ajint ifield;
757
758 if(!called)
759 {
760 for(i=0; parser[i].Name; i++)
761 if(ajStrMatchC(idformat, parser[i].Name))
762 {
763 iparser = i;
764 break;
765 }
766
767 if(iparser < 0)
768 ajFatal("idformat '%S' unknown", idformat);
769 ajDebug("idformat '%S' Parser %d\n", idformat, iparser);
770 ajStrSetRes(&dbiblastGId, HDRSIZE);
771 ajStrSetRes(&dbiblastGAcc, HDRSIZE);
772 ajStrSetRes(&dbiblastGHline, HDRSIZE);
773 called = 1;
774 }
775
776 if(!dbiblastGFdl)
777 {
778 nfields=0;
779 while(fields[nfields])
780 nfields++;
781 if(nfields)
782 AJCNEW(dbiblastGFdl, nfields);
783 for(i=0; i < nfields; i++)
784 dbiblastGFdl[i] = ajListNew();
785 }
786
787 if(lastfile != ifile)
788 {
789 lastfile = ifile;
790 ipos = 1;
791 /* isize = 0;*/
792 irest = 0;
793 iload = TABLESIZE-1;
794 }
795
796 if(!dbiblastGEntry || !systemsort)
797 dbiblastGEntry = embDbiEntryNew(nfields);
798
799 /* pick up the next entry, parse it and dump it */
800
801 if(ipos > db->Size)
802 return NULL;
803
804 if( ipos >= irest)
805 {
806 ajDebug("ipos: %d iload: %d irest: %d\n", ipos, iload, irest);
807 irest = ipos + TABLESIZE - 2;
808 if(irest > db->Size)
809 {
810 iload = db->Size - ipos + 1;
811 irest = db->Size;
812 }
813
814 jpos=0;
815 j = dbiblast_loadtable(tabhdr, iload, db, db->TopHdr, ipos-1);
816 if(!j)
817 ajDebug("No elements read");
818 }
819
820 j = dbiblast_ncblreadhdr(&dbiblastGHline, db, tabhdr[jpos], tabhdr[jpos+1]);
821
822 if(!(*parser[iparser].Parser)(dbiblastGHline, alistfile, systemsort,
823 fields, maxFieldLen, countfield,
824 &dbiblastGId, dbiblastGFdl))
825 ajFatal("failed to parse '%S'", dbiblastGHline);
826
827 ir = ipos;
828
829 if(ajStrGetLen(dbiblastGId) > *maxidlen)
830 *maxidlen = ajStrGetLen(dbiblastGId);
831
832 if(systemsort)
833 ajFmtPrintF(elistfile, "%S %d %d %d\n",
834 dbiblastGId, ir, is, ifile+1);
835 else
836 {
837 dbiblastGEntry->entry = ajCharNewS(dbiblastGId);
838 dbiblastGEntry->rpos = ir;
839 dbiblastGEntry->spos = is;
840 dbiblastGEntry->filenum = ifile+1;
841
842 /* field tokens as list, then move to dbiblastGEntry->field */
843 for(ifield=0; ifield < nfields; ifield++)
844 {
845 dbiblastGEntry->nfield[ifield] =
846 (ajuint) ajListGetLength(dbiblastGFdl[ifield]);
847
848 if(dbiblastGEntry->nfield[ifield])
849 {
850 AJCNEW(dbiblastGEntry->field[ifield],
851 dbiblastGEntry->nfield[ifield]);
852
853 i = 0;
854 while(ajListPop(dbiblastGFdl[ifield], (void**) &token))
855 dbiblastGEntry->field[ifield][i++] = token;
856 }
857 else
858 dbiblastGEntry->field[ifield] = NULL;
859 }
860 }
861 ipos++;
862 jpos++;
863
864 return dbiblastGEntry;
865 }
866
867
868
869
870 /* @funcstatic dbiblast_dbfree ************************************************
871 **
872 ** Free BLAST library object
873 **
874 ** @param [u] pdb [PBlastDb*] Blast dababase structure.
875 ** @return [void]
876 ** @@
877 ******************************************************************************/
878
dbiblast_dbfree(PBlastDb * pdb)879 static void dbiblast_dbfree( PBlastDb* pdb)
880 {
881 PBlastDb db;
882
883 if(!pdb)
884 return;
885
886 if(!*pdb)
887 return;
888
889 db = *pdb;
890
891 dbiblast_memfclosefile(&db->TFile);
892 dbiblast_memfclosefile(&db->HFile);
893 dbiblast_memfclosefile(&db->SFile);
894 dbiblast_memfclosefile(&db->FFile);
895
896 ajStrDel(&db->Name);
897 ajStrDel(&db->Date);
898 ajStrDel(&db->Title);
899
900 AJFREE(*pdb);
901
902 return;
903 }
904
905
906
907
908 /* @funcstatic dbiblast_blastopenlib ******************************************
909 **
910 ** Open BLAST library
911 **
912 ** @param [r] name [const AjPStr] Source file name
913 ** @param [r] usesrc [AjBool] If ajTrue, use the source (fasta) file
914 ** @param [r] blastv [ajint] Blast version number (1 or 2)
915 ** @param [r] dbtype [char] Blast database type (p)rotein or (n)ucleotide
916 ** @param [u] pdb [PBlastDb*] Blast dababase structure.
917 ** @return [AjBool] ajTrue on success
918 ** @@
919 ******************************************************************************/
920
dbiblast_blastopenlib(const AjPStr name,AjBool usesrc,ajint blastv,char dbtype,PBlastDb * pdb)921 static AjBool dbiblast_blastopenlib(const AjPStr name, AjBool usesrc,
922 ajint blastv, char dbtype,
923 PBlastDb* pdb)
924 {
925
926 AjPStr hname = NULL;
927 AjPStr sname = NULL;
928 AjPStr tname = NULL;
929 static AjPStr dbname = NULL;
930 ajint rdtmp = 0;
931 ajint rdtmp2 = 0;
932 ajint itype;
933 ajint ttop;
934 PMemFile TFile = NULL;
935
936 PBlastDb ret;
937
938 for(itype=0; blasttypes[itype].ExtT; itype++)
939 {
940 if((blastv == 1) && blasttypes[itype].IsBlast2)
941 continue;
942
943 if((blastv == 2) && !blasttypes[itype].IsBlast2)
944 continue;
945
946 if((dbtype == 'P') && !blasttypes[itype].IsProtein)
947 continue;
948
949 if((dbtype == 'N') && blasttypes[itype].IsProtein)
950 continue;
951
952 if(dbiblast_wrongtype(name, blasttypes[itype].ExtT))
953 continue;
954
955 dbiblast_dbname(&dbname,name,blasttypes[itype].ExtT);
956 dbiblast_newname(&tname,dbname,blasttypes[itype].ExtT);
957 TFile = dbiblast_memfopenfile(tname);
958
959 if(TFile)
960 break;
961 }
962
963 if(!TFile)
964 return ajFalse;
965
966 AJNEW0(*pdb);
967
968 ret = *pdb;
969
970 ret->TFile = TFile;
971
972 ajStrAssignS(&ret->Name, dbname);
973 ajDebug("Name '%S'\n", ret->Name);
974
975 /* find and open the 'table' file(s) */
976
977 if(!ret->TFile)
978 ajFatal(" cannot open %S table file %S\n", dbname, tname);
979
980 ajDebug("Successfully opened table file for type %d\n", itype);
981
982 ret->IsProtein = blasttypes[itype].IsProtein;
983 ret->IsBlast2 = blasttypes[itype].IsBlast2;
984
985 /* read the type and format - all databases */
986
987 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DbType);
988 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DbFormat);
989 ret->HeaderLen += 8;
990
991 ajDebug("dbtype: %x dbformat: %x\n", ret->DbType, ret->DbFormat);
992
993 /* Open the header and (compressed) sequence files */
994 /* for DNA, also look for the FASTA file */
995
996 dbiblast_newname(&hname,dbname,blasttypes[itype].ExtH);
997 if((ret->HFile = dbiblast_memfopenfile(hname))==NULL)
998 ajFatal(" cannot open %S header file\n",hname);
999
1000
1001 dbiblast_newname(&sname,dbname,blasttypes[itype].ExtS);
1002 if((ret->SFile = dbiblast_memfopenfile(sname))==NULL)
1003 ajFatal(" cannot open %S sequence file\n",sname);
1004
1005
1006 if(!ret->IsBlast2 && !ret->IsProtein && usesrc)
1007 /* this can fail */
1008 if((ret->FFile = dbiblast_memfopenfile(dbname))==NULL)
1009 ajDebug(" cannot open %S source file\n",dbname);
1010
1011 /* read the title - all formats */
1012 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TitleLen);
1013
1014 /* blast2 does not align after the title */
1015 if(ret->IsBlast2)
1016 rdtmp = ret->TitleLen;
1017 else
1018 rdtmp = ret->TitleLen + ((ret->TitleLen%4 !=0 ) ?
1019 4-(ret->TitleLen%4) : 0);
1020 ajStrAssignResC(&ret->Title, rdtmp+1, "");
1021 ajDebug("IsBlast2: %B title_len: %d rdtmp: %d title_str: '%S'\n",
1022 ret->IsBlast2, ret->TitleLen, rdtmp, ret->Title);
1023 ajStrTrace(ret->Title);
1024 dbiblast_memfreadS(&ret->Title,(size_t)1,(size_t)rdtmp,ret->TFile);
1025
1026 if(ret->IsBlast2)
1027 ajStrSetValidLen(&ret->Title, ret->TitleLen);
1028 else
1029 ajStrSetValidLen(&ret->Title, ret->TitleLen-1);
1030
1031 ajDebug("title_len: %d rdtmp: %d title_str: '%S'\n",
1032 ret->TitleLen, rdtmp, ret->Title);
1033
1034 ret->HeaderLen += 4 + rdtmp;
1035
1036 /* read the date - blast2 */
1037 if(ret->IsBlast2)
1038 {
1039 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DateLen);
1040 rdtmp2 = ret->DateLen;
1041 ajStrAssignResC(&ret->Date, rdtmp2+1, "");
1042 dbiblast_memfreadS(&ret->Date,(size_t)1,(size_t)rdtmp2,ret->TFile);
1043 ajStrSetValid(&ret->Date);
1044 ret->DateLen = ajStrGetLen(ret->Date);
1045 ajDebug("datelen: %d rdtmp: %d date: '%S'\n",
1046 ret->DateLen, rdtmp2, ret->Date);
1047 ret->HeaderLen += 4 + rdtmp2;
1048 }
1049
1050 /* read the rest of the header (different for protein and DNA) */
1051 if(!ret->IsBlast2 && !ret->IsProtein)
1052 {
1053 /* length of source lines */
1054 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->LineLen);
1055 ret->HeaderLen += 4;
1056 }
1057
1058 /* all formats have the next 3 */
1059 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->Size);
1060 if(ret->IsProtein)
1061 { /* mad, but they are the other way for DNA */
1062 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TotLen);
1063 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->MaxSeqLen);
1064 }
1065 else
1066 {
1067 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->MaxSeqLen);
1068 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TotLen);
1069 }
1070
1071 ret->HeaderLen += 12;
1072
1073 if(!ret->IsBlast2 && !ret->IsProtein)
1074 { /* Blast 1.4 DNA only */
1075 /* compressed db length */
1076 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->CompLen);
1077 /* count of nt's cleaned */
1078 dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->CleanCount);
1079 ret->HeaderLen += 8;
1080 }
1081
1082 ajDebug(" size: %u, totlen: %d maxseqlen: %u\n",
1083 ret->Size, ret->TotLen, ret->MaxSeqLen);
1084 ajDebug(" linelen: %u, complen: %d cleancount: %d\n",
1085 ret->LineLen, ret->CompLen, ret->CleanCount);
1086
1087
1088 /* Now for the tables of offsets. Again maddeningly different in each */
1089 if(ret->IsBlast2)
1090 {
1091 ttop = ret->TopHdr = ret->HeaderLen; /* header first */
1092 ttop = ret->TopCmp = ttop + (ret->Size+1) * 4; /* then sequence */
1093 if(!ret->IsProtein) /* Blast 2 DNA only */
1094 ttop = ret->TopAmb = ttop + (ret->Size+1) * 4;
1095 }
1096 else
1097 {
1098 ttop = ret->TopCmp = ret->HeaderLen + ret->CleanCount*4; /* comp seq */
1099 if(!ret->IsProtein) /* Blast 1.4 DNA only */
1100 ttop = ret->TopSrc = ttop + (ret->Size+1) * 4;
1101 ttop = ret->TopHdr = ttop + (ret->Size+1) * 4; /* headers for all */
1102 if(!ret->IsProtein) /* Blast 1.4 DNA only */
1103 ttop = ret->TopAmb = ttop + (ret->Size+1) * 4;
1104 }
1105
1106 ajDebug("table file index starts at %d\n", ret->HeaderLen);
1107 ajDebug("table file csq starts at %d\n", ret->TopCmp);
1108 ajDebug("table file src starts at %d\n", ret->TopSrc);
1109 ajDebug("table file hdr starts at %d\n", ret->TopHdr);
1110 ajDebug("table file amb starts at %d\n", ret->TopAmb);
1111
1112 ajStrDel(&hname);
1113 ajStrDel(&sname);
1114 ajStrDel(&tname);
1115 ajStrDel(&dbname);
1116
1117 return ajTrue;
1118 }
1119
1120
1121
1122
1123 /* @funcstatic dbiblast_parseNcbi *********************************************
1124 **
1125 ** Parses an NCBI style header from the BLAST header table.
1126 **
1127 ** @param [r] line [const AjPStr] Input line
1128 ** @param [u] alistfile [AjPFile *] List of field temporary files
1129 ** @param [r] systemsort [AjBool] If ajTrue, use the system sort utility,
1130 ** else sort in memory
1131 ** @param [r] fields [AjPStr const*] Field names
1132 ** @param [w] maxFieldLen [ajint*] Maximum token lengths for each field
1133 ** @param [w] countfield [ajuint*] Number of tokens for each field
1134 ** @param [w] myid [AjPStr*] ID
1135 ** @param [w] fdlist [AjPList *] Field token lists
1136 ** (one list for each field)
1137 ** @return [AjBool] ajTrue on success
1138 ** @@
1139 ******************************************************************************/
1140
dbiblast_parseNcbi(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * fdlist)1141 static AjBool dbiblast_parseNcbi(const AjPStr line, AjPFile * alistfile,
1142 AjBool systemsort, AjPStr const * fields,
1143 ajint* maxFieldLen,
1144 ajuint* countfield,
1145 AjPStr* myid,
1146 AjPList* fdlist)
1147 {
1148 char* fd;
1149
1150 static ajint numFields;
1151 static ajint accfield = -1;
1152 static ajint desfield = -1;
1153 static ajint svnfield = -1;
1154 static AjBool reset = AJTRUE;
1155
1156 if(!fields)
1157 {
1158 reset = ajTrue;
1159 accfield = svnfield = desfield = -1;
1160 return ajFalse;
1161 }
1162
1163 if(reset)
1164 {
1165 numFields = 0;
1166 while(fields[numFields])
1167 {
1168 if(ajStrMatchCaseC(fields[numFields], "acc"))
1169 accfield=numFields;
1170 else if(ajStrMatchCaseC(fields[numFields], "sv"))
1171 svnfield=numFields;
1172 else if(ajStrMatchCaseC(fields[numFields], "des"))
1173 desfield=numFields;
1174 else
1175 ajWarn("EMBL parsing unknown field '%S' ignored",
1176 fields[numFields]);
1177 numFields++;
1178 }
1179 reset = ajFalse;
1180 }
1181
1182 if(!dbiblastGWrdexp)
1183 dbiblastGWrdexp = ajRegCompC("([A-Za-z0-9]+)");
1184
1185 ajStrAssignC(&dbiblastGTmpDes, "");
1186 ajStrAssignC(&dbiblastGTmpT, "");
1187 ajStrAssignC(&dbiblastGTmpAc, "");
1188 ajStrAssignC(&dbiblastGTmpSv, "");
1189 ajStrAssignC(&dbiblastGTmpGi, "");
1190 ajStrAssignC(&dbiblastGTmpDb, "");
1191
1192 ajFmtPrintS(&dbiblastGTmpT, ">%S", line);
1193
1194 if(!ajSeqParseNcbi(dbiblastGTmpT, myid, &dbiblastGTmpAc, &dbiblastGTmpSv,
1195 &dbiblastGTmpGi, &dbiblastGTmpDb, &dbiblastGTmpDes))
1196 return ajFalse;
1197
1198 if(ajStrGetLen(dbiblastGTmpAc))
1199 ajStrFmtUpper(&dbiblastGTmpAc);
1200
1201 if(accfield >= 0)
1202 embDbiMaxlen(&dbiblastGTmpAc, &maxFieldLen[accfield]);
1203
1204 if(svnfield >= 0)
1205 {
1206 embDbiMaxlen(&dbiblastGTmpSv, &maxFieldLen[svnfield]);
1207 embDbiMaxlen(&dbiblastGTmpGi, &maxFieldLen[svnfield]);
1208 }
1209
1210
1211 ajStrFmtUpper(myid);
1212
1213 /* ajDebug("parseNCBI success\n"); */
1214
1215 if(systemsort)
1216 {
1217 if(accfield >= 0 && ajStrGetLen(dbiblastGTmpAc))
1218 {
1219 countfield[accfield]++;
1220 ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, dbiblastGTmpAc);
1221 }
1222 if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpSv))
1223 {
1224 countfield[svnfield]++;
1225 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiblastGTmpSv);
1226 }
1227 if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpGi))
1228 {
1229 countfield[svnfield]++;
1230 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiblastGTmpGi);
1231 }
1232 if(desfield >= 0 && ajStrGetLen(dbiblastGTmpDes))
1233 while(ajRegExec(dbiblastGWrdexp, dbiblastGTmpDes))
1234 {
1235 ajRegSubI(dbiblastGWrdexp, 1, &dbiblastGTmpFd);
1236 embDbiMaxlen(&dbiblastGTmpFd, &maxFieldLen[desfield]);
1237 ajStrFmtUpper(&dbiblastGTmpFd);
1238 ajDebug("++des '%S'\n", dbiblastGTmpFd);
1239 countfield[desfield]++;
1240 ajFmtPrintF(alistfile[desfield], "%S %S\n",
1241 *myid, dbiblastGTmpFd);
1242 ajRegPost(dbiblastGWrdexp, &dbiblastGTmpDes);
1243 }
1244 }
1245 else
1246 {
1247 if(accfield >= 0 && ajStrGetLen(dbiblastGTmpAc))
1248 {
1249 fd = ajCharNewS(dbiblastGTmpAc);
1250 countfield[accfield]++;
1251 ajListPushAppend(fdlist[accfield], fd);
1252 }
1253
1254 if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpSv))
1255 {
1256 fd = ajCharNewS(dbiblastGTmpSv);
1257 countfield[svnfield]++;
1258 ajListPushAppend(fdlist[svnfield], fd);
1259 }
1260
1261 if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpGi))
1262 {
1263 fd = ajCharNewS(dbiblastGTmpGi);
1264 ajListPushAppend(fdlist[svnfield], fd);
1265 }
1266
1267 if(desfield >= 0 && ajStrGetLen(dbiblastGTmpDes))
1268 {
1269 while(ajRegExec(dbiblastGWrdexp, dbiblastGTmpDes))
1270 {
1271 ajRegSubI(dbiblastGWrdexp, 1, &dbiblastGTmpFd);
1272 embDbiMaxlen(&dbiblastGTmpFd, &maxFieldLen[desfield]);
1273 ajStrFmtUpper(&dbiblastGTmpFd);
1274 ajDebug("++des '%S'\n", dbiblastGTmpFd);
1275 fd = ajCharNewS(dbiblastGTmpFd);
1276 countfield[desfield]++;
1277 ajListPushAppend(fdlist[desfield], fd);
1278 ajRegPost(dbiblastGWrdexp, &dbiblastGTmpDes);
1279 }
1280 }
1281 }
1282
1283 /* ajDebug("parseNCBI '%S' '%S'\n", *myid, dbiblastGTmpAc); */
1284
1285 return ajTrue;
1286 }
1287
1288
1289
1290
1291 /* @funcstatic dbiblast_parseGcg **********************************************
1292 **
1293 ** Parses a GCG style header from the BLAST header table.
1294 **
1295 ** @param [r] line [const AjPStr] Input line
1296 ** @param [u] alistfile [AjPFile *] field data files array
1297 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1298 ** @param [r] fields [AjPStr const *] Field names to be indexed
1299 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1300 ** @param [w] countfield [ajuint*] Number of tokens for each field
1301 ** @param [w] myid [AjPStr*] ID
1302 ** @param [w] myfdl [AjPList *] Accession number list
1303 ** @return [AjBool] ajTrue on success
1304 ** @@
1305 ******************************************************************************/
1306
dbiblast_parseGcg(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1307 static AjBool dbiblast_parseGcg(const AjPStr line, AjPFile * alistfile,
1308 AjBool systemsort, AjPStr const * fields,
1309 ajint* maxFieldLen,
1310 ajuint* countfield,
1311 AjPStr* myid, AjPList* myfdl)
1312 {
1313 static AjPRegexp idexp = NULL;
1314 static AjPStr mytmpac = NULL;
1315 char* ac;
1316 static ajint numFields;
1317 static ajint accfield = -1;
1318 static AjBool reset = AJTRUE;
1319
1320 if(!fields)
1321 {
1322 reset = ajTrue;
1323 accfield = -1;
1324 return ajFalse;
1325 }
1326
1327 if(reset)
1328 {
1329 numFields = 0;
1330 while(fields[numFields])
1331 {
1332 countfield[numFields]=0;
1333 if(ajStrMatchCaseC(fields[numFields], "acc"))
1334 accfield=numFields;
1335 else if(!ajStrMatchCaseC(fields[numFields], "sv") &&
1336 !ajStrMatchCaseC(fields[numFields], "des"))
1337 ajWarn("GCG ID parsing unknown field '%S' ignored",
1338 fields[numFields]);
1339
1340 numFields++;
1341 }
1342 reset = ajFalse;
1343 }
1344
1345
1346 if(!idexp)
1347 idexp = ajRegCompC("^[^:]+:([^ ]+)( +([A-Za-z][A-Za-z0-9]+[0-9]))");
1348
1349 if(!ajRegExec(idexp, line))
1350 return ajFalse;
1351
1352 ajRegSubI(idexp, 1, myid);
1353 ajRegSubI(idexp, 3, &mytmpac);
1354 ajStrFmtUpper(myid);
1355 ajStrFmtUpper(&mytmpac); /* GCG mixes case on new SwissProt acnums */
1356
1357 if(accfield >= 0)
1358 {
1359 embDbiMaxlen(&mytmpac, &maxFieldLen[accfield]);
1360
1361 countfield[accfield]++;
1362 if(systemsort)
1363 ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, mytmpac);
1364 else
1365 {
1366 ac = ajCharNewS(mytmpac);
1367 ajListPushAppend(myfdl[accfield], ac);
1368 }
1369 }
1370
1371 ajDebug("parseGCG '%S' '%S'\n", *myid, mytmpac);
1372
1373 return ajTrue;
1374 }
1375
1376
1377
1378
1379 /* @funcstatic dbiblast_parseSimple *******************************************
1380 **
1381 ** Parses a plain header from the BLAST header table.
1382 **
1383 ** @param [r] line [const AjPStr] Input line
1384 ** @param [u] alistfile [AjPFile *] field data files array
1385 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1386 ** @param [r] fields [AjPStr const *] Field names to be indexed
1387 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1388 ** @param [w] countfield [ajuint*] Number of tokens for each field
1389 ** @param [w] myid [AjPStr*] ID
1390 ** @param [w] myfdl [AjPList*] Accession number list
1391 ** @return [AjBool] ajTrue on success
1392 ** @@
1393 ******************************************************************************/
1394
dbiblast_parseSimple(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1395 static AjBool dbiblast_parseSimple(const AjPStr line,
1396 AjPFile * alistfile,
1397 AjBool systemsort, AjPStr const * fields,
1398 ajint* maxFieldLen,
1399 ajuint* countfield,
1400 AjPStr* myid,
1401 AjPList* myfdl)
1402 {
1403 static AjPRegexp idexp = NULL;
1404 static AjPStr mytmpac = NULL;
1405 char* ac;
1406 static ajint numFields;
1407 static ajint accfield = -1;
1408 static AjBool reset = AJTRUE;
1409
1410 if(!fields)
1411 {
1412 reset = ajTrue;
1413 accfield = -1;
1414 return ajFalse;
1415 }
1416
1417 if(reset)
1418 {
1419 numFields = 0;
1420 while(fields[numFields])
1421 {
1422 if(ajStrMatchCaseC(fields[numFields], "acc"))
1423 accfield=numFields;
1424 else if(!ajStrMatchCaseC(fields[numFields], "sv") &&
1425 !ajStrMatchCaseC(fields[numFields], "des"))
1426 ajWarn("Simple ID parsing unknown field '%S' ignored",
1427 fields[numFields]);
1428 numFields++;
1429 }
1430 reset = ajFalse;
1431 }
1432
1433
1434 if(!idexp)
1435 idexp = ajRegCompC("^([^ ]+)( +([A-Za-z][A-Za-z0-9]+[0-9]))");
1436
1437 if(!ajRegExec(idexp, line))
1438 return ajFalse;
1439
1440 ajRegSubI(idexp, 1, myid);
1441 ajRegSubI(idexp, 3, &mytmpac);
1442 ajStrFmtUpper(myid);
1443 ajStrFmtUpper(&mytmpac); /* GCG mixes case on new SwissProt acnums */
1444
1445 if(accfield >= 0)
1446 {
1447 embDbiMaxlen(&mytmpac, &maxFieldLen[accfield]);
1448 countfield[accfield]++;
1449 if(systemsort)
1450 ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, mytmpac);
1451 else
1452 {
1453 ac = ajCharNewS(mytmpac);
1454 ajListPushAppend(myfdl[accfield], ac);
1455 }
1456 }
1457
1458 ajDebug("parseSimple '%S' '%S'\n", *myid, mytmpac);
1459
1460 return ajTrue;
1461 }
1462
1463
1464
1465
1466 /* @funcstatic dbiblast_parseId ***********************************************
1467 **
1468 ** Parses a simple FASTA ID from the BLAST header table.
1469 **
1470 ** @param [r] line [const AjPStr] Input line
1471 ** @param [u] alistfile [AjPFile *] field data files array
1472 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1473 ** @param [r] fields [AjPStr const *] Field names to be indexed
1474 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1475 ** @param [w] countfield [ajuint*] Number of tokens for each field
1476 ** @param [w] myid [AjPStr*] ID
1477 ** @param [w] myfdl [AjPList*] Accession number list
1478 ** @return [AjBool] ajTrue on success
1479 ** @@
1480 ******************************************************************************/
1481
dbiblast_parseId(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1482 static AjBool dbiblast_parseId(const AjPStr line, AjPFile * alistfile,
1483 AjBool systemsort, AjPStr const * fields,
1484 ajint* maxFieldLen,
1485 ajuint* countfield,
1486 AjPStr* myid,
1487 AjPList * myfdl)
1488 {
1489 static AjPRegexp idexp = NULL;
1490 static AjBool reset = AJTRUE;
1491
1492 (void) alistfile;
1493 (void) systemsort;
1494 (void) maxFieldLen;
1495 (void) countfield;
1496 (void) myfdl;
1497
1498 if(!fields)
1499 {
1500 reset = ajTrue;
1501 return ajFalse;
1502 }
1503
1504 if(reset)
1505 {
1506 reset = ajFalse;
1507 }
1508
1509
1510 if(!idexp)
1511 idexp = ajRegCompC("^([^ ]+)");
1512
1513 if(!ajRegExec(idexp, line))
1514 return ajFalse;
1515
1516 ajRegSubI(idexp, 1, myid);
1517 ajStrFmtUpper(myid);
1518
1519 ajDebug("parseId '%S'\n", *myid);
1520
1521 return ajTrue;
1522 }
1523
1524
1525
1526
1527 /* @funcstatic dbiblast_parseUnknown ******************************************
1528 **
1529 ** Parses an unknown type ID from the BLAST header table.
1530 **
1531 ** @param [r] line [const AjPStr] Input line
1532 ** @param [u] alistfile [AjPFile *] field data files array
1533 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1534 ** @param [r] fields [AjPStr const *] Field names to be indexed
1535 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1536 ** @param [w] countfield [ajuint*] Number of tokens for each field
1537 ** @param [w] myid [AjPStr*] ID
1538 ** @param [w] myfdl [AjPList*] Accession number list
1539 ** @return [AjBool] ajTrue on success
1540 ** @@
1541 ******************************************************************************/
1542
dbiblast_parseUnknown(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1543 static AjBool dbiblast_parseUnknown(const AjPStr line,
1544 AjPFile * alistfile,
1545 AjBool systemsort, AjPStr const * fields,
1546 ajint* maxFieldLen,
1547 ajuint* countfield,
1548 AjPStr* myid,
1549 AjPList* myfdl)
1550 {
1551 static ajint called = 0;
1552 static AjBool reset = AJTRUE;
1553
1554 (void) line;
1555 (void) alistfile;
1556 (void) systemsort;
1557 (void) maxFieldLen;
1558 (void) countfield;
1559 (void) myid;
1560 (void) myfdl;
1561
1562 if(!fields)
1563 {
1564 reset = ajTrue;
1565 return ajFalse;
1566 }
1567
1568 if(reset)
1569 {
1570 reset = ajFalse;
1571 }
1572
1573
1574 if(!called) /* first time - find out the format */
1575 called = 1;
1576
1577 return ajFalse;
1578 }
1579
1580
1581
1582
1583 /* @funcstatic dbiblast_memreadUInt4 ******************************************
1584 **
1585 ** Reads a 4 byte unsigned integer from a (possibly memory mapped)
1586 ** binary file, with the correct byte orientation
1587 **
1588 ** @param [u] fd [PMemFile] Input file
1589 ** @param [w] val [ajuint *] Unsigned integer
1590 ** @return [void]
1591 ** @@
1592 ******************************************************************************/
1593
dbiblast_memreadUInt4(PMemFile fd,ajuint * val)1594 static void dbiblast_memreadUInt4(PMemFile fd, ajuint *val)
1595 {
1596
1597 dbiblast_memfread((char *)val,(size_t)4,(size_t)1,fd);
1598 if(readReverse)
1599 ajByteRevLen4((ajint *)val);
1600
1601 return;
1602 }
1603
1604
1605
1606
1607 /* @funcstatic dbiblast_memfreadS *********************************************
1608 **
1609 ** Reads a string from a (possibly memory mapped)
1610 ** binary file, with the correct byte orientation
1611 **
1612 ** @param [w] dest [AjPStr*] Output string, must be already the right size
1613 ** @param [r] size [size_t] Size of string (1)
1614 ** @param [r] num_items [size_t] Number of bytes
1615 ** @param [u] mf [PMemFile] Input file
1616 ** @return [size_t] fread return code
1617 ** @@
1618 ******************************************************************************/
1619
dbiblast_memfreadS(AjPStr * dest,size_t size,size_t num_items,PMemFile mf)1620 static size_t dbiblast_memfreadS(AjPStr* dest, size_t size, size_t num_items,
1621 PMemFile mf)
1622 {
1623
1624 return dbiblast_memfread(ajStrGetuniquePtr(dest), size, num_items, mf);
1625 }
1626
1627
1628
1629
1630 /* @funcstatic dbiblast_memfseek **********************************************
1631 **
1632 ** fseek in a (possibly memory mapped) binary file
1633 **
1634 ** @param [u] mf [PMemFile] Input file
1635 ** @param [r] offset [ajlong] Offset in file
1636 ** @param [r] whence [ajint] Start of offset, as defined for 'fseek'
1637 ** @return [size_t] Result of 'fseek'
1638 ** @@
1639 ******************************************************************************/
1640
dbiblast_memfseek(PMemFile mf,ajlong offset,ajint whence)1641 static size_t dbiblast_memfseek(PMemFile mf, ajlong offset, ajint whence)
1642 {
1643
1644 if(mf->IsMem)
1645 { /* memory mapped */
1646 switch(whence)
1647 {
1648 case 0:
1649 mf->Pos = offset;
1650 break;
1651 case 1:
1652 mf->Pos += offset;
1653 break;
1654 case 2:
1655 mf->Pos = mf->Size + offset;
1656 break;
1657 default:
1658 ajErr("invalid memfseek code %d", whence);
1659 embExitBad();
1660 }
1661 if(mf->Pos > mf->Size)
1662 mf->Pos = mf->Size;
1663 if(mf->Pos < 0)
1664 mf->Pos = 0;
1665 return 0;
1666 }
1667
1668 return ajFileSeek( mf->File, offset, whence);
1669 }
1670
1671
1672
1673
1674 /* @funcstatic dbiblast_memfread **********************************************
1675 **
1676 ** fread in a (possibly memory mapped) binary file
1677 **
1678 ** @param [w] dest [void*] Output text string
1679 ** @param [r] size [size_t] Size of string (1)
1680 ** @param [r] num_items [size_t] Number of bytes
1681 ** @param [u] mf [PMemFile] Input file
1682 ** @return [size_t] Result of 'fread'
1683 ** @@
1684 ******************************************************************************/
1685
dbiblast_memfread(void * dest,size_t size,size_t num_items,PMemFile mf)1686 static size_t dbiblast_memfread(void* dest, size_t size, size_t num_items,
1687 PMemFile mf)
1688 {
1689 size_t i;
1690
1691 if(mf->IsMem)
1692 { /* memory mapped */
1693 i = size * num_items;
1694 memcpy(dest, &mf->Mem[mf->Pos], i);
1695 mf->Pos += (ajlong) i;
1696 return i;
1697 }
1698
1699 return ajReadbinBinary(mf->File, num_items, size, dest);
1700 }
1701
1702
1703
1704
1705 /* @funcstatic dbiblast_newname ***********************************************
1706 **
1707 ** Generate a new filename with a different suffix.
1708 **
1709 ** @param [w] nname [AjPStr*] New filename
1710 ** @param [r] oname [const AjPStr] Original file name
1711 ** @param [r] suff [const char*] New suffix
1712 ** @return [void]
1713 ** @@
1714 ******************************************************************************/
1715
dbiblast_newname(AjPStr * nname,const AjPStr oname,const char * suff)1716 static void dbiblast_newname(AjPStr* nname, const AjPStr oname,
1717 const char *suff)
1718 {
1719
1720 ajStrAssignS(nname, oname);
1721 if(ajStrGetCharFirst(oname)=='@')
1722 ajStrCutStart(nname, 1);
1723 ajStrAppendK(nname, '.');
1724 ajStrAppendC(nname, suff);
1725
1726 return;
1727 }
1728
1729
1730
1731
1732 /* @funcstatic dbiblast_dbname ************************************************
1733 **
1734 ** Generate the database name (original fasta file name)
1735 ** by stripping off the suffix
1736 **
1737 ** @param [w] dbname [AjPStr*] Database filename
1738 ** @param [r] oname [const AjPStr] Original file name
1739 ** @param [r] suff [const char*] New suffix
1740 ** @return [void]
1741 ** @@
1742 ******************************************************************************/
1743
dbiblast_dbname(AjPStr * dbname,const AjPStr oname,const char * suff)1744 static void dbiblast_dbname(AjPStr* dbname,
1745 const AjPStr oname, const char *suff)
1746 {
1747 AjPStr suffix = NULL;
1748
1749 ajFmtPrintS(&suffix, ".%s", suff);
1750
1751 ajStrAssignS(dbname, oname);
1752
1753 if(ajStrGetCharFirst(oname)=='@')
1754 ajStrCutStart(dbname, 1);
1755
1756 if(!ajStrSuffixS(*dbname, suffix))
1757 {
1758 ajStrDel(&suffix);
1759 return;
1760 }
1761
1762 ajStrCutEnd(dbname, ajStrGetLen(suffix));
1763
1764 ajStrDel(&suffix);
1765
1766 return;
1767 }
1768
1769
1770
1771
1772 /* @funcstatic dbiblast_wrongtype *********************************************
1773 **
1774 ** Tests for the other database filenames in case the user asked
1775 ** for "*.*". Used to test we have the *.suff file before opening all files.
1776 **
1777 ** @param [r] oname [const AjPStr] Original file name
1778 ** @param [r] suff [const char*] Required suffix
1779 ** @return [AjBool] ajTrue if any other filename suffix is recognized
1780 ** @@
1781 ******************************************************************************/
1782
dbiblast_wrongtype(const AjPStr oname,const char * suff)1783 static AjBool dbiblast_wrongtype(const AjPStr oname, const char *suff)
1784 {
1785 ajint itype;
1786
1787 for(itype=0; blasttypes[itype].ExtT; itype++)
1788 {
1789 if(strcmp(suff, blasttypes[itype].ExtT))
1790 if(ajStrSuffixC(oname, blasttypes[itype].ExtT))
1791 return ajTrue;
1792
1793 if(strcmp(suff, blasttypes[itype].ExtH))
1794 if(ajStrSuffixC(oname, blasttypes[itype].ExtH))
1795 return ajTrue;
1796
1797 if(strcmp(suff, blasttypes[itype].ExtS))
1798 if(ajStrSuffixC(oname, blasttypes[itype].ExtS))
1799 return ajTrue;
1800 }
1801
1802 return ajFalse;
1803 }
1804
1805
1806
1807
1808 /* @funcstatic dbiblast_memfclosefile *****************************************
1809 **
1810 ** Close a (possibly memory mapped) binary file
1811 **
1812 ** @param [d] pfd [PMemFile*] File
1813 ** @return [void]
1814 ** @@
1815 ******************************************************************************/
1816
dbiblast_memfclosefile(PMemFile * pfd)1817 static void dbiblast_memfclosefile(PMemFile* pfd)
1818 {
1819 PMemFile fd;
1820
1821 if(!pfd)
1822 return;
1823
1824 if(!*pfd)
1825 return;
1826
1827 fd = *pfd;
1828 ajFileClose(&fd->File);
1829 ajStrDel(&fd->Name);
1830
1831 AJFREE(*pfd);
1832
1833 return;
1834 }
1835
1836
1837
1838
1839 /* @funcstatic dbiblast_memfopenfile ******************************************
1840 **
1841 ** Open a (possibly memory mapped) binary file
1842 **
1843 ** @param [r] name [const AjPStr] File name
1844 ** @return [PMemFile] Memory mapped file object created
1845 ** @@
1846 ******************************************************************************/
1847
dbiblast_memfopenfile(const AjPStr name)1848 static PMemFile dbiblast_memfopenfile(const AjPStr name)
1849 {
1850 PMemFile ret;
1851 AjPFile fp;
1852
1853 fp = ajFileNewInNameS(name);
1854 if(!fp)
1855 return NULL;
1856
1857 AJNEW0(ret);
1858
1859 ajStrAssignS(&ret->Name, name);
1860 ret->IsMem = 0;
1861 ret->File = fp;
1862 ret->Size = 0;
1863 ret->Mem = NULL;
1864
1865 ajDebug("fopened '%S'\n", name);
1866
1867 return ret;
1868 }
1869
1870
1871
1872
1873 /* @funcstatic dbiblast_loadtable *********************************************
1874 **
1875 ** Load part of the BLAST binary table into memory
1876 **
1877 ** @param [w] table [ajuint*] table array to be read
1878 ** @param [r] isize [ajint] Number of elements to read
1879 ** @param [u] db [PBlastDb] Blast database structure
1880 ** @param [r] top [ajint] Byte offset for start of table
1881 ** @param [r] pos [ajint] Current element number in table.
1882 ** @return [ajint] Number of elements read.
1883 ** @@
1884 ******************************************************************************/
1885
dbiblast_loadtable(ajuint * table,ajint isize,PBlastDb db,ajint top,ajint pos)1886 static ajint dbiblast_loadtable(ajuint* table, ajint isize, PBlastDb db,
1887 ajint top, ajint pos)
1888 {
1889 ajint i;
1890 ajint j;
1891 ajint imax;
1892
1893 imax = pos + isize;
1894 if(imax > (db->Size+1))
1895 imax = db->Size+1;
1896
1897 ajDebug("loadtable size %d top %d db->Size %d pos %d imax %d\n",
1898 isize, top, db->Size, pos, imax);
1899
1900 dbiblast_memfseek(db->TFile, top + 4*(pos), 0);
1901 j = 0;
1902
1903 for(i=pos; i<=imax; i++)
1904 {
1905 /* ajDebug("reading at %d\n", ajFileResetPos(db->TFile->File));*/
1906 dbiblast_memreadUInt4(db->TFile,&table[j++]);
1907 /* ajDebug("read i: %d j: %d value: %d\n", i, j-1, table[j-1]);*/
1908 }
1909
1910 return imax - pos + 1;
1911 }
1912
1913
1914
1915
1916 /* @funcstatic dbiblast_ncblreadhdr *******************************************
1917 **
1918 ** Read the FASTA header line for one entry
1919 **
1920 ** @param [w] hdrline [AjPStr*] Header line
1921 ** @param [u] db [PBlastDb] Blast database structure
1922 ** @param [r] start [ajint] Byte offset for start of header
1923 ** @param [r] end [ajint] Byte offset for end of header
1924 ** @return [ajint] Number of bytes read.
1925 ** @@
1926 ******************************************************************************/
1927
dbiblast_ncblreadhdr(AjPStr * hdrline,PBlastDb db,ajint start,ajint end)1928 static ajint dbiblast_ncblreadhdr(AjPStr* hdrline, PBlastDb db, ajint start,
1929 ajint end)
1930 {
1931 ajint size;
1932 ajint llen;
1933 PMemFile hfp;
1934
1935 size = ajStrGetRes(*hdrline);
1936 hfp = db->HFile;
1937
1938 if(end)
1939 {
1940 llen = end - start;
1941
1942 if(db->IsBlast2)
1943 llen += 1;
1944
1945 if(llen > size)
1946 llen = size;
1947 }
1948 else
1949 llen = size;
1950
1951 /*ajDebug("ncblreadhdr start %d end %d llen %d\n", start, end, llen);*/
1952
1953 if(db->IsBlast2)
1954 {
1955 dbiblast_memfseek(hfp,start,0);
1956 dbiblast_memfreadS(hdrline,(size_t)1,(size_t)(llen-1),hfp);
1957 }
1958 else
1959 {
1960 dbiblast_memfseek(hfp,start+1,0); /* skip the '>' character */
1961 dbiblast_memfreadS(hdrline,(size_t)1,(size_t)(llen-1),hfp);
1962 }
1963
1964 ajStrSetValidLen(hdrline, (llen-1));
1965
1966 return llen;
1967 }
1968