1 /* @source dbiblast application
2 **
3 ** Index blast databases
4 **
5 ** @author Copyright (C) Peter Rice, Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 /******************************************************************************
23 **
24 ** EMBOSS/Staden/EMBLCD indexing
25 **
26 ** This version reads a BLAST formatted database,
27 ** and writes entryname and accession index files.
28 **
29 ** It helps to know the format in order to
30 ** parse the entryname and accession number,
31 ** but it can guess if necessary.
32 **
33 ** It also helps to know the type (blast1 or blast2)
34 ** and the sequence type (protein or nucleic) but again
35 ** it can guess by looking at the file extensions.
36 **
37 ** To save memory, it is also helpful to know the maximum number of
38 ** entries in the database and the maximum entryname length so that
39 ** space can be preallocated for storage.
40 **
41 ** Entry names and accession numbers are held in list structures,
42 ** then converted to arrays and sorted.
43 **
44 ** Multiple input files are allowed.
45 **
46 ** EMBLCD and Staden index files use different names but have essentially
47 ** the same contents.
48 **
49 ******************************************************************************/
50 
51 #include "emboss.h"
52 #ifndef WIN32
53 #include <dirent.h>
54 #include <sys/wait.h>
55 #include <sys/stat.h>
56 #include <sys/mman.h>
57 #ifndef _AIX
58 #include <sys/fcntl.h>
59 #else
60 #include <fcntl.h>
61 #endif
62 #endif
63 #include <string.h>
64 
65 #ifndef MAP_FILE      /* Solaris does not have MAP_FILE */
66 #define MAP_FILE 0
67 #endif
68 
69 #define BLASTIDUNKNOWN 0
70 #define BLASTIDANY     1
71 #define BLASTIDNCBI    2
72 #define BLASTIDGCG     3
73 #define BLASTIDSANGER  4
74 #define BLASTPREFNCBI  1
75 
76 #ifdef WIN32
77 typedef char* caddr_t;
78 #endif
79 
80 #define TABLESIZE 10000
81 #define HDRSIZE 1000
82 
83 /* Definition of global variables */
84 
85 static AjPList*  dbiblastGFdl = NULL;
86 
87 static AjPRegexp dbiblastGWrdexp = NULL;
88 
89 static AjPStr dbiblastGId     = NULL;
90 static AjPStr dbiblastGHline  = NULL;
91 static AjPStr dbiblastGAcc    = NULL;
92 static AjPStr dbiblastGTmpT   = NULL;
93 static AjPStr dbiblastGTmpDes = NULL;
94 static AjPStr dbiblastGTmpFd  = NULL;
95 static AjPStr dbiblastGTmpAc  = NULL;
96 static AjPStr dbiblastGTmpSv  = NULL;
97 static AjPStr dbiblastGTmpGi  = NULL;
98 static AjPStr dbiblastGTmpDb  = NULL;
99 
100 static EmbPEntry dbiblastGEntry = NULL;
101 
102 
103 
104 
105 /* @datastatic PMemFile *******************************************************
106 **
107 ** DbiBlast in-memory file
108 **
109 ** @attr File [AjPFile] Ajax file
110 ** @attr IsMem [AjBool] True if in memory mapped
111 ** @attr Fd [ajint] Unix file descriptor (integer)
112 ** @attr Pos [ajlong] Position in file/memory
113 ** @attr Size [ajlong] Size of file/memory
114 ** @attr Name [AjPStr] Name of file
115 ** @attr Mem [caddr_t] Memory map
116 ******************************************************************************/
117 
118 typedef struct SMemFile
119 {
120   AjPFile File;
121   AjBool IsMem;
122   ajint Fd;
123   ajlong Pos;
124   ajlong Size;
125   AjPStr Name;
126   caddr_t Mem;
127 } OMemFile;
128 
129 #define PMemFile OMemFile*
130 
131 
132 
133 
134 /* @datastatic PBlastDb *******************************************************
135 **
136 ** DbiBlast database
137 **
138 ** @attr DbType [ajint] database type indicator
139 ** @attr DbFormat [ajint] database format (version) indicator
140 ** @attr IsProtein [ajint] 1 for protein
141 ** @attr IsBlast2 [ajint] 1 for blast2, 0 for blast1
142 ** @attr TitleLen [ajint] length of database title
143 ** @attr DateLen [ajint] length of database date string
144 ** @attr LineLen [ajint] length of database lines
145 ** @attr HeaderLen [ajint] bytes before tables start
146 ** @attr CompLen [ajint] length of compressed seq file
147 ** @attr MaxSeqLen [ajint] max. entry length
148 ** @attr TotLen [ajint] number of bases or residues in database
149 ** @attr CleanCount [ajint] count of cleaned 8mers
150 ** @attr TopCmp [ajint] bytes before compressed table starts
151 ** @attr TopSrc [ajint] bytes before source table starts
152 ** @attr TopHdr [ajint] bytes before headers table starts
153 ** @attr TopAmb [ajint] bytes before ambiguity table starts
154 ** @attr IdType [ajint] ID type
155 ** @attr IdPrefix [ajint] ID prefix type
156 ** @attr TFile [PMemFile] table of offsets, also DB info
157 ** @attr HFile [PMemFile] description lines
158 ** @attr SFile [PMemFile] binary sequence data
159 ** @attr FFile [PMemFile] source sequence data
160 ** @attr Title [AjPStr] database title
161 ** @attr Date [AjPStr] database date
162 ** @attr Name [AjPStr] database base file name
163 ** @attr Size [ajint] number of database entries
164 ** @attr Padding [char[4]] Padding to alignment boundary
165 ******************************************************************************/
166 
167 typedef struct SBlastDb
168 {
169   ajint DbType;
170   ajint DbFormat;
171   ajint IsProtein;
172   ajint IsBlast2;
173   ajint TitleLen;
174   ajint DateLen;
175   ajint LineLen;
176   ajint HeaderLen;
177   ajint CompLen;
178   ajint MaxSeqLen;
179   ajint TotLen;
180   ajint CleanCount;
181   ajint TopCmp;
182   ajint TopSrc;
183   ajint TopHdr;
184   ajint TopAmb;
185   ajint IdType;
186   ajint IdPrefix;
187   PMemFile TFile;
188   PMemFile HFile;
189   PMemFile SFile;
190   PMemFile FFile;
191   AjPStr Title;
192   AjPStr Date;
193   AjPStr Name;
194   ajint Size;
195   char Padding[4];
196 } OBlastDb;
197 
198 #define PBlastDb OBlastDb*
199 
200 
201 
202 
203 /* @datastatic PBlastType *****************************************************
204 **
205 ** DbiBlast types
206 **
207 ** @attr ExtT [const char*] Table filename extension
208 ** @attr ExtH [const char*] Header filename extension
209 ** @attr ExtS [const char*] Sequence filename extension
210 ** @attr IsProtein [AjBool] true for protein
211 ** @attr IsBlast2 [AjBool] blast2.x or blast 1.x
212 ** @attr Type [ajint] enumerated type
213 ** @attr Padding [char[4]] Padding to alignment boundary
214 ******************************************************************************/
215 
216 typedef struct SBlastType
217 {
218   const char* ExtT;
219   const char* ExtH;
220   const char* ExtS;
221   AjBool  IsProtein;
222   AjBool IsBlast2;
223   ajint   Type;
224   char Padding[4];
225 } OBlastType;
226 
227 #define PBlastType OBlastType*
228 
229 
230 
231 
232 enum blastdbtype {BLAST1P, BLAST1N, BLAST2P, BLAST2N};
233 
234 static OBlastType blasttypes[] =
235 {
236   {"atb", "ahd", "bsq",	AJTRUE,  AJFALSE, BLAST1P, ""},
237   {"ntb", "nhd", "csq", AJFALSE, AJFALSE, BLAST1N, ""},
238   {"pin", "phr", "psq", AJTRUE,  AJTRUE, BLAST2P, ""},
239   {"nin", "nhr", "nsq", AJFALSE, AJTRUE, BLAST2N, ""},
240   {NULL, NULL, NULL, 0, 0, 0, ""}
241 };
242 
243 
244 
245 
246 static AjBool dbiblast_parseNcbi(const AjPStr line, AjPFile * alistfile,
247 				 AjBool systemsort, AjPStr const * fields,
248 				 ajint* maxFieldLen,
249 				 ajuint* countfield,
250 				 AjPStr* myid, AjPList* myfdl);
251 static AjBool dbiblast_parseGcg(const AjPStr line, AjPFile * alistfile,
252 				AjBool systemsort, AjPStr const * fields,
253 				ajint* maxFieldLen,
254 				ajuint* countfield,
255 				AjPStr* myid, AjPList* myfdl);
256 static AjBool dbiblast_parseSimple(const AjPStr line,
257 				   AjPFile * alistfile,
258 				   AjBool systemsort, AjPStr const * fields,
259 				   ajint* maxFieldLen,
260 				   ajuint* countfield,
261 				   AjPStr* myid, AjPList* myfdl);
262 static AjBool dbiblast_parseId(const AjPStr line, AjPFile * alistfile,
263 			       AjBool systemsort, AjPStr const * fields,
264 			       ajint* maxFieldLen,
265 			       ajuint* countfield,
266 			       AjPStr* myid, AjPList* myfdl);
267 static AjBool dbiblast_parseUnknown(const AjPStr line,
268 				    AjPFile * alistfile,
269 				    AjBool systemsort, AjPStr const * fields,
270 				    ajint* maxFieldLen,
271 				    ajuint* countfield,
272 				    AjPStr* myid, AjPList* myfdl);
273 
274 
275 
276 
277 /* @datastatic OParser ********************************************************
278 **
279 ** Parser definition structure
280 **
281 ** @alias SParser
282 ** @alias OParser
283 **
284 ** @attr Name [const char*] Parser name
285 ** @attr Parser [AjBool function] Parser function
286 ** @@
287 ******************************************************************************/
288 
289 typedef struct SParser
290 {
291   const char* Name;
292   AjBool (*Parser) (const AjPStr line, AjPFile * alistfile,
293 		    AjBool systemsort, AjPStr const * fields,
294 		    ajint* maxFieldLen, ajuint* countfield,
295 		    AjPStr* myid, AjPList* myfdl);
296 } OParser;
297 
298 static OParser parser[] =
299 {
300   {"NCBI", dbiblast_parseNcbi},
301   {"GCG", dbiblast_parseGcg},
302   {"SIMPLE", dbiblast_parseSimple},
303   {"ID", dbiblast_parseId},
304   {"UNKNOWN", dbiblast_parseUnknown},
305   {NULL, NULL}
306 };
307 
308 static EmbPEntry dbiblast_nextblastentry(PBlastDb db, ajint ifile,
309 					 const AjPStr idformat,
310 					 AjBool systemsort,
311 					 AjPStr const * fields,
312 					 ajint* maxFieldLen,
313 					 ajuint* maxidlen,
314 					 ajuint* countfield,
315 					 AjPFile elistfile,
316 					 AjPFile * alistfile);
317 static AjBool dbiblast_blastopenlib(const AjPStr lname, AjBool usesrc,
318 				    ajint blastv, char dbtype,
319 				    PBlastDb* pdb);
320 
321 static void dbiblast_dbfree(PBlastDb* pdb);
322 
323 static void dbiblast_dbname(AjPStr* dbname,
324 			    const AjPStr oname, const char *suff);
325 static void dbiblast_newname(AjPStr* nname,
326 			     const AjPStr oname, const char *suff);
327 
328 static void dbiblast_memreadUInt4(PMemFile fd, ajuint *val);
329 
330 static PMemFile dbiblast_memfopenfile(const AjPStr name);
331 static void dbiblast_memfclosefile(PMemFile* pfd);
332 static size_t dbiblast_memfseek(PMemFile mf, ajlong offset, ajint whence);
333 static size_t dbiblast_memfread(void* dest, size_t size, size_t num_items,
334 				PMemFile mf);
335 static size_t dbiblast_memfreadS(AjPStr* dest, size_t size, size_t num_items,
336 				 PMemFile mf);
337 
338 static ajint dbiblast_loadtable(ajuint* table, ajint isize, PBlastDb db,
339 				ajint top, ajint pos);
340 static ajint dbiblast_ncblreadhdr(AjPStr* hdrline, PBlastDb db,
341 				  ajint start, ajint end);
342 static AjBool dbiblast_wrongtype(const AjPStr oname, const char *suff);
343 
344 static AjBool readReverse = AJFALSE;
345 
346 
347 
348 
349 /* @prog dbiblast *************************************************************
350 **
351 ** Index a BLAST database
352 **
353 ******************************************************************************/
354 
main(int argc,char ** argv)355 int main(int argc, char **argv)
356 {
357 
358     AjPList idlist;
359     AjPList* fieldList = NULL;
360 
361     AjBool systemsort;
362     AjBool cleanup;
363 
364     ajint blastv = 0;
365     char dbtype  = '\0';
366 
367     ajuint maxindex;
368     ajuint maxidlen = 0;
369     ajuint maxlen;
370 
371     AjPStr version = NULL;
372     AjPStr seqtype = NULL;
373 
374     AjPFile elistfile  = NULL;
375     AjPFile* alistfile = NULL;
376 
377     AjPStr dbname   = NULL;
378     AjPStr release  = NULL;
379     AjPStr datestr  = NULL;
380     AjPStr sortopt  = NULL;
381     void **entryIds = NULL;
382 
383     AjBool usesrc = AJTRUE;
384 
385     AjPStr directory;
386     AjPStr indexdir;
387     AjPStr filename;
388     AjPStr exclude;
389     AjPStr curfilename = NULL;
390 
391     AjPStr idformat = NULL;
392 
393     EmbPEntry entry;
394 
395     PBlastDb db = NULL;
396 
397     ajuint idCount = 0;
398     ajuint idDone;
399     AjPList listTestFiles = NULL;
400     void ** testFiles = NULL;
401     ajuint nfiles;
402     ajuint ifile;
403     ajuint jfile;
404 
405     ajuint filesize;
406     short recsize;
407     ajuint maxfilelen = 20;
408     char date[4] =
409     {
410 	0,0,0,0
411     };
412 
413     AjPStr tmpfname = NULL;
414     AjPStr* fields  = NULL;
415 
416     AjPFile entFile = NULL;
417 
418     AjPStr* divfiles   = NULL;
419     ajint* maxFieldLen = NULL;
420 
421     ajuint ifield  = 0;
422     ajuint nfields = 0;
423 
424     AjPFile logfile = NULL;
425     ajuint* countField = NULL;
426     ajuint* fieldTot = NULL;
427     ajuint idCountFile = 0;
428     ajuint i = 0;
429 
430     embInit("dbiblast", argc, argv);
431 
432     idformat = ajStrNewC("NCBI");
433 
434     fields     = ajAcdGetList("fields");
435     directory  = ajAcdGetDirectoryName("directory");
436     indexdir   = ajAcdGetOutdirName("indexoutdir");
437     filename   = ajAcdGetString("filenames");
438     exclude    = ajAcdGetString("exclude");
439     dbname     = ajAcdGetString("dbname");
440     release    = ajAcdGetString("release");
441     datestr    = ajAcdGetString("date");
442     systemsort = ajAcdGetBoolean("systemsort");
443     cleanup    = ajAcdGetBoolean("cleanup");
444     sortopt    = ajAcdGetString("sortoptions");
445     maxindex   = ajAcdGetInt("maxindex");
446     version    = ajAcdGetListSingle("blastversion");
447     seqtype    = ajAcdGetListSingle("seqtype");
448     usesrc     = ajAcdGetBoolean("sourcefile");
449     logfile    = ajAcdGetOutfile("outfile");
450 
451     while(fields[nfields])		/* array ends with a NULL */
452 	nfields++;
453 
454     if(nfields)
455     {
456 	AJCNEW(maxFieldLen, nfields);
457 	AJCNEW0(countField, nfields);
458 	AJCNEW0(fieldTot, nfields);
459 	for(ifield=0; ifield < nfields; ifield++)
460 	    maxFieldLen[ifield] = (ajint) maxindex * -1;
461 
462 	if(systemsort)
463 	    AJCNEW(alistfile, nfields);
464 	else
465 	{
466 	    AJCNEW(fieldList, nfields);
467 	    for(ifield=0; ifield < nfields; ifield++)
468 		fieldList[ifield] = ajListNew();
469 	}
470     }
471 
472     if(ajStrMatchC(datestr, "00/00/00"))
473 	ajFmtPrintS(&datestr, "%D", ajTimeRefTodayFmt("dbindex"));
474 
475     ajStrRemoveWhite(&dbname);		/* used for temp filenames */
476     embDbiDateSet(datestr, date);
477     idlist = ajListNew();
478 
479     if(ajUtilGetBigendian())
480 	readReverse = ajFalse;
481     else
482 	readReverse = ajTrue;
483 
484     ajStrToInt(version, &blastv);
485     dbtype = ajStrGetCharFirst(seqtype);
486 
487     ajDebug("reading '%S/%S'\n", directory, filename);
488     ajDebug("writing '%S/'\n", indexdir);
489 
490     listTestFiles = embDbiFileListExc(directory, filename, exclude);
491     ajListSort(listTestFiles, &ajStrVcmp);
492     nfiles = (ajuint) ajListToarray(listTestFiles, &testFiles);
493     if(!nfiles)
494         ajDie("No input files in '%S' matched filename '%S'",
495               directory, filename);
496 
497     embDbiLogHeader(logfile, dbname, release, datestr,
498 		     indexdir, maxindex);
499 
500     embDbiLogFields(logfile, fields, nfields);
501     embDbiLogSource(logfile, directory, filename, exclude,
502 		    (AjPStr*) testFiles, nfiles);
503     embDbiLogCmdline(logfile);
504 
505     AJCNEW0(divfiles, nfiles);
506 
507     /*
508     ** process each input file, one at a time
509     */
510 
511     jfile = 0;
512     for(ifile=0; ifile < nfiles; ifile++)
513     {
514 	curfilename = (AjPStr) testFiles[ifile];
515 	if(!dbiblast_blastopenlib(curfilename,
516 				  usesrc, blastv, dbtype, &db))
517 	    continue;	 /* could be the wrong file type with "*.*" */
518 
519 	ajDebug("processing filename '%S' ...\n", curfilename);
520 	ajDebug("processing file '%S' ...\n", db->TFile->Name);
521 
522 
523 	ajStrAssignS(&divfiles[jfile], db->TFile->Name);
524 	ajFilenameTrimPath(&divfiles[jfile]);
525 	if(ajStrGetLen(divfiles[jfile]) >= maxfilelen)
526 	    maxfilelen = ajStrGetLen(divfiles[jfile]) + 1;
527 
528 	if(systemsort)	 /* elistfile for entries, alist for fields */
529 	    elistfile = embDbiSortOpen(alistfile, jfile,
530 				       dbname, fields, nfields);
531 
532 	idCountFile = 0;
533 	for(i=0;i<nfields;i++)
534 	    countField[i] = 0;
535 	while((entry=dbiblast_nextblastentry(db, jfile,
536 					     idformat, systemsort,
537 					     fields,
538 					     maxFieldLen,
539 					     &maxidlen, countField,
540 					     elistfile, alistfile)))
541 	{
542 	    idCountFile++;
543 	    if(!systemsort)	    /* save the entry data in lists */
544 	    {
545 		embDbiMemEntry(idlist, fieldList, nfields, entry, jfile);
546 	    }
547 	}
548 	idCount += idCountFile;
549 	if(systemsort)
550 	{
551 	    embDbiSortClose(&elistfile, alistfile, nfields);
552 	    /* lost the entry, so can't free it :-) */
553 	}
554 
555 	embDbiLogFile(logfile, curfilename, idCountFile, fields,
556 		      countField, nfields);
557 	dbiblast_dbfree(&db);
558 	jfile++;
559     }
560     nfiles = jfile;
561 
562     /*
563     ** write the division.lkp file
564     */
565 
566     embDbiWriteDivision(indexdir, dbname, release, date,
567 			maxfilelen, nfiles, divfiles, NULL);
568 
569     /*
570     ** Write the entryname.idx index
571     */
572 
573     ajStrAssignC(&tmpfname, "entrynam.idx");
574     entFile = ajFileNewOutNamePathS(tmpfname, indexdir);
575 
576     recsize = maxidlen+10;
577     filesize = 300 + (idCount*(ajint)recsize);
578     embDbiHeader(entFile, filesize, idCount, recsize, dbname, release, date);
579 
580     if(systemsort)
581         idDone = embDbiSortWriteEntry(entFile, maxidlen,
582 				      dbname, nfiles, cleanup, sortopt);
583     else			  /* save entries in entryIds array */
584     {
585         idDone = embDbiMemWriteEntry(entFile, maxidlen,
586 				     idlist, &entryIds);
587 	if(idDone != idCount)
588 	    ajFatal("Duplicates not allowed for in-memory processing");
589     }
590 
591     embDbiHeaderSize(entFile, 300+(idDone*(ajint)recsize), idDone);
592     ajFileClose(&entFile);
593 
594     /*
595     ** Write the fields index files
596     */
597 
598     for(ifield=0; ifield < nfields; ifield++)
599     {
600 
601         if(maxindex)
602 	    maxlen = maxindex;
603 	else
604 	{
605 	    if(maxFieldLen[ifield] >= 0)
606 		maxlen = maxFieldLen[ifield];
607 	    else
608 		maxlen = - maxFieldLen[ifield];
609 	}
610 
611         if(systemsort)
612 	    fieldTot[ifield] = embDbiSortWriteFields(dbname, release,
613 						     date, indexdir,
614 						     fields[ifield], maxlen,
615 						     nfiles, idCount,
616 						     cleanup, sortopt);
617 	else
618 	    fieldTot[ifield] = embDbiMemWriteFields(dbname, release,
619 						    date, indexdir,
620 						    fields[ifield], maxlen,
621 						    fieldList[ifield],
622 						    entryIds);
623     }
624 
625     embDbiLogFinal(logfile,maxindex, maxFieldLen, fields, fieldTot,
626 		   nfields, nfiles, idDone, idCount);
627 
628     if(systemsort)
629 	embDbiRmEntryFile(dbname, cleanup);
630 
631     ajListMap(idlist, &embDbiEntryDelMap, NULL);
632     ajListFree(&idlist);
633     AJFREE(entryIds);
634 
635     ajStrDelarray(&fields);
636 
637     for(i=0;i<nfields;i++)
638     {
639 	if(systemsort)
640 	{
641 	    ajFileClose(&alistfile[i]);
642 	}
643 	else
644 	{
645 	    ajListMap(fieldList[i], &embDbiFieldDelMap, NULL);
646 	    ajListFree(&fieldList[i]);
647 	}
648     }
649     AJFREE(alistfile);
650     AJFREE(fieldList);
651     ajStrDel(&version);
652     ajStrDel(&seqtype);
653     ajFileClose(&elistfile);
654     for(i=0;i<nfiles;i++)
655     {
656 	ajStrDel(&divfiles[i]);
657     }
658     AJFREE(countField);
659     AJFREE(fieldTot);
660 
661     ajStrDel(&dbname);
662     ajStrDel(&release);
663     ajStrDel(&datestr);
664     ajStrDel(&sortopt);
665     ajStrDel(&directory);
666     ajStrDel(&indexdir);
667     ajStrDel(&filename);
668     ajStrDel(&exclude);
669     ajStrDel(&idformat);
670     ajStrDel(&tmpfname);
671 
672     AJFREE(maxFieldLen);
673 
674     ajFileClose(&logfile);
675 
676     ajListstrFreeData(&listTestFiles);
677 
678     ajStrDel(&dbiblastGTmpT);
679     ajStrDel(&dbiblastGId);
680     ajStrDel(&dbiblastGAcc);
681     ajStrDel(&dbiblastGHline);
682     ajStrDel(&dbiblastGTmpDes);
683     ajStrDel(&dbiblastGTmpFd);
684     ajStrDel(&dbiblastGTmpGi);
685     ajStrDel(&dbiblastGTmpDb);
686     ajStrDel(&dbiblastGTmpAc);
687     ajStrDel(&dbiblastGTmpSv);
688     ajRegFree(&dbiblastGWrdexp);
689 
690     embDbiEntryDel(&dbiblastGEntry);
691 
692     if(dbiblastGFdl)
693     {
694         for(i=0; i < nfields; i++)
695             ajListFree(&dbiblastGFdl[i]);
696         AJFREE(dbiblastGFdl);
697     }
698 
699     for(i=0;i<nfiles;i++)
700     {
701         ajStrDel(&divfiles[i]);
702     }
703     AJFREE(divfiles);
704     AJFREE(testFiles);
705 
706     embExit();
707 
708     return 0;
709 }
710 
711 
712 
713 
714 /* @funcstatic dbiblast_nextblastentry ****************************************
715 **
716 ** Returns next  database entry as an EmbPEntry object
717 **
718 ** @param [u] db [PBlastDb] Blast database object
719 ** @param [r] ifile [ajint] File number.
720 ** @param [r] idformat [const AjPStr] Id format in FASTA file
721 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
722 ** @param [r] fields [AjPStr const *] Field names to be indexed
723 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
724 ** @param [w] maxidlen [ajuint*] Maximum entry ID length
725 ** @param [w] countfield [ajuint*] Number of tokens for each field
726 ** @param [u] elistfile [AjPFile] entry file
727 ** @param [u] alistfile [AjPFile *] field data files array
728 ** @return [EmbPEntry] Entry data object.
729 ** @@
730 ******************************************************************************/
731 
dbiblast_nextblastentry(PBlastDb db,ajint ifile,const AjPStr idformat,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * maxidlen,ajuint * countfield,AjPFile elistfile,AjPFile * alistfile)732 static EmbPEntry dbiblast_nextblastentry(PBlastDb db, ajint ifile,
733 					 const AjPStr idformat,
734 					 AjBool systemsort,
735 					 AjPStr const * fields,
736 					 ajint * maxFieldLen,
737 					 ajuint* maxidlen, ajuint* countfield,
738 					 AjPFile elistfile,
739 					 AjPFile * alistfile)
740 {
741     ajint i;
742     static ajint lastfile = -1;
743     static ajint iparser  = -1;
744     static ajint called   = 0;
745     static ajuint tabhdr[TABLESIZE];
746     static ajint iload  = TABLESIZE-1;
747     static ajint irest  = 0;
748     static ajint ipos   = 0;
749 
750     static ajint jpos = 0;
751     ajint ir;
752     ajint j;
753     static ajint is = 0;
754     char* token;
755     static ajint nfields;
756     ajint ifield;
757 
758     if(!called)
759     {
760 	for(i=0; parser[i].Name; i++)
761 	    if(ajStrMatchC(idformat, parser[i].Name))
762 	    {
763 		iparser = i;
764 		break;
765 	    }
766 
767 	if(iparser < 0)
768 	    ajFatal("idformat '%S' unknown", idformat);
769 	ajDebug("idformat '%S' Parser %d\n", idformat, iparser);
770 	ajStrSetRes(&dbiblastGId, HDRSIZE);
771 	ajStrSetRes(&dbiblastGAcc, HDRSIZE);
772 	ajStrSetRes(&dbiblastGHline, HDRSIZE);
773 	called = 1;
774     }
775 
776     if(!dbiblastGFdl)
777     {
778 	nfields=0;
779 	while(fields[nfields])
780 	    nfields++;
781 	if(nfields)
782 	    AJCNEW(dbiblastGFdl, nfields);
783 	for(i=0; i < nfields; i++)
784 	    dbiblastGFdl[i] = ajListNew();
785     }
786 
787     if(lastfile != ifile)
788     {
789 	lastfile = ifile;
790 	ipos = 1;
791 	/*    isize = 0;*/
792 	irest = 0;
793 	iload = TABLESIZE-1;
794     }
795 
796     if(!dbiblastGEntry || !systemsort)
797 	dbiblastGEntry = embDbiEntryNew(nfields);
798 
799     /* pick up the next entry, parse it and dump it */
800 
801     if(ipos > db->Size)
802 	return NULL;
803 
804     if( ipos >= irest)
805     {
806 	ajDebug("ipos: %d iload: %d irest: %d\n", ipos, iload, irest);
807 	irest = ipos + TABLESIZE - 2;
808 	if(irest > db->Size)
809 	{
810 	    iload = db->Size - ipos + 1;
811 	    irest = db->Size;
812 	}
813 
814 	jpos=0;
815 	j = dbiblast_loadtable(tabhdr, iload, db, db->TopHdr, ipos-1);
816 	if(!j)
817 	    ajDebug("No elements read");
818     }
819 
820     j = dbiblast_ncblreadhdr(&dbiblastGHline, db, tabhdr[jpos], tabhdr[jpos+1]);
821 
822     if(!(*parser[iparser].Parser)(dbiblastGHline, alistfile, systemsort,
823                                   fields, maxFieldLen, countfield,
824                                   &dbiblastGId, dbiblastGFdl))
825 	ajFatal("failed to parse '%S'", dbiblastGHline);
826 
827     ir = ipos;
828 
829     if(ajStrGetLen(dbiblastGId) > *maxidlen)
830 	*maxidlen = ajStrGetLen(dbiblastGId);
831 
832     if(systemsort)
833 	ajFmtPrintF(elistfile, "%S %d %d %d\n",
834                     dbiblastGId, ir, is, ifile+1);
835     else
836     {
837 	dbiblastGEntry->entry   = ajCharNewS(dbiblastGId);
838 	dbiblastGEntry->rpos    = ir;
839 	dbiblastGEntry->spos    = is;
840 	dbiblastGEntry->filenum = ifile+1;
841 
842 	/* field tokens as list, then move to dbiblastGEntry->field */
843 	for(ifield=0; ifield < nfields; ifield++)
844 	{
845 	    dbiblastGEntry->nfield[ifield] =
846             (ajuint) ajListGetLength(dbiblastGFdl[ifield]);
847 
848 	    if(dbiblastGEntry->nfield[ifield])
849 	    {
850 		AJCNEW(dbiblastGEntry->field[ifield],
851 		       dbiblastGEntry->nfield[ifield]);
852 
853 		i = 0;
854 		while(ajListPop(dbiblastGFdl[ifield], (void**) &token))
855 		    dbiblastGEntry->field[ifield][i++] = token;
856 	    }
857 	    else
858 		dbiblastGEntry->field[ifield] = NULL;
859 	}
860     }
861     ipos++;
862     jpos++;
863 
864     return dbiblastGEntry;
865 }
866 
867 
868 
869 
870 /* @funcstatic dbiblast_dbfree ************************************************
871 **
872 ** Free BLAST library object
873 **
874 ** @param [u] pdb [PBlastDb*] Blast dababase structure.
875 ** @return [void]
876 ** @@
877 ******************************************************************************/
878 
dbiblast_dbfree(PBlastDb * pdb)879 static void dbiblast_dbfree( PBlastDb* pdb)
880 {
881     PBlastDb db;
882 
883     if(!pdb)
884 	return;
885 
886     if(!*pdb)
887 	return;
888 
889     db = *pdb;
890 
891     dbiblast_memfclosefile(&db->TFile);
892     dbiblast_memfclosefile(&db->HFile);
893     dbiblast_memfclosefile(&db->SFile);
894     dbiblast_memfclosefile(&db->FFile);
895 
896     ajStrDel(&db->Name);
897     ajStrDel(&db->Date);
898     ajStrDel(&db->Title);
899 
900     AJFREE(*pdb);
901 
902     return;
903 }
904 
905 
906 
907 
908 /* @funcstatic dbiblast_blastopenlib ******************************************
909 **
910 ** Open BLAST library
911 **
912 ** @param [r] name [const AjPStr] Source file name
913 ** @param [r] usesrc [AjBool] If ajTrue, use the source (fasta) file
914 ** @param [r] blastv [ajint] Blast version number (1 or 2)
915 ** @param [r] dbtype [char] Blast database type (p)rotein or (n)ucleotide
916 ** @param [u] pdb [PBlastDb*] Blast dababase structure.
917 ** @return [AjBool] ajTrue on success
918 ** @@
919 ******************************************************************************/
920 
dbiblast_blastopenlib(const AjPStr name,AjBool usesrc,ajint blastv,char dbtype,PBlastDb * pdb)921 static AjBool dbiblast_blastopenlib(const AjPStr name, AjBool usesrc,
922 				    ajint blastv, char dbtype,
923 				    PBlastDb* pdb)
924 {
925 
926     AjPStr hname = NULL;
927     AjPStr sname = NULL;
928     AjPStr tname = NULL;
929     static AjPStr dbname = NULL;
930     ajint rdtmp  = 0;
931     ajint rdtmp2 = 0;
932     ajint itype;
933     ajint ttop;
934     PMemFile TFile = NULL;
935 
936     PBlastDb ret;
937 
938     for(itype=0; blasttypes[itype].ExtT; itype++)
939     {
940 	if((blastv == 1) && blasttypes[itype].IsBlast2)
941 	    continue;
942 
943 	if((blastv == 2) && !blasttypes[itype].IsBlast2)
944 	    continue;
945 
946 	if((dbtype == 'P') && !blasttypes[itype].IsProtein)
947 	    continue;
948 
949 	if((dbtype == 'N') && blasttypes[itype].IsProtein)
950 	    continue;
951 
952 	if(dbiblast_wrongtype(name, blasttypes[itype].ExtT))
953 	    continue;
954 
955         dbiblast_dbname(&dbname,name,blasttypes[itype].ExtT);
956 	dbiblast_newname(&tname,dbname,blasttypes[itype].ExtT);
957 	TFile = dbiblast_memfopenfile(tname);
958 
959 	if(TFile)
960 	    break;
961     }
962 
963     if(!TFile)
964 	return ajFalse;
965 
966     AJNEW0(*pdb);
967 
968     ret = *pdb;
969 
970     ret->TFile = TFile;
971 
972     ajStrAssignS(&ret->Name, dbname);
973     ajDebug("Name '%S'\n", ret->Name);
974 
975     /* find and open the 'table' file(s) */
976 
977     if(!ret->TFile)
978 	ajFatal(" cannot open %S table file %S\n", dbname, tname);
979 
980     ajDebug("Successfully opened table file for type %d\n", itype);
981 
982     ret->IsProtein = blasttypes[itype].IsProtein;
983     ret->IsBlast2  = blasttypes[itype].IsBlast2;
984 
985     /* read the type and format - all databases */
986 
987     dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DbType);
988     dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DbFormat);
989     ret->HeaderLen += 8;
990 
991     ajDebug("dbtype: %x dbformat: %x\n", ret->DbType, ret->DbFormat);
992 
993     /* Open the header and (compressed) sequence files */
994     /* for DNA, also look for the FASTA file */
995 
996     dbiblast_newname(&hname,dbname,blasttypes[itype].ExtH);
997     if((ret->HFile = dbiblast_memfopenfile(hname))==NULL)
998 	ajFatal(" cannot open %S header file\n",hname);
999 
1000 
1001     dbiblast_newname(&sname,dbname,blasttypes[itype].ExtS);
1002     if((ret->SFile = dbiblast_memfopenfile(sname))==NULL)
1003 	ajFatal(" cannot open %S sequence file\n",sname);
1004 
1005 
1006     if(!ret->IsBlast2 && !ret->IsProtein && usesrc)
1007 	/* this can fail */
1008 	if((ret->FFile = dbiblast_memfopenfile(dbname))==NULL)
1009 	    ajDebug(" cannot open %S source file\n",dbname);
1010 
1011     /* read the title - all formats */
1012     dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TitleLen);
1013 
1014     /* blast2 does not align after the title */
1015     if(ret->IsBlast2)
1016 	rdtmp = ret->TitleLen;
1017     else
1018 	rdtmp = ret->TitleLen + ((ret->TitleLen%4 !=0 ) ?
1019 				 4-(ret->TitleLen%4) : 0);
1020     ajStrAssignResC(&ret->Title, rdtmp+1, "");
1021     ajDebug("IsBlast2: %B title_len: %d rdtmp: %d title_str: '%S'\n",
1022 	    ret->IsBlast2, ret->TitleLen, rdtmp, ret->Title);
1023     ajStrTrace(ret->Title);
1024     dbiblast_memfreadS(&ret->Title,(size_t)1,(size_t)rdtmp,ret->TFile);
1025 
1026     if(ret->IsBlast2)
1027 	ajStrSetValidLen(&ret->Title, ret->TitleLen);
1028     else
1029 	ajStrSetValidLen(&ret->Title, ret->TitleLen-1);
1030 
1031     ajDebug("title_len: %d rdtmp: %d title_str: '%S'\n",
1032 	    ret->TitleLen, rdtmp, ret->Title);
1033 
1034     ret->HeaderLen += 4 + rdtmp;
1035 
1036     /* read the date - blast2 */
1037     if(ret->IsBlast2)
1038     {
1039 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->DateLen);
1040 	rdtmp2 = ret->DateLen;
1041 	ajStrAssignResC(&ret->Date, rdtmp2+1, "");
1042 	dbiblast_memfreadS(&ret->Date,(size_t)1,(size_t)rdtmp2,ret->TFile);
1043 	ajStrSetValid(&ret->Date);
1044 	ret->DateLen = ajStrGetLen(ret->Date);
1045 	ajDebug("datelen: %d rdtmp: %d date: '%S'\n",
1046 		ret->DateLen, rdtmp2, ret->Date);
1047 	ret->HeaderLen += 4 + rdtmp2;
1048     }
1049 
1050     /* read the rest of the header (different for protein and DNA) */
1051     if(!ret->IsBlast2 && !ret->IsProtein)
1052     {
1053 	/* length of source lines */
1054 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->LineLen);
1055 	ret->HeaderLen += 4;
1056     }
1057 
1058     /* all formats have the next 3 */
1059     dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->Size);
1060     if(ret->IsProtein)
1061     {			 /* mad, but they are the other way for DNA */
1062 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TotLen);
1063 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->MaxSeqLen);
1064     }
1065     else
1066     {
1067 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->MaxSeqLen);
1068 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->TotLen);
1069     }
1070 
1071     ret->HeaderLen += 12;
1072 
1073     if(!ret->IsBlast2 && !ret->IsProtein)
1074     {					/* Blast 1.4 DNA only */
1075 	/* compressed db length */
1076 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->CompLen);
1077 	/* count of nt's cleaned */
1078 	dbiblast_memreadUInt4(ret->TFile,(ajuint*)&ret->CleanCount);
1079 	ret->HeaderLen += 8;
1080     }
1081 
1082     ajDebug(" size: %u, totlen: %d maxseqlen: %u\n",
1083 	    ret->Size, ret->TotLen, ret->MaxSeqLen);
1084     ajDebug(" linelen: %u, complen: %d cleancount: %d\n",
1085 	    ret->LineLen, ret->CompLen, ret->CleanCount);
1086 
1087 
1088     /* Now for the tables of offsets. Again maddeningly different in each */
1089     if(ret->IsBlast2)
1090     {
1091 	ttop = ret->TopHdr = ret->HeaderLen; /* header first */
1092 	ttop = ret->TopCmp = ttop + (ret->Size+1) * 4; /* then sequence */
1093 	if(!ret->IsProtein)		/* Blast 2 DNA only */
1094 	    ttop = ret->TopAmb = ttop + (ret->Size+1) * 4;
1095     }
1096     else
1097     {
1098 	ttop = ret->TopCmp = ret->HeaderLen + ret->CleanCount*4; /* comp seq */
1099 	if(!ret->IsProtein)		/* Blast 1.4 DNA only */
1100 	    ttop = ret->TopSrc = ttop + (ret->Size+1) * 4;
1101 	ttop = ret->TopHdr = ttop + (ret->Size+1) * 4; /* headers for all */
1102 	if(!ret->IsProtein)		/* Blast 1.4 DNA only */
1103 	    ttop = ret->TopAmb = ttop + (ret->Size+1) * 4;
1104     }
1105 
1106     ajDebug("table file index  starts at %d\n", ret->HeaderLen);
1107     ajDebug("table file csq    starts at %d\n", ret->TopCmp);
1108     ajDebug("table file src    starts at %d\n", ret->TopSrc);
1109     ajDebug("table file hdr    starts at %d\n", ret->TopHdr);
1110     ajDebug("table file amb    starts at %d\n", ret->TopAmb);
1111 
1112     ajStrDel(&hname);
1113     ajStrDel(&sname);
1114     ajStrDel(&tname);
1115     ajStrDel(&dbname);
1116 
1117     return ajTrue;
1118 }
1119 
1120 
1121 
1122 
1123 /* @funcstatic dbiblast_parseNcbi *********************************************
1124 **
1125 ** Parses an NCBI style header from the BLAST header table.
1126 **
1127 ** @param [r] line [const AjPStr] Input line
1128 ** @param [u] alistfile [AjPFile *] List of field temporary files
1129 ** @param [r] systemsort [AjBool] If ajTrue, use the system sort utility,
1130 **                                else sort in memory
1131 ** @param [r] fields [AjPStr const*] Field names
1132 ** @param [w] maxFieldLen [ajint*] Maximum token lengths for each field
1133 ** @param [w] countfield [ajuint*] Number of tokens for each field
1134 ** @param [w] myid [AjPStr*] ID
1135 ** @param [w] fdlist [AjPList *] Field token lists
1136 **                                  (one list for each field)
1137 ** @return [AjBool] ajTrue on success
1138 ** @@
1139 ******************************************************************************/
1140 
dbiblast_parseNcbi(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * fdlist)1141 static AjBool dbiblast_parseNcbi(const AjPStr line, AjPFile * alistfile,
1142 				 AjBool systemsort, AjPStr const * fields,
1143 				 ajint* maxFieldLen,
1144 				 ajuint* countfield,
1145 				 AjPStr* myid,
1146 				 AjPList* fdlist)
1147 {
1148     char* fd;
1149 
1150     static ajint numFields;
1151     static ajint accfield = -1;
1152     static ajint desfield = -1;
1153     static ajint svnfield = -1;
1154     static AjBool reset = AJTRUE;
1155 
1156     if(!fields)
1157     {
1158 	reset = ajTrue;
1159 	accfield = svnfield = desfield = -1;
1160 	return ajFalse;
1161     }
1162 
1163     if(reset)
1164     {
1165 	numFields = 0;
1166 	while(fields[numFields])
1167 	{
1168 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1169 		accfield=numFields;
1170 	    else if(ajStrMatchCaseC(fields[numFields], "sv"))
1171 		svnfield=numFields;
1172 	    else if(ajStrMatchCaseC(fields[numFields], "des"))
1173 		desfield=numFields;
1174 	    else
1175 		ajWarn("EMBL parsing unknown field '%S' ignored",
1176 		       fields[numFields]);
1177 	    numFields++;
1178 	}
1179 	reset = ajFalse;
1180     }
1181 
1182     if(!dbiblastGWrdexp)
1183 	dbiblastGWrdexp = ajRegCompC("([A-Za-z0-9]+)");
1184 
1185     ajStrAssignC(&dbiblastGTmpDes, "");
1186     ajStrAssignC(&dbiblastGTmpT,   "");
1187     ajStrAssignC(&dbiblastGTmpAc,  "");
1188     ajStrAssignC(&dbiblastGTmpSv,  "");
1189     ajStrAssignC(&dbiblastGTmpGi,  "");
1190     ajStrAssignC(&dbiblastGTmpDb,  "");
1191 
1192     ajFmtPrintS(&dbiblastGTmpT, ">%S", line);
1193 
1194     if(!ajSeqParseNcbi(dbiblastGTmpT, myid, &dbiblastGTmpAc, &dbiblastGTmpSv,
1195                        &dbiblastGTmpGi, &dbiblastGTmpDb, &dbiblastGTmpDes))
1196 	return ajFalse;
1197 
1198     if(ajStrGetLen(dbiblastGTmpAc))
1199 	ajStrFmtUpper(&dbiblastGTmpAc);
1200 
1201     if(accfield >= 0)
1202 	embDbiMaxlen(&dbiblastGTmpAc, &maxFieldLen[accfield]);
1203 
1204     if(svnfield >= 0)
1205     {
1206 	embDbiMaxlen(&dbiblastGTmpSv, &maxFieldLen[svnfield]);
1207 	embDbiMaxlen(&dbiblastGTmpGi, &maxFieldLen[svnfield]);
1208     }
1209 
1210 
1211     ajStrFmtUpper(myid);
1212 
1213     /* ajDebug("parseNCBI success\n"); */
1214 
1215     if(systemsort)
1216     {
1217 	if(accfield >= 0 && ajStrGetLen(dbiblastGTmpAc))
1218 	{
1219 	    countfield[accfield]++;
1220 	    ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, dbiblastGTmpAc);
1221 	}
1222 	if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpSv))
1223 	{
1224 	    countfield[svnfield]++;
1225 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiblastGTmpSv);
1226 	}
1227 	if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpGi))
1228 	{
1229 	    countfield[svnfield]++;
1230 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiblastGTmpGi);
1231 	}
1232 	if(desfield >= 0 && ajStrGetLen(dbiblastGTmpDes))
1233 	    while(ajRegExec(dbiblastGWrdexp, dbiblastGTmpDes))
1234 	    {
1235 		ajRegSubI(dbiblastGWrdexp, 1, &dbiblastGTmpFd);
1236 		embDbiMaxlen(&dbiblastGTmpFd, &maxFieldLen[desfield]);
1237 		ajStrFmtUpper(&dbiblastGTmpFd);
1238 		ajDebug("++des '%S'\n", dbiblastGTmpFd);
1239 		countfield[desfield]++;
1240 		ajFmtPrintF(alistfile[desfield], "%S %S\n",
1241                             *myid, dbiblastGTmpFd);
1242 		ajRegPost(dbiblastGWrdexp, &dbiblastGTmpDes);
1243 	    }
1244     }
1245     else
1246     {
1247         if(accfield >= 0 && ajStrGetLen(dbiblastGTmpAc))
1248 	{
1249 	    fd = ajCharNewS(dbiblastGTmpAc);
1250 	    countfield[accfield]++;
1251 	    ajListPushAppend(fdlist[accfield], fd);
1252 	}
1253 
1254         if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpSv))
1255 	{
1256 	    fd = ajCharNewS(dbiblastGTmpSv);
1257 	    countfield[svnfield]++;
1258 	    ajListPushAppend(fdlist[svnfield], fd);
1259 	}
1260 
1261         if(svnfield >= 0 && ajStrGetLen(dbiblastGTmpGi))
1262 	{
1263 	    fd = ajCharNewS(dbiblastGTmpGi);
1264 	    ajListPushAppend(fdlist[svnfield], fd);
1265 	}
1266 
1267         if(desfield >= 0 && ajStrGetLen(dbiblastGTmpDes))
1268 	{
1269 	    while(ajRegExec(dbiblastGWrdexp, dbiblastGTmpDes))
1270 	    {
1271 		ajRegSubI(dbiblastGWrdexp, 1, &dbiblastGTmpFd);
1272 		embDbiMaxlen(&dbiblastGTmpFd, &maxFieldLen[desfield]);
1273 		ajStrFmtUpper(&dbiblastGTmpFd);
1274 		ajDebug("++des '%S'\n", dbiblastGTmpFd);
1275 		fd = ajCharNewS(dbiblastGTmpFd);
1276 		countfield[desfield]++;
1277 		ajListPushAppend(fdlist[desfield], fd);
1278 		ajRegPost(dbiblastGWrdexp, &dbiblastGTmpDes);
1279 	    }
1280 	}
1281     }
1282 
1283     /* ajDebug("parseNCBI '%S' '%S'\n", *myid, dbiblastGTmpAc); */
1284 
1285     return ajTrue;
1286 }
1287 
1288 
1289 
1290 
1291 /* @funcstatic dbiblast_parseGcg **********************************************
1292 **
1293 ** Parses a GCG style header from the BLAST header table.
1294 **
1295 ** @param [r] line [const AjPStr] Input line
1296 ** @param [u] alistfile [AjPFile *] field data files array
1297 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1298 ** @param [r] fields [AjPStr const *] Field names to be indexed
1299 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1300 ** @param [w] countfield [ajuint*] Number of tokens for each field
1301 ** @param [w] myid [AjPStr*] ID
1302 ** @param [w] myfdl [AjPList *] Accession number list
1303 ** @return [AjBool] ajTrue on success
1304 ** @@
1305 ******************************************************************************/
1306 
dbiblast_parseGcg(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1307 static AjBool dbiblast_parseGcg(const AjPStr line, AjPFile * alistfile,
1308 				AjBool systemsort, AjPStr const * fields,
1309 				ajint* maxFieldLen,
1310 				ajuint* countfield,
1311 				AjPStr* myid, AjPList* myfdl)
1312 {
1313     static AjPRegexp idexp = NULL;
1314     static AjPStr mytmpac    = NULL;
1315     char* ac;
1316     static ajint numFields;
1317     static ajint accfield = -1;
1318     static AjBool reset = AJTRUE;
1319 
1320     if(!fields)
1321     {
1322 	reset = ajTrue;
1323 	accfield = -1;
1324 	return ajFalse;
1325     }
1326 
1327     if(reset)
1328     {
1329 	numFields = 0;
1330 	while(fields[numFields])
1331 	{
1332 	    countfield[numFields]=0;
1333 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1334 		accfield=numFields;
1335 	    else if(!ajStrMatchCaseC(fields[numFields], "sv") &&
1336 		    !ajStrMatchCaseC(fields[numFields], "des"))
1337 		ajWarn("GCG ID parsing unknown field '%S' ignored",
1338 		       fields[numFields]);
1339 
1340 	    numFields++;
1341 	}
1342 	reset = ajFalse;
1343     }
1344 
1345 
1346     if(!idexp)
1347 	idexp = ajRegCompC("^[^:]+:([^ ]+)( +([A-Za-z][A-Za-z0-9]+[0-9]))");
1348 
1349     if(!ajRegExec(idexp, line))
1350 	return ajFalse;
1351 
1352     ajRegSubI(idexp, 1, myid);
1353     ajRegSubI(idexp, 3, &mytmpac);
1354     ajStrFmtUpper(myid);
1355     ajStrFmtUpper(&mytmpac); /* GCG mixes case on new SwissProt acnums */
1356 
1357     if(accfield >= 0)
1358     {
1359         embDbiMaxlen(&mytmpac, &maxFieldLen[accfield]);
1360 
1361 	countfield[accfield]++;
1362 	if(systemsort)
1363 	    ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, mytmpac);
1364 	else
1365 	{
1366 	    ac = ajCharNewS(mytmpac);
1367 	    ajListPushAppend(myfdl[accfield], ac);
1368 	}
1369     }
1370 
1371     ajDebug("parseGCG '%S' '%S'\n", *myid, mytmpac);
1372 
1373     return ajTrue;
1374 }
1375 
1376 
1377 
1378 
1379 /* @funcstatic dbiblast_parseSimple *******************************************
1380 **
1381 ** Parses a plain header from the BLAST header table.
1382 **
1383 ** @param [r] line [const AjPStr] Input line
1384 ** @param [u] alistfile [AjPFile *] field data files array
1385 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1386 ** @param [r] fields [AjPStr const *] Field names to be indexed
1387 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1388 ** @param [w] countfield [ajuint*] Number of tokens for each field
1389 ** @param [w] myid [AjPStr*] ID
1390 ** @param [w] myfdl [AjPList*] Accession number list
1391 ** @return [AjBool] ajTrue on success
1392 ** @@
1393 ******************************************************************************/
1394 
dbiblast_parseSimple(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1395 static AjBool dbiblast_parseSimple(const AjPStr line,
1396 				   AjPFile * alistfile,
1397 				   AjBool systemsort, AjPStr const * fields,
1398 				   ajint* maxFieldLen,
1399 				   ajuint* countfield,
1400 				   AjPStr* myid,
1401 				   AjPList* myfdl)
1402 {
1403     static AjPRegexp idexp = NULL;
1404     static AjPStr mytmpac    = NULL;
1405     char* ac;
1406     static ajint numFields;
1407     static ajint accfield = -1;
1408     static AjBool reset = AJTRUE;
1409 
1410     if(!fields)
1411     {
1412 	reset = ajTrue;
1413 	accfield = -1;
1414 	return ajFalse;
1415     }
1416 
1417     if(reset)
1418     {
1419 	numFields = 0;
1420 	while(fields[numFields])
1421 	{
1422 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1423 		accfield=numFields;
1424 	    else if(!ajStrMatchCaseC(fields[numFields], "sv") &&
1425 		    !ajStrMatchCaseC(fields[numFields], "des"))
1426 		ajWarn("Simple ID parsing unknown field '%S' ignored",
1427 		       fields[numFields]);
1428 	    numFields++;
1429 	}
1430 	reset = ajFalse;
1431     }
1432 
1433 
1434     if(!idexp)
1435 	idexp = ajRegCompC("^([^ ]+)( +([A-Za-z][A-Za-z0-9]+[0-9]))");
1436 
1437     if(!ajRegExec(idexp, line))
1438 	return ajFalse;
1439 
1440     ajRegSubI(idexp, 1, myid);
1441     ajRegSubI(idexp, 3, &mytmpac);
1442     ajStrFmtUpper(myid);
1443     ajStrFmtUpper(&mytmpac); /* GCG mixes case on new SwissProt acnums */
1444 
1445     if(accfield >= 0)
1446     {
1447         embDbiMaxlen(&mytmpac, &maxFieldLen[accfield]);
1448 	countfield[accfield]++;
1449 	if(systemsort)
1450 	    ajFmtPrintF(alistfile[accfield], "%S %S\n", *myid, mytmpac);
1451 	else
1452 	{
1453 	    ac = ajCharNewS(mytmpac);
1454 	    ajListPushAppend(myfdl[accfield], ac);
1455 	}
1456     }
1457 
1458     ajDebug("parseSimple '%S' '%S'\n", *myid, mytmpac);
1459 
1460     return ajTrue;
1461 }
1462 
1463 
1464 
1465 
1466 /* @funcstatic dbiblast_parseId ***********************************************
1467 **
1468 ** Parses a simple FASTA ID from the BLAST header table.
1469 **
1470 ** @param [r] line [const AjPStr] Input line
1471 ** @param [u] alistfile [AjPFile *] field data files array
1472 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1473 ** @param [r] fields [AjPStr const *] Field names to be indexed
1474 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1475 ** @param [w] countfield [ajuint*] Number of tokens for each field
1476 ** @param [w] myid [AjPStr*] ID
1477 ** @param [w] myfdl [AjPList*] Accession number list
1478 ** @return [AjBool] ajTrue on success
1479 ** @@
1480 ******************************************************************************/
1481 
dbiblast_parseId(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1482 static AjBool dbiblast_parseId(const AjPStr line, AjPFile * alistfile,
1483 			       AjBool systemsort, AjPStr const * fields,
1484 			       ajint* maxFieldLen,
1485 			       ajuint* countfield,
1486 			       AjPStr* myid,
1487 			       AjPList * myfdl)
1488 {
1489     static AjPRegexp idexp = NULL;
1490     static AjBool reset = AJTRUE;
1491 
1492     (void) alistfile;
1493     (void) systemsort;
1494     (void) maxFieldLen;
1495     (void) countfield;
1496     (void) myfdl;
1497 
1498     if(!fields)
1499     {
1500 	reset = ajTrue;
1501 	return ajFalse;
1502     }
1503 
1504     if(reset)
1505     {
1506 	reset = ajFalse;
1507     }
1508 
1509 
1510     if(!idexp)
1511 	idexp = ajRegCompC("^([^ ]+)");
1512 
1513     if(!ajRegExec(idexp, line))
1514 	return ajFalse;
1515 
1516     ajRegSubI(idexp, 1, myid);
1517     ajStrFmtUpper(myid);
1518 
1519     ajDebug("parseId '%S'\n", *myid);
1520 
1521     return ajTrue;
1522 }
1523 
1524 
1525 
1526 
1527 /* @funcstatic dbiblast_parseUnknown ******************************************
1528 **
1529 ** Parses an unknown type ID from the BLAST header table.
1530 **
1531 ** @param [r] line [const AjPStr] Input line
1532 ** @param [u] alistfile [AjPFile *] field data files array
1533 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1534 ** @param [r] fields [AjPStr const *] Field names to be indexed
1535 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1536 ** @param [w] countfield [ajuint*] Number of tokens for each field
1537 ** @param [w] myid [AjPStr*] ID
1538 ** @param [w] myfdl [AjPList*] Accession number list
1539 ** @return [AjBool] ajTrue on success
1540 ** @@
1541 ******************************************************************************/
1542 
dbiblast_parseUnknown(const AjPStr line,AjPFile * alistfile,AjBool systemsort,AjPStr const * fields,ajint * maxFieldLen,ajuint * countfield,AjPStr * myid,AjPList * myfdl)1543 static AjBool dbiblast_parseUnknown(const AjPStr line,
1544 				    AjPFile * alistfile,
1545 				    AjBool systemsort, AjPStr const * fields,
1546 				    ajint* maxFieldLen,
1547 				    ajuint* countfield,
1548 				    AjPStr* myid,
1549 				    AjPList* myfdl)
1550 {
1551     static ajint called = 0;
1552     static AjBool reset = AJTRUE;
1553 
1554     (void) line;
1555     (void) alistfile;
1556     (void) systemsort;
1557     (void) maxFieldLen;
1558     (void) countfield;
1559     (void) myid;
1560     (void) myfdl;
1561 
1562     if(!fields)
1563     {
1564 	reset = ajTrue;
1565 	return ajFalse;
1566     }
1567 
1568     if(reset)
1569     {
1570 	reset = ajFalse;
1571     }
1572 
1573 
1574     if(!called)			/* first time - find out the format */
1575 	called = 1;
1576 
1577     return ajFalse;
1578 }
1579 
1580 
1581 
1582 
1583 /* @funcstatic dbiblast_memreadUInt4 ******************************************
1584 **
1585 ** Reads a 4 byte unsigned integer from a (possibly memory mapped)
1586 ** binary file, with the correct byte orientation
1587 **
1588 ** @param [u] fd [PMemFile] Input file
1589 ** @param [w] val [ajuint *] Unsigned integer
1590 ** @return [void]
1591 ** @@
1592 ******************************************************************************/
1593 
dbiblast_memreadUInt4(PMemFile fd,ajuint * val)1594 static void dbiblast_memreadUInt4(PMemFile fd, ajuint *val)
1595 {
1596 
1597     dbiblast_memfread((char *)val,(size_t)4,(size_t)1,fd);
1598     if(readReverse)
1599 	ajByteRevLen4((ajint *)val);
1600 
1601     return;
1602 }
1603 
1604 
1605 
1606 
1607 /* @funcstatic dbiblast_memfreadS *********************************************
1608 **
1609 ** Reads a string from a (possibly memory mapped)
1610 ** binary file, with the correct byte orientation
1611 **
1612 ** @param [w] dest [AjPStr*] Output string, must be already the right size
1613 ** @param [r] size [size_t] Size of string (1)
1614 ** @param [r] num_items [size_t] Number of bytes
1615 ** @param [u] mf [PMemFile] Input file
1616 ** @return [size_t] fread return code
1617 ** @@
1618 ******************************************************************************/
1619 
dbiblast_memfreadS(AjPStr * dest,size_t size,size_t num_items,PMemFile mf)1620 static size_t dbiblast_memfreadS(AjPStr* dest, size_t size, size_t num_items,
1621 				 PMemFile mf)
1622 {
1623 
1624     return dbiblast_memfread(ajStrGetuniquePtr(dest), size, num_items, mf);
1625 }
1626 
1627 
1628 
1629 
1630 /* @funcstatic dbiblast_memfseek **********************************************
1631 **
1632 ** fseek in a (possibly memory mapped) binary file
1633 **
1634 ** @param [u] mf [PMemFile] Input file
1635 ** @param [r] offset [ajlong] Offset in file
1636 ** @param [r] whence [ajint] Start of offset, as defined for 'fseek'
1637 ** @return [size_t] Result of 'fseek'
1638 ** @@
1639 ******************************************************************************/
1640 
dbiblast_memfseek(PMemFile mf,ajlong offset,ajint whence)1641 static size_t dbiblast_memfseek(PMemFile mf, ajlong offset, ajint whence)
1642 {
1643 
1644     if(mf->IsMem)
1645     {					/* memory mapped */
1646 	switch(whence)
1647 	{
1648 	case 0:
1649 	    mf->Pos = offset;
1650 	    break;
1651 	case 1:
1652 	    mf->Pos += offset;
1653 	    break;
1654 	case 2:
1655 	    mf->Pos = mf->Size + offset;
1656 	    break;
1657 	default:
1658 	    ajErr("invalid memfseek code %d", whence);
1659 	    embExitBad();
1660 	}
1661 	if(mf->Pos > mf->Size)
1662 	    mf->Pos = mf->Size;
1663 	if(mf->Pos < 0)
1664 	    mf->Pos = 0;
1665 	return 0;
1666     }
1667 
1668     return ajFileSeek( mf->File, offset, whence);
1669 }
1670 
1671 
1672 
1673 
1674 /* @funcstatic dbiblast_memfread **********************************************
1675 **
1676 ** fread in a (possibly memory mapped) binary file
1677 **
1678 ** @param [w] dest [void*] Output text string
1679 ** @param [r] size [size_t] Size of string (1)
1680 ** @param [r] num_items [size_t] Number of bytes
1681 ** @param [u] mf [PMemFile] Input file
1682 ** @return [size_t] Result of 'fread'
1683 ** @@
1684 ******************************************************************************/
1685 
dbiblast_memfread(void * dest,size_t size,size_t num_items,PMemFile mf)1686 static size_t dbiblast_memfread(void* dest, size_t size, size_t num_items,
1687 				PMemFile mf)
1688 {
1689     size_t i;
1690 
1691     if(mf->IsMem)
1692     {					/* memory mapped */
1693 	i = size * num_items;
1694 	memcpy(dest, &mf->Mem[mf->Pos], i);
1695 	mf->Pos += (ajlong) i;
1696 	return i;
1697     }
1698 
1699     return ajReadbinBinary(mf->File, num_items, size, dest);
1700 }
1701 
1702 
1703 
1704 
1705 /* @funcstatic dbiblast_newname ***********************************************
1706 **
1707 ** Generate a new filename with a different suffix.
1708 **
1709 ** @param [w] nname [AjPStr*] New filename
1710 ** @param [r] oname [const AjPStr] Original file name
1711 ** @param [r] suff [const char*] New suffix
1712 ** @return [void]
1713 ** @@
1714 ******************************************************************************/
1715 
dbiblast_newname(AjPStr * nname,const AjPStr oname,const char * suff)1716 static void dbiblast_newname(AjPStr* nname, const AjPStr oname,
1717 			     const char *suff)
1718 {
1719 
1720     ajStrAssignS(nname, oname);
1721     if(ajStrGetCharFirst(oname)=='@')
1722 	ajStrCutStart(nname, 1);
1723     ajStrAppendK(nname, '.');
1724     ajStrAppendC(nname, suff);
1725 
1726     return;
1727 }
1728 
1729 
1730 
1731 
1732 /* @funcstatic dbiblast_dbname ************************************************
1733 **
1734 ** Generate the database name (original fasta file name)
1735 ** by stripping off the suffix
1736 **
1737 ** @param [w] dbname [AjPStr*] Database filename
1738 ** @param [r] oname [const AjPStr] Original file name
1739 ** @param [r] suff [const char*] New suffix
1740 ** @return [void]
1741 ** @@
1742 ******************************************************************************/
1743 
dbiblast_dbname(AjPStr * dbname,const AjPStr oname,const char * suff)1744 static void dbiblast_dbname(AjPStr* dbname,
1745 			    const AjPStr oname, const char *suff)
1746 {
1747     AjPStr suffix = NULL;
1748 
1749     ajFmtPrintS(&suffix, ".%s", suff);
1750 
1751     ajStrAssignS(dbname, oname);
1752 
1753     if(ajStrGetCharFirst(oname)=='@')
1754 	ajStrCutStart(dbname, 1);
1755 
1756     if(!ajStrSuffixS(*dbname, suffix))
1757     {
1758 	ajStrDel(&suffix);
1759 	return;
1760     }
1761 
1762     ajStrCutEnd(dbname, ajStrGetLen(suffix));
1763 
1764     ajStrDel(&suffix);
1765 
1766     return;
1767 }
1768 
1769 
1770 
1771 
1772 /* @funcstatic dbiblast_wrongtype *********************************************
1773 **
1774 ** Tests for the other database filenames in case the user asked
1775 ** for "*.*". Used to test we have the *.suff file before opening all files.
1776 **
1777 ** @param [r] oname [const AjPStr] Original file name
1778 ** @param [r] suff [const char*] Required suffix
1779 ** @return [AjBool] ajTrue if any other filename suffix is recognized
1780 ** @@
1781 ******************************************************************************/
1782 
dbiblast_wrongtype(const AjPStr oname,const char * suff)1783 static AjBool dbiblast_wrongtype(const AjPStr oname, const char *suff)
1784 {
1785     ajint itype;
1786 
1787     for(itype=0; blasttypes[itype].ExtT; itype++)
1788     {
1789 	if(strcmp(suff, blasttypes[itype].ExtT))
1790 	    if(ajStrSuffixC(oname, blasttypes[itype].ExtT))
1791 		return ajTrue;
1792 
1793 	if(strcmp(suff, blasttypes[itype].ExtH))
1794 	    if(ajStrSuffixC(oname, blasttypes[itype].ExtH))
1795 		return ajTrue;
1796 
1797 	if(strcmp(suff, blasttypes[itype].ExtS))
1798 	    if(ajStrSuffixC(oname, blasttypes[itype].ExtS))
1799 		return ajTrue;
1800     }
1801 
1802     return ajFalse;
1803 }
1804 
1805 
1806 
1807 
1808 /* @funcstatic dbiblast_memfclosefile *****************************************
1809 **
1810 ** Close a (possibly memory mapped) binary file
1811 **
1812 ** @param [d] pfd [PMemFile*] File
1813 ** @return [void]
1814 ** @@
1815 ******************************************************************************/
1816 
dbiblast_memfclosefile(PMemFile * pfd)1817 static void dbiblast_memfclosefile(PMemFile* pfd)
1818 {
1819     PMemFile fd;
1820 
1821     if(!pfd)
1822 	return;
1823 
1824     if(!*pfd)
1825 	return;
1826 
1827     fd = *pfd;
1828     ajFileClose(&fd->File);
1829     ajStrDel(&fd->Name);
1830 
1831     AJFREE(*pfd);
1832 
1833     return;
1834 }
1835 
1836 
1837 
1838 
1839 /* @funcstatic dbiblast_memfopenfile ******************************************
1840 **
1841 ** Open a (possibly memory mapped) binary file
1842 **
1843 ** @param [r] name [const AjPStr] File name
1844 ** @return [PMemFile] Memory mapped file object created
1845 ** @@
1846 ******************************************************************************/
1847 
dbiblast_memfopenfile(const AjPStr name)1848 static PMemFile dbiblast_memfopenfile(const AjPStr name)
1849 {
1850     PMemFile ret;
1851     AjPFile fp;
1852 
1853     fp = ajFileNewInNameS(name);
1854     if(!fp)
1855 	return NULL;
1856 
1857     AJNEW0(ret);
1858 
1859     ajStrAssignS(&ret->Name, name);
1860     ret->IsMem = 0;
1861     ret->File  = fp;
1862     ret->Size  = 0;
1863     ret->Mem   = NULL;
1864 
1865     ajDebug("fopened '%S'\n", name);
1866 
1867     return ret;
1868 }
1869 
1870 
1871 
1872 
1873 /* @funcstatic dbiblast_loadtable *********************************************
1874 **
1875 ** Load part of the BLAST binary table into memory
1876 **
1877 ** @param [w] table [ajuint*] table array to be read
1878 ** @param [r] isize [ajint] Number of elements to read
1879 ** @param [u] db [PBlastDb] Blast database structure
1880 ** @param [r] top [ajint] Byte offset for start of table
1881 ** @param [r] pos [ajint] Current element number in table.
1882 ** @return [ajint] Number of elements read.
1883 ** @@
1884 ******************************************************************************/
1885 
dbiblast_loadtable(ajuint * table,ajint isize,PBlastDb db,ajint top,ajint pos)1886 static ajint dbiblast_loadtable(ajuint* table, ajint isize, PBlastDb db,
1887 				ajint top, ajint pos)
1888 {
1889     ajint i;
1890     ajint j;
1891     ajint imax;
1892 
1893     imax = pos + isize;
1894     if(imax > (db->Size+1))
1895 	imax = db->Size+1;
1896 
1897     ajDebug("loadtable size %d top %d db->Size %d pos %d imax %d\n",
1898 	    isize, top, db->Size, pos, imax);
1899 
1900     dbiblast_memfseek(db->TFile, top + 4*(pos), 0);
1901     j = 0;
1902 
1903     for(i=pos; i<=imax; i++)
1904     {
1905 	/* ajDebug("reading at %d\n", ajFileResetPos(db->TFile->File));*/
1906 	dbiblast_memreadUInt4(db->TFile,&table[j++]);
1907 	/* ajDebug("read i: %d j: %d value: %d\n", i, j-1, table[j-1]);*/
1908     }
1909 
1910     return imax - pos + 1;
1911 }
1912 
1913 
1914 
1915 
1916 /* @funcstatic dbiblast_ncblreadhdr *******************************************
1917 **
1918 ** Read the FASTA header line for one entry
1919 **
1920 ** @param [w] hdrline [AjPStr*] Header line
1921 ** @param [u] db [PBlastDb] Blast database structure
1922 ** @param [r] start [ajint] Byte offset for start of header
1923 ** @param [r] end [ajint] Byte offset for end of header
1924 ** @return [ajint] Number of bytes read.
1925 ** @@
1926 ******************************************************************************/
1927 
dbiblast_ncblreadhdr(AjPStr * hdrline,PBlastDb db,ajint start,ajint end)1928 static ajint dbiblast_ncblreadhdr(AjPStr* hdrline, PBlastDb db, ajint start,
1929 				  ajint end)
1930 {
1931     ajint size;
1932     ajint llen;
1933     PMemFile hfp;
1934 
1935     size = ajStrGetRes(*hdrline);
1936     hfp  = db->HFile;
1937 
1938     if(end)
1939     {
1940 	llen = end - start;
1941 
1942 	if(db->IsBlast2)
1943 	    llen += 1;
1944 
1945 	if(llen > size)
1946 	    llen = size;
1947     }
1948     else
1949 	llen = size;
1950 
1951     /*ajDebug("ncblreadhdr start %d end %d llen %d\n", start, end, llen);*/
1952 
1953     if(db->IsBlast2)
1954     {
1955 	dbiblast_memfseek(hfp,start,0);
1956 	dbiblast_memfreadS(hdrline,(size_t)1,(size_t)(llen-1),hfp);
1957     }
1958     else
1959     {
1960 	dbiblast_memfseek(hfp,start+1,0); /* skip the '>' character */
1961 	dbiblast_memfreadS(hdrline,(size_t)1,(size_t)(llen-1),hfp);
1962     }
1963 
1964     ajStrSetValidLen(hdrline, (llen-1));
1965 
1966     return llen;
1967 }
1968