1 /* @source dbiflat application
2 **
3 ** Index flatfile databases
4 **
5 ** @author Copyright (C) Peter Rice, Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 /******************************************************************************
24 **
25 ** EMBOSS/Staden/EMBLCD indexing
26 **
27 ** This version reads a flat file database,
28 ** and writes entryname and field (e.g. accession) index files.
29 **
30 ** It needs to know the format in order to
31 ** parse the entryname and accession number.
32 **
33 ** To save memory, it is also helpful to know the maximum number of
34 ** entries in the database and the maximum entryname length so that
35 ** space can be preallocated for storage.
36 **
37 ** Entry names and accession numbers are held in list structures,
38 ** then converted to arrays and sorted.
39 **
40 ** Multiple input files are allowed.
41 **
42 ** EMBLCD and Staden index files use different names but have essentially
43 ** the same contents.
44 **
45 ******************************************************************************/
46 
47 #include "emboss.h"
48 
/* Line-type codes assigned by the parsers to each input line after its
** leading line code has been recognised. FLATTYPE_OTHER marks lines that
** carry no indexed data. */
#define FLATTYPE_OTHER 0	/* any line that is not indexed */
#define FLATTYPE_ID 1		/* entry identifier line */
#define FLATTYPE_ACC 2		/* accession number line */
#define FLATTYPE_DES 3		/* description line */
#define FLATTYPE_KEY 4		/* keyword line */
#define FLATTYPE_TAX 5		/* organism / taxonomy line */
#define FLATTYPE_VER 6		/* sequence version line */
56 
/* Definition of global variables */
58 
/* Scratch strings reused across calls so they are allocated only once;
** all are released at the end of main. */
static AjPStr dbiflatGRline   = NULL;	/* line most recently read */
static AjPStr dbiflatGTmpId   = NULL;	/* ID of the entry being parsed */
static AjPStr dbiflatGTmpStr  = NULL;	/* general scratch (regex remainder) */
static AjPStr dbiflatGTmpLine = NULL;	/* line text after the line code */
static AjPStr dbiflatGTmpFd   = NULL;	/* field token being stored */
static AjPStr dbiflatGTypStr  = NULL;	/* line code matched on a line */

/* Regular expressions for EMBL-style lines; compiled on first use by the
** parsers that need them and released at the end of main. */
static AjPRegexp dbiflatGRegEmblType = NULL;
static AjPRegexp dbiflatGRegEmblId   = NULL;
static AjPRegexp dbiflatGRegEmblAcc  = NULL;
static AjPRegexp dbiflatGRegEmblWrd  = NULL;
static AjPRegexp dbiflatGRegEmblVer  = NULL;
static AjPRegexp dbiflatGRegEmblPhr  = NULL;
static AjPRegexp dbiflatGRegEmblTax  = NULL;
static AjPRegexp dbiflatGRegEmblEnd  = NULL;

/* Regular expressions named for the GenBank parser (presumably compiled
** in dbiflat_ParseGenbank - confirm against that function). */
static AjPRegexp dbiflatGRegGbType = NULL;
static AjPRegexp dbiflatGRegGbMore = NULL;
static AjPRegexp dbiflatGRegGbWrd = NULL;
static AjPRegexp dbiflatGRegGbPhr = NULL;
static AjPRegexp dbiflatGRegGbTax = NULL;
static AjPRegexp dbiflatGRegGbVer = NULL;
static AjPRegexp dbiflatGRegGbEnd = NULL;

/* Regular expressions named for the RefSeq parser (presumably compiled
** in dbiflat_ParseRefseq - confirm against that function). */
static AjPRegexp dbiflatGRegRefseqTyp = NULL;
static AjPRegexp dbiflatGRegRefseqMore = NULL;
static AjPRegexp dbiflatGRegRefseqId = NULL;
static AjPRegexp dbiflatGRegRefseqWrd = NULL;
static AjPRegexp dbiflatGRegRefseqPhr = NULL;
static AjPRegexp dbiflatGRegRefseqTax = NULL;
static AjPRegexp dbiflatGRegRefseqVer = NULL;
static AjPRegexp dbiflatGRegRefseqEnd = NULL;

/* Entry object filled in and returned by dbiflat_NextFlatEntry */
static EmbPEntry dbiflatGEntry = NULL;

/* One token list per indexed field, created on the first call to
** dbiflat_NextFlatEntry and handed to the parser on every call */
static AjPList* dbiflatGFdl = NULL;
95 
/* Format-specific entry parsers. All share the signature stored in
** DbiflatOParser so that they can be dispatched through the parser[]
** table. The last parameter is named myfdl in the definitions. */
static AjBool dbiflat_ParseSwiss(AjPFile libr, AjPFile* alistfile,
                                 AjBool systemsort, AjPStr* fields,
                                 ajint* maxFieldLen, ajuint* countfield,
                                 ajint *dpos, AjPStr* myid, AjPList* acl);
static AjBool dbiflat_ParseEmbl(AjPFile libr, AjPFile* alistfile,
				AjBool systemsort, AjPStr* fields,
				ajint* maxFieldLen, ajuint* countfield,
				ajint *dpos, AjPStr* myid, AjPList* acl);
static AjBool dbiflat_ParseGenbank(AjPFile libr, AjPFile* alistfile,
				   AjBool systemsort, AjPStr* fields,
				   ajint* maxFieldLen, ajuint* countfield,
				   ajint *dpos, AjPStr* myid, AjPList* acl);
static AjBool dbiflat_ParseRefseq(AjPFile libr, AjPFile* alistfile,
				  AjBool systemsort, AjPStr* fields,
				  ajint* maxFieldLen, ajuint* countfield,
				  ajint *dpos, AjPStr* myid, AjPList* acl);
112 
113 
114 
115 
116 /* @datastatic DbiflatPParser *************************************************
117 **
118 ** Parser definition structure
119 **
120 ** @alias DbiflatSParser
121 ** @alias DbiflatOParser
122 **
123 ** @attr Name [const char*] Parser name
124 ** @attr Parser [AjBool function] Parser function
125 ** @@
126 ******************************************************************************/
127 
/* One row of the parser dispatch table: a format name and the function
** that parses one entry of that format. */
typedef struct DbiflatSParser
{
    const char* Name;		/* name compared against the idformat option */
    AjBool (*Parser) (AjPFile libr, AjPFile* alistfile,
		      AjBool systemsort, AjPStr* fields,
		      ajint* maxFieldLen, ajuint* countfield,
		      ajint *dpos, AjPStr* myid, AjPList* acl);
} DbiflatOParser;
/* Pointer alias is a macro, not a typedef: "DbiflatPParser a, b;" would
** declare only a as a pointer. */
#define DbiflatPParser DbiflatOParser*
137 
138 
139 
140 
/* Known idformat values and their parsers. dbiflat_NextFlatEntry scans
** this table until it reaches the entry whose Name is NULL, so the
** terminating {NULL, NULL} row must stay last. */
static DbiflatOParser parser[] =
{
    {"EMBL", dbiflat_ParseEmbl},
    {"SWISS", dbiflat_ParseSwiss},
    {"GB", dbiflat_ParseGenbank},
    {"REFSEQ", dbiflat_ParseRefseq},
    {NULL, NULL}
};
149 
150 
151 
/* Forward declaration: main calls this before its definition appears. */
static EmbPEntry dbiflat_NextFlatEntry(AjPFile libr, ajuint ifile,
				       const AjPStr idformat,
				       AjBool systemsort,
				       AjPStr* fields, ajint* maxFieldLen,
				       ajuint* maxidlen, ajuint* countfield,
				       AjPFile elistfile, AjPFile* alistfile);
158 
159 
160 
161 
162 
163 /* @prog dbiflat **************************************************************
164 **
165 ** Index a flat file database
166 **
167 ******************************************************************************/
168 
main(int argc,char ** argv)169 int main(int argc, char **argv)
170 {
171 
172     AjPList idlist;
173     AjPList* fieldList = NULL;
174 
175     AjBool systemsort;
176     AjBool cleanup;
177 
178     ajuint maxindex;
179     ajuint maxidlen = 0;
180     ajuint maxlen;
181 
182     AjPFile elistfile  = NULL;
183     AjPFile* alistfile = NULL;
184 
185     AjPStr dbname   = NULL;
186     AjPStr release  = NULL;
187     AjPStr datestr  = NULL;
188     AjPStr sortopt  = NULL;
189     void **entryIds = NULL;
190 
191     AjPStr directory;
192     AjPStr indexdir;
193     AjPStr filename;
194     AjPStr exclude;
195     AjPStr curfilename = NULL;
196     AjPFile libr = NULL;
197     AjPStr idformat = NULL;
198 
199     EmbPEntry entry;
200 
201     ajuint idCount = 0;
202     ajuint idDone;
203     AjPList listInputFiles = NULL;
204     void ** inputFiles = NULL;
205     ajuint nfiles;
206     ajuint ifile;
207 
208     ajuint filesize;
209     short recsize;
210     ajuint maxfilelen = 20;
211     char date[4] =
212     {
213 	0,0,0,0
214     };
215 
216     AjPStr tmpfname = NULL;
217     AjPStr* fields  = NULL;
218 
219     AjPFile entFile = NULL;
220 
221     AjPStr* divfiles   = NULL;
222     ajint* maxFieldLen = NULL;
223 
224     ajuint ifield  = 0;
225     ajuint nfields = 0;
226 
227     AjPFile logfile = NULL;
228     ajuint* countField = NULL;
229     ajuint* fieldTot = NULL;
230     ajuint idCountFile = 0;
231     ajuint i;
232 
233     embInit("dbiflat", argc, argv);
234 
235     idformat   = ajAcdGetListSingle("idformat");
236     fields     = ajAcdGetList("fields");
237     directory  = ajAcdGetDirectoryName("directory");
238     indexdir   = ajAcdGetOutdirName("indexoutdir");
239     filename   = ajAcdGetString("filenames");
240     exclude    = ajAcdGetString("exclude");
241     dbname     = ajAcdGetString("dbname");
242     release    = ajAcdGetString("release");
243     datestr    = ajAcdGetString("date");
244     systemsort = ajAcdGetBoolean("systemsort");
245     cleanup    = ajAcdGetBoolean("cleanup");
246     sortopt    = ajAcdGetString("sortoptions");
247     maxindex   = ajAcdGetInt("maxindex");
248     logfile    = ajAcdGetOutfile("outfile");
249 
250     while(fields[nfields])		/* array ends with a NULL */
251 	nfields++;
252 
253     if(nfields)
254     {
255 	AJCNEW(maxFieldLen, nfields);
256 	AJCNEW0(countField, nfields);
257 	AJCNEW0(fieldTot, nfields);
258 	for(ifield=0; ifield < nfields; ifield++)
259 	    maxFieldLen[ifield] = (ajint)maxindex * -1; /* -maxindex illegal */
260 
261 	if(systemsort)
262 	    AJCNEW(alistfile, nfields);
263 	else
264 	{
265 	    AJCNEW(fieldList, nfields);
266 	    for(ifield=0; ifield < nfields; ifield++)
267 		fieldList[ifield] = ajListNew();
268 	}
269     }
270 
271     if(ajStrMatchC(datestr, "00/00/00"))
272 	ajFmtPrintS(&datestr, "%D", ajTimeRefTodayFmt("dbindex"));
273 
274     ajStrRemoveWhite(&dbname);		/* used for temp filenames */
275     embDbiDateSet(datestr, date);
276     idlist = ajListNew();
277 
278     ajDebug("reading '%S/%S'\n", directory, filename);
279     ajDebug("writing '%S/'\n", indexdir);
280 
281     listInputFiles = embDbiFileListExc(directory, filename, exclude);
282     ajListSort(listInputFiles, &ajStrVcmp);
283     nfiles = (ajuint) ajListToarray(listInputFiles, &inputFiles);
284     if(!nfiles)
285         ajDie("No input files in '%S' matched filename '%S'",
286               directory, filename);
287 
288     embDbiLogHeader(logfile, dbname, release, datestr,
289 		     indexdir, maxindex);
290 
291     embDbiLogFields(logfile, fields, nfields);
292     embDbiLogSource(logfile, directory, filename, exclude,
293 		    (AjPStr*) inputFiles, nfiles);
294     embDbiLogCmdline(logfile);
295 
296     AJCNEW0(divfiles, nfiles);
297 
298     /* process each input file, one at a time */
299     for(ifile=0; ifile < nfiles; ifile++)
300     {
301 	ajStrAssignS(&curfilename, (AjPStr) inputFiles[ifile]);
302 	embDbiFlatOpenlib(curfilename, &libr);
303 	ajFilenameTrimPath(&curfilename);
304 	if(ajStrGetLen(curfilename) >= maxfilelen)
305 	    maxfilelen = ajStrGetLen(curfilename) + 1;
306 
307 	ajDebug("processing file '%F' ...\n", libr);
308 	ajStrAssignS(&divfiles[ifile], curfilename);
309 
310 	if(systemsort)	 /* elistfile for entries, alist for fields */
311 	    elistfile = embDbiSortOpen(alistfile, ifile,
312 				       dbname, fields, nfields);
313 
314 	idCountFile = 0;
315 	for(i=0;i<nfields;i++)
316 	    countField[i] = 0;
317 	while((entry=dbiflat_NextFlatEntry(libr, ifile, idformat,
318 					   systemsort, fields, maxFieldLen,
319 					   &maxidlen, countField,
320 					   elistfile, alistfile)))
321 	{
322 	    idCountFile++;
323 
324 	    if(!systemsort)	    /* save the entry data in lists */
325 	    {
326 		embDbiMemEntry(idlist, fieldList, nfields, entry, ifile);
327 		entry = NULL;
328 	    }
329 	}
330 	idCount += idCountFile;
331 	if(systemsort)
332 	{
333 	    embDbiSortClose(&elistfile, alistfile, nfields);
334 	    AJFREE(entry);
335 	}
336 	else
337 	{
338 	    embDbiEntryDel(&dbiflatGEntry);
339 	}
340 	embDbiLogFile(logfile, curfilename, idCountFile, fields,
341 		      countField, nfields);
342     }
343 
344 
345     embDbiWriteDivision(indexdir, dbname, release, date,
346 			maxfilelen, nfiles, divfiles, NULL);
347 
348     /* Write the entryname.idx index */
349     ajStrAssignC(&tmpfname, "entrynam.idx");
350     entFile = ajFileNewOutNamePathS(tmpfname, indexdir);
351 
352     recsize = maxidlen+10;
353     filesize = 300 + (idCount*(ajint)recsize);
354     embDbiHeader(entFile, filesize, idCount, recsize, dbname, release, date);
355 
356     if(systemsort)
357         idDone = embDbiSortWriteEntry(entFile, maxidlen,
358 				      dbname, nfiles, cleanup, sortopt);
359     else			  /* save entries in entryIds array */
360     {
361         idDone = embDbiMemWriteEntry(entFile, maxidlen,
362 				     idlist, &entryIds);
363 	if(idDone != idCount)
364 	    ajFatal("Duplicates not allowed for in-memory processing");
365     }
366 
367     embDbiHeaderSize(entFile, 300+(idDone*(ajint)recsize), idDone);
368     ajFileClose(&entFile);
369 
370     /* Write the fields index files */
371     for(ifield=0; ifield < nfields; ifield++)
372     {
373         if(maxindex)
374 	    maxlen = maxindex;
375 	{
376 	    if(maxFieldLen[ifield] >= 0)
377 		maxlen = maxFieldLen[ifield];
378 	    else
379 		maxlen = - maxFieldLen[ifield];
380 	}
381 
382         if(systemsort)
383 	    fieldTot[ifield] = embDbiSortWriteFields(dbname, release,
384 						     date, indexdir,
385 						     fields[ifield], maxlen,
386 						     nfiles, idCount,
387 						     cleanup, sortopt);
388 	else
389 	    fieldTot[ifield] = embDbiMemWriteFields(dbname, release,
390 						    date, indexdir,
391 						    fields[ifield], maxlen,
392 						    fieldList[ifield],
393 						    entryIds);
394     }
395 
396     embDbiLogFinal(logfile,maxindex, maxFieldLen, fields, fieldTot,
397 		   nfields, nfiles, idDone, idCount);
398 
399     if(systemsort)
400     {
401 	embDbiRmEntryFile(dbname, cleanup);
402     }
403 
404     ajStrDel(&dbname);
405     ajStrDel(&release);
406     ajStrDel(&datestr);
407     ajStrDel(&sortopt);
408     ajStrDel(&filename);
409     ajStrDel(&exclude);
410     ajStrDel(&directory);
411     ajStrDel(&indexdir);
412     ajFileClose(&libr);
413     ajFileClose(&logfile);
414     ajStrDel(&idformat);
415     ajStrDelarray(&fields);
416 
417     ajStrDel(&tmpfname);
418     ajFileClose(&elistfile);
419 
420 
421     for(i=0;i<nfields;i++)
422     {
423 	if(systemsort)
424 	{
425 	    ajFileClose(&alistfile[i]);
426 	}
427 	else
428 	{
429 	    ajListMap(fieldList[i], &embDbiFieldDelMap, NULL);
430 	    ajListFree(&fieldList[i]);
431 	}
432     }
433 
434     AJFREE(alistfile);
435     AJFREE(fieldList);
436     AJFREE(maxFieldLen);
437     AJFREE(countField);
438     AJFREE(fieldTot);
439 
440     for(i=0;i<nfiles;i++)
441     {
442 	ajStrDel(&divfiles[i]);
443     }
444     AJFREE(divfiles);
445     AJFREE(inputFiles);
446 
447     ajRegFree(&dbiflatGRegEmblType);
448     ajRegFree(&dbiflatGRegEmblId);
449     ajRegFree(&dbiflatGRegEmblAcc);
450     ajRegFree(&dbiflatGRegEmblWrd);
451     ajRegFree(&dbiflatGRegEmblVer);
452     ajRegFree(&dbiflatGRegEmblPhr);
453     ajRegFree(&dbiflatGRegEmblTax);
454     ajRegFree(&dbiflatGRegEmblEnd);
455 
456     ajRegFree(&dbiflatGRegGbType);
457     ajRegFree(&dbiflatGRegGbMore);
458     ajRegFree(&dbiflatGRegGbWrd);
459     ajRegFree(&dbiflatGRegGbPhr);
460     ajRegFree(&dbiflatGRegGbTax);
461     ajRegFree(&dbiflatGRegGbVer);
462     ajRegFree(&dbiflatGRegGbEnd);
463 
464     ajRegFree(&dbiflatGRegRefseqTyp);
465     ajRegFree(&dbiflatGRegRefseqMore);
466     ajRegFree(&dbiflatGRegRefseqId);
467     ajRegFree(&dbiflatGRegRefseqWrd);
468     ajRegFree(&dbiflatGRegRefseqTax);
469     ajRegFree(&dbiflatGRegRefseqVer);
470     ajRegFree(&dbiflatGRegRefseqEnd);
471 
472     embDbiEntryDel(&dbiflatGEntry);
473 
474     ajStrDel(&dbiflatGRline);
475     ajStrDel(&dbiflatGTmpFd);
476     ajStrDel(&dbiflatGTmpLine);
477     ajStrDel(&dbiflatGTmpStr);
478     ajStrDel(&dbiflatGTypStr);
479     ajStrDel(&dbiflatGTmpId);
480 
481     if(dbiflatGFdl)
482     {
483 	for(i=0; i < nfields; i++)
484 	    ajListFree(&dbiflatGFdl[i]);
485 	AJFREE(dbiflatGFdl);
486     }
487 
488     ajListMap(idlist, &embDbiEntryDelMap, NULL);
489     ajListFree(&idlist);
490     ajListstrFreeData(&listInputFiles);
491     AJFREE(entryIds);
492     ajStrDel(&curfilename);
493 
494     embExit();
495 
496     return 0;
497 }
498 
499 
500 
501 
502 /* @funcstatic dbiflat_NextFlatEntry ******************************************
503 **
504 ** Returns next database entry as an EmbPEntry object
505 **
506 ** @param [u] libr [AjPFile] Database file
507 ** @param [r] ifile [ajuint] File number.
508 ** @param [r] idformat [const AjPStr] Format to be used
509 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
510 ** @param [u] fields [AjPStr*] Fields to be indexed
511 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
512 ** @param [w] maxidlen [ajuint*] Maximum entry ID length
513 ** @param [w] countfield [ajuint*] Number of tokens for each field
514 ** @param [u] elistfile [AjPFile] entry file
515 ** @param [u] alistfile [AjPFile*] field data files array
516 ** @return [EmbPEntry] Entry data object.
517 ** @@
518 ******************************************************************************/
519 
dbiflat_NextFlatEntry(AjPFile libr,ajuint ifile,const AjPStr idformat,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * maxidlen,ajuint * countfield,AjPFile elistfile,AjPFile * alistfile)520 static EmbPEntry dbiflat_NextFlatEntry(AjPFile libr, ajuint ifile,
521 				       const AjPStr idformat,
522 				       AjBool systemsort,
523 				       AjPStr* fields, ajint* maxFieldLen,
524 				       ajuint* maxidlen, ajuint* countfield,
525 				       AjPFile elistfile, AjPFile* alistfile)
526 {
527     ajint ir;
528     ajint is = 0;
529     char* token;
530     ajint i;
531     static ajint called  = 0;
532     static ajint iparser = -1;
533     static ajint nfields;
534     ajint ifield;
535 
536     if(!called)
537     {
538 	for(i=0; parser[i].Name; i++)
539 	    if(ajStrMatchC(idformat, parser[i].Name))
540 	    {
541 		iparser = i;
542 		break;
543 	    }
544 
545 	if(iparser < 0)
546 	    ajFatal("idformat '%S' unknown", idformat);
547     }
548 
549     if(!dbiflatGFdl)
550     {
551 	nfields = 0;
552 	while(fields[nfields])
553 	    nfields++;
554 	if(nfields)
555 	    AJCNEW(dbiflatGFdl, nfields);
556 	for(i=0; i < nfields; i++)
557 	    dbiflatGFdl[i] = ajListNew();
558     }
559 
560     if(!dbiflatGEntry || !systemsort)
561 	dbiflatGEntry = embDbiEntryNew(nfields);
562 
563     if(!(*parser[iparser].Parser)(libr, alistfile, systemsort, fields,
564                                   maxFieldLen, countfield, &ir,
565                                   &dbiflatGTmpId, dbiflatGFdl))
566 	return NULL;
567 
568     /* dbiflatGTmpId to ret->entry */
569     if(ajStrGetLen(dbiflatGTmpId) > *maxidlen)
570 	*maxidlen = ajStrGetLen(dbiflatGTmpId);
571 
572     if(systemsort)
573 	ajFmtPrintF(elistfile, "%S %d %d %d\n",
574                     dbiflatGTmpId, ir, is, ifile+1);
575     else
576     {
577 	dbiflatGEntry->entry   = ajCharNewS(dbiflatGTmpId);
578 	dbiflatGEntry->rpos    = ir;
579 	dbiflatGEntry->spos    = is;
580 	dbiflatGEntry->filenum = ifile+1;
581 
582 	/* field tokens as list, then move to dbiflatGEntry->field */
583 	for(ifield=0; ifield < nfields; ifield++)
584 	{
585 	    dbiflatGEntry->nfield[ifield] =
586             (ajuint) ajListGetLength(dbiflatGFdl[ifield]);
587 
588 	    if(dbiflatGEntry->nfield[ifield])
589 	    {
590 	        AJCNEW(dbiflatGEntry->field[ifield],
591 		       dbiflatGEntry->nfield[ifield]);
592 
593 		i = 0;
594 		while(ajListPop(dbiflatGFdl[ifield],(void**) &token))
595 		    dbiflatGEntry->field[ifield][i++] = token;
596 	    }
597 	    else
598 	        dbiflatGEntry->field[ifield] = NULL;
599 	}
600     }
601 
602     return dbiflatGEntry;
603 }
604 
605 
606 
607 
608 /* @funcstatic dbiflat_ParseSwiss *********************************************
609 **
610 ** Parse the ID, accession from a SwissProt or UniProtKB entry.
611 **
612 ** Reads to the end of the entry and then returns.
613 **
614 ** @param [u] libr [AjPFile] Input database file
615 ** @param [u] alistfile [AjPFile*] field data files array
616 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
617 ** @param [w] fields [AjPStr*] Fields required
618 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
619 ** @param [w] countfield [ajuint*] Number of tokens for each field
620 ** @param [w] dpos [ajint*] Byte offset
621 ** @param [w] myid [AjPStr*] ID
622 ** @param [w] myfdl [AjPList*] Lists of field values
623 ** @return [AjBool] ajTrue on success.
624 ** @@
625 ******************************************************************************/
626 
dbiflat_ParseSwiss(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)627 static AjBool dbiflat_ParseSwiss(AjPFile libr, AjPFile* alistfile,
628                                  AjBool systemsort, AjPStr* fields,
629                                  ajint* maxFieldLen, ajuint* countfield,
630                                  ajint* dpos, AjPStr* myid,
631                                  AjPList* myfdl)
632 {
633     AjPStr tmpacnum = NULL;
634     char* fd;
635     ajint lineType;
636     static ajint numFields;
637     static ajint accfield = -1;
638     static ajint desfield = -1;
639     static ajint keyfield = -1;
640     static ajint taxfield = -1;
641     static ajint svnfield = -1;
642     static AjBool reset = AJTRUE;
643     AjBool svndone = ajFalse;
644     AjBool done = ajFalse;
645     ajint i;
646     ajint lo;
647     ajint hi;
648     ajint fieldwidth;
649     AjPStr tmpac = NULL;
650     AjPStr format = NULL;
651     AjPStr prefix = NULL;
652     const char* p;
653     const char* q;
654     const char* swissprefix[] = {
655         "RecName: ", "AltName: ", "SubName: ",
656         "Includes:", "Contains:", "Flags: ",
657         "Full=", "Short=", "EC=",
658         "Allergen=", "Biotech=", "CD_antigen=", "INN=",
659         NULL
660     };
661     ajuint j;
662 
663     if(!fields)
664     {
665 	reset = ajTrue;
666 	accfield = svnfield = desfield = keyfield = taxfield = -1;
667 	return ajFalse;
668     }
669 
670     if(reset)
671     {
672 	numFields = 0;
673 	while(fields[numFields])
674 	{
675 	    countfield[numFields]=0;
676 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
677 		accfield=numFields;
678 	    else if(ajStrMatchCaseC(fields[numFields], "sv"))
679 		svnfield=numFields;
680 	    else if(ajStrMatchCaseC(fields[numFields], "des"))
681 		desfield=numFields;
682 	    else if(ajStrMatchCaseC(fields[numFields], "key"))
683 		keyfield=numFields;
684 	    else if(ajStrMatchCaseC(fields[numFields], "org"))
685 		taxfield=numFields;
686 	    else
687 		ajWarn("EMBL parsing unknown field '%S' ignored",
688 		       fields[numFields]);
689 	    numFields++;
690 	}
691 
692 	reset = ajFalse;
693     }
694 
695     if(!dbiflatGRegEmblType)
696 	dbiflatGRegEmblType = ajRegCompC("^([A-Z][A-Z]) +");
697 
698     if(!dbiflatGRegEmblAcc)
699 	dbiflatGRegEmblAcc = ajRegCompC("([A-Za-z0-9-]+)");
700 
701     if(!dbiflatGRegEmblWrd)
702 	dbiflatGRegEmblWrd = ajRegCompC("([A-Za-z0-9_]+)");
703 
704     if(!dbiflatGRegEmblVer)
705 	dbiflatGRegEmblVer = ajRegCompC("([A-Za-z0-9_.]+)");
706 
707     if(!dbiflatGRegEmblPhr)
708 	dbiflatGRegEmblPhr = ajRegCompC(" *([^;.\n\r]+)");
709 
710     if(!dbiflatGRegEmblTax)
711 	dbiflatGRegEmblTax = ajRegCompC(" *([^;.\n\r()]+)");
712 
713     if(!dbiflatGRegEmblId)
714 	dbiflatGRegEmblId = ajRegCompC("^ID   ([^\\s;]+)(;\\s+SV\\s+(\\d+))?");
715 
716     if(!dbiflatGRegEmblEnd)
717 	dbiflatGRegEmblEnd = ajRegCompC("^//");
718 
719     *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */
720 
721     while(ajReadline(libr, &dbiflatGRline))
722     {
723 	if(ajRegExec(dbiflatGRegEmblEnd, dbiflatGRline))
724 	{
725 	    done = ajTrue;
726 	    break;
727 	}
728 
729 	if(ajRegExec(dbiflatGRegEmblType, dbiflatGRline))
730 	{
731 	    ajRegSubI(dbiflatGRegEmblType, 1, &dbiflatGTypStr);
732 	    if(ajStrMatchC(dbiflatGTypStr, "ID"))
733 		lineType = FLATTYPE_ID;
734 	    else if(ajStrMatchC(dbiflatGTypStr, "SV") ||
735 		    ajStrMatchC(dbiflatGTypStr, "IV")) /* emblcds database */
736 		lineType = FLATTYPE_VER;
737 	    else if(ajStrMatchC(dbiflatGTypStr, "AC") ||
738 		    ajStrMatchC(dbiflatGTypStr, "PA")) /* emblcds database */
739 		lineType = FLATTYPE_ACC;
740 	    else if(ajStrMatchC(dbiflatGTypStr, "DE"))
741 		lineType = FLATTYPE_DES;
742 	    else if(ajStrMatchC(dbiflatGTypStr, "KW"))
743 		lineType = FLATTYPE_KEY;
744 	    else if(ajStrMatchC(dbiflatGTypStr, "OS"))
745 		lineType = FLATTYPE_TAX;
746 	    else if(ajStrMatchC(dbiflatGTypStr, "OC"))
747 		lineType = FLATTYPE_TAX;
748 	    else
749 		lineType=FLATTYPE_OTHER;
750 
751 	    if(lineType != FLATTYPE_OTHER)
752 		ajRegPost(dbiflatGRegEmblType, &dbiflatGTmpLine);
753 	}
754 	else
755 	    lineType = FLATTYPE_OTHER;
756 
757 	if(lineType == FLATTYPE_ID)
758 	{
759 	    ajRegExec(dbiflatGRegEmblId, dbiflatGRline);
760 	    ajRegSubI(dbiflatGRegEmblId, 1, myid);
761 	    ajStrFmtUpper(myid);
762 	    ajDebug("++id '%S'\n", *myid);
763 	    ajRegSubI(dbiflatGRegEmblId, 3, &dbiflatGTmpFd);
764 	    if(svnfield >= 0 && ajStrGetLen(dbiflatGTmpFd))
765 	    {
766 		ajStrFmtUpper(&dbiflatGTmpFd);
767 		ajStrInsertK(&dbiflatGTmpFd, 0, '.');
768 		ajStrInsertS(&dbiflatGTmpFd, 0, *myid);
769 		/*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
770 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
771 
772 		countfield[svnfield]++;
773 		if(systemsort)
774 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
775                                 *myid, dbiflatGTmpFd);
776 		else
777 		{
778 		    fd = ajCharNewS(dbiflatGTmpFd);
779 		    ajListPushAppend(myfdl[svnfield], fd);
780 		}
781 		svndone = ajTrue;
782 	    }
783 	    continue;
784 	}
785 
786 	if(lineType == FLATTYPE_ACC && accfield >= 0)
787 	{
788 	    while(ajRegExec(dbiflatGRegEmblAcc, dbiflatGTmpLine))
789 	    {
790 		ajRegSubI(dbiflatGRegEmblAcc, 1, &dbiflatGTmpFd);
791 		ajStrFmtUpper(&dbiflatGTmpFd);
792 		/*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
793 
794 		if(!tmpacnum)
795 		    ajStrAssignS(&tmpacnum, dbiflatGTmpFd);
796 
797 		if((p=strchr(MAJSTRGETPTR(dbiflatGTmpFd),(int)'-')))
798 		{
799 		    q = p;
800 		    while(isdigit((int)*(--q)));
801 		    ++q;
802 		    ajStrAssignSubC(&dbiflatGTmpStr,q,0,(ajint)(p-q-1));
803 		    ajStrToInt(dbiflatGTmpStr,&lo);
804 		    fieldwidth = (ajint) (p-q);
805 		    ajFmtPrintS(&format,"%%S%%0%dd",fieldwidth);
806 
807 		    ++p;
808 		    q = p;
809 		    while(!isdigit((int)*q))
810 			++q;
811 		    sscanf(q,"%d",&hi);
812 		    ajStrAssignSubC(&prefix,p,0,(ajint)(q-p-1));
813 
814 		    if(systemsort)
815 		    {
816 			for(i=lo;i<=hi;++i)
817 			{
818 			    ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
819 			    embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
820 			    countfield[accfield]++;
821 			    ajFmtPrintF(alistfile[accfield],
822 					"%S %S\n", *myid, tmpac);
823 			}
824 			ajStrDel(&tmpac);
825 		    }
826 		    else
827 		    {
828 			for(i=lo;i<=hi;++i)
829 			{
830 			    ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
831 			    embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
832 			    countfield[accfield]++;
833 			    fd = ajCharNewS(tmpac);
834 			    ajListPushAppend(myfdl[accfield], fd);
835 			}
836 			ajStrDel(&tmpac);
837 		    }
838 		    ajStrDel(&format);
839 		    ajStrDel(&prefix);
840 		}
841 		else {
842 		    embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
843 
844 		    countfield[accfield]++;
845 		    if(systemsort)
846 			ajFmtPrintF(alistfile[accfield],
847 				    "%S %S\n", *myid, dbiflatGTmpFd);
848 		    else
849 		    {
850 			fd = ajCharNewS(dbiflatGTmpFd);
851 			ajListPushAppend(myfdl[accfield], fd);
852 		    }
853 		}
854 		ajRegPost(dbiflatGRegEmblAcc, &dbiflatGTmpStr);
855                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
856 	    }
857 	    continue;
858 	}
859 	else if(lineType == FLATTYPE_DES && desfield >= 0)
860 	{
861             ajStrTrimWhiteStart(&dbiflatGTmpLine);
862             for(j=0; swissprefix[j]; j++)
863             {
864                 if(ajStrPrefixC(dbiflatGTmpLine, swissprefix[j]))
865                     ajStrCutStart(&dbiflatGTmpLine, strlen(swissprefix[j]));
866             }
867 	    while(ajRegExec(dbiflatGRegEmblWrd, dbiflatGTmpLine))
868 	    {
869 		ajRegSubI(dbiflatGRegEmblWrd, 1, &dbiflatGTmpFd);
870 		ajStrFmtUpper(&dbiflatGTmpFd);
871 		/*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
872 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
873 
874 		countfield[desfield]++;
875 		if(systemsort)
876 		    ajFmtPrintF(alistfile[desfield], "%S %S\n",
877                                 *myid, dbiflatGTmpFd);
878 		else
879 		{
880 		    fd = ajCharNewS(dbiflatGTmpFd);
881 		    ajListPushAppend(myfdl[desfield], fd);
882 		}
883 		ajRegPost(dbiflatGRegEmblWrd, &dbiflatGTmpStr);
884                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
885 	    }
886 	    continue;
887 	}
888 	else if(lineType == FLATTYPE_VER && svnfield >= 0)
889 	{
890 	    while(ajRegExec(dbiflatGRegEmblVer, dbiflatGTmpLine))
891 	    {
892 		ajRegSubI(dbiflatGRegEmblVer, 1, &dbiflatGTmpFd);
893 		ajStrFmtUpper(&dbiflatGTmpFd);
894 		/*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
895 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
896 
897 		countfield[svnfield]++;
898 		if(systemsort)
899 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
900                                 *myid, dbiflatGTmpFd);
901 		else
902 		{
903 		    fd = ajCharNewS(dbiflatGTmpFd);
904 		    ajListPushAppend(myfdl[svnfield], fd);
905 		}
906 		ajRegPost(dbiflatGRegEmblVer, &dbiflatGTmpStr);
907                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
908             }
909 	    svndone = ajTrue;
910 	    continue;
911 	}
912 	else if(lineType == FLATTYPE_KEY && keyfield >= 0)
913 	{
914 	    while(ajRegExec(dbiflatGRegEmblPhr, dbiflatGTmpLine))
915 	    {
916 		ajRegSubI(dbiflatGRegEmblPhr, 1, &dbiflatGTmpFd);
917 		ajRegPost(dbiflatGRegEmblPhr, &dbiflatGTmpStr);
918                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
919 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
920 		if(!ajStrGetLen(dbiflatGTmpFd))
921 		    continue;
922 		ajStrFmtUpper(&dbiflatGTmpFd);
923 		/*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
924 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
925 
926 		countfield[keyfield]++;
927 		if(systemsort)
928 		    ajFmtPrintF(alistfile[keyfield], "%S %S\n",
929                                 *myid, dbiflatGTmpFd);
930 		else
931 		{
932 		    fd = ajCharNewS(dbiflatGTmpFd);
933 		    ajListPushAppend(myfdl[keyfield], fd);
934 		}
935 	    }
936 	    continue;
937 	}
938 	else if(lineType == FLATTYPE_TAX && taxfield >= 0)
939 	{
940 	    while(ajRegExec(dbiflatGRegEmblTax, dbiflatGTmpLine))
941 	    {
942 		ajRegSubI(dbiflatGRegEmblTax, 1, &dbiflatGTmpFd);
943 		ajRegPost(dbiflatGRegEmblTax, &dbiflatGTmpStr);
944                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
945 		ajStrFmtUpper(&dbiflatGTmpFd);
946 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
947 		if(!ajStrGetLen(dbiflatGTmpFd))
948 		    continue;
949 		/*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
950 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
951 
952 		countfield[taxfield]++;
953 		if(systemsort)
954 		    ajFmtPrintF(alistfile[taxfield], "%S %S\n",
955                                 *myid, dbiflatGTmpFd);
956 		else
957 		{
958 		    fd = ajCharNewS(dbiflatGTmpFd);
959 		    ajListPushAppend(myfdl[taxfield], fd);
960 		}
961 	    }
962 	    continue;
963 	}
964     }
965 
966     if(!done)
967 	return ajFalse;
968 
969     if(svnfield >= 0 && !svndone && tmpacnum)
970     {
971 	ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
972 	embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
973 
974 	countfield[svnfield]++;
975 	if(systemsort)
976 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
977 	else
978 	{
979 	    fd = ajCharNewS(dbiflatGTmpFd);
980 	    ajListPushAppend(myfdl[svnfield], fd);
981 	}
982     }
983 
984     ajStrDel(&tmpacnum);
985 
986     return ajTrue;
987 }
988 
989 
990 
991 
992 /* @funcstatic dbiflat_ParseEmbl **********************************************
993 **
994 ** Parse the ID, accession from an EMBL entry.
995 **
996 ** Reads to the end of the entry and then returns.
997 **
998 ** @param [u] libr [AjPFile] Input database file
999 ** @param [u] alistfile [AjPFile*] field data files array
1000 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1001 ** @param [w] fields [AjPStr*] Fields required
1002 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1003 ** @param [w] countfield [ajuint*] Number of tokens for each field
1004 ** @param [w] dpos [ajint*] Byte offset
1005 ** @param [w] myid [AjPStr*] ID
1006 ** @param [w] myfdl [AjPList*] Lists of field values
1007 ** @return [AjBool] ajTrue on success.
1008 ** @@
1009 ******************************************************************************/
1010 
dbiflat_ParseEmbl(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1011 static AjBool dbiflat_ParseEmbl(AjPFile libr, AjPFile* alistfile,
1012 				AjBool systemsort, AjPStr* fields,
1013 				ajint* maxFieldLen, ajuint* countfield,
1014 				ajint* dpos, AjPStr* myid,
1015 				AjPList* myfdl)
1016 {
1017     AjPStr tmpacnum = NULL;
1018     char* fd;
1019     ajint lineType;
1020     static ajint numFields;
1021     static ajint accfield = -1;
1022     static ajint desfield = -1;
1023     static ajint keyfield = -1;
1024     static ajint taxfield = -1;
1025     static ajint svnfield = -1;
1026     static AjBool reset = AJTRUE;
1027     AjBool svndone = ajFalse;
1028     AjBool done = ajFalse;
1029     ajint i = 0;
1030     ajint lo;
1031     ajint hi;
1032     ajint fieldwidth;
1033     AjPStr tmpac = NULL;
1034     AjPStr format = NULL;
1035     AjPStr prefix = NULL;
1036     const char* p;
1037     const char* q;
1038 
1039     if(!fields)
1040     {
1041 	reset = ajTrue;
1042 	accfield = svnfield = desfield = keyfield = taxfield = -1;
1043 	return ajFalse;
1044     }
1045 
1046     if(reset)
1047     {
1048 	numFields = 0;
1049 	while(fields[numFields])
1050 	{
1051 	    countfield[numFields]=0;
1052 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1053 		accfield=numFields;
1054 	    else if(ajStrMatchCaseC(fields[numFields], "sv"))
1055 		svnfield=numFields;
1056 	    else if(ajStrMatchCaseC(fields[numFields], "des"))
1057 		desfield=numFields;
1058 	    else if(ajStrMatchCaseC(fields[numFields], "key"))
1059 		keyfield=numFields;
1060 	    else if(ajStrMatchCaseC(fields[numFields], "org"))
1061 		taxfield=numFields;
1062 	    else
1063 		ajWarn("EMBL parsing unknown field '%S' ignored",
1064 		       fields[numFields]);
1065 	    numFields++;
1066 	}
1067 
1068 	reset = ajFalse;
1069     }
1070 
1071     if(!dbiflatGRegEmblType)
1072 	dbiflatGRegEmblType = ajRegCompC("^([A-Z][A-Z]) +");
1073 
1074     if(!dbiflatGRegEmblAcc)
1075 	dbiflatGRegEmblAcc = ajRegCompC("([A-Za-z0-9-]+)");
1076 
1077     if(!dbiflatGRegEmblWrd)
1078 	dbiflatGRegEmblWrd = ajRegCompC("([A-Za-z0-9_]+)");
1079 
1080     if(!dbiflatGRegEmblVer)
1081 	dbiflatGRegEmblVer = ajRegCompC("([A-Za-z0-9_.]+)");
1082 
1083     if(!dbiflatGRegEmblPhr)
1084 	dbiflatGRegEmblPhr = ajRegCompC(" *([^;.\n\r]+)");
1085 
1086     if(!dbiflatGRegEmblTax)
1087 	dbiflatGRegEmblTax = ajRegCompC(" *([^;.\n\r()]+)");
1088 
1089     if(!dbiflatGRegEmblId)
1090 	dbiflatGRegEmblId = ajRegCompC("^ID   ([^\\s;]+)(;\\s+SV\\s+(\\d+))?");
1091 
1092     if(!dbiflatGRegEmblEnd)
1093 	dbiflatGRegEmblEnd = ajRegCompC("^//");
1094 
1095     *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */
1096 
1097     while(ajReadline(libr, &dbiflatGRline))
1098     {
1099 	if(ajRegExec(dbiflatGRegEmblEnd, dbiflatGRline))
1100 	{
1101 	    done = ajTrue;
1102 	    break;
1103 	}
1104 
1105 	if(ajRegExec(dbiflatGRegEmblType, dbiflatGRline))
1106 	{
1107 	    ajRegSubI(dbiflatGRegEmblType, 1, &dbiflatGTypStr);
1108 	    if(ajStrMatchC(dbiflatGTypStr, "ID"))
1109 		lineType = FLATTYPE_ID;
1110 	    else if(ajStrMatchC(dbiflatGTypStr, "SV") ||
1111 		    ajStrMatchC(dbiflatGTypStr, "IV")) /* emblcds database */
1112 		lineType = FLATTYPE_VER;
1113 	    else if(ajStrMatchC(dbiflatGTypStr, "AC") ||
1114 		    ajStrMatchC(dbiflatGTypStr, "PA")) /* emblcds database */
1115 		lineType = FLATTYPE_ACC;
1116 	    else if(ajStrMatchC(dbiflatGTypStr, "DE"))
1117 		lineType = FLATTYPE_DES;
1118 	    else if(ajStrMatchC(dbiflatGTypStr, "KW"))
1119 		lineType = FLATTYPE_KEY;
1120 	    else if(ajStrMatchC(dbiflatGTypStr, "OS"))
1121 		lineType = FLATTYPE_TAX;
1122 	    else if(ajStrMatchC(dbiflatGTypStr, "OC"))
1123 		lineType = FLATTYPE_TAX;
1124 	    else
1125 		lineType=FLATTYPE_OTHER;
1126 
1127 	    if(lineType != FLATTYPE_OTHER)
1128 		ajRegPost(dbiflatGRegEmblType, &dbiflatGTmpLine);
1129 	}
1130 	else
1131 	    lineType = FLATTYPE_OTHER;
1132 
1133 	if(lineType == FLATTYPE_ID)
1134 	{
1135 	    ajRegExec(dbiflatGRegEmblId, dbiflatGRline);
1136 	    ajRegSubI(dbiflatGRegEmblId, 1, myid);
1137 	    ajStrFmtUpper(myid);
1138 	    ajDebug("++id '%S'\n", *myid);
1139 	    ajRegSubI(dbiflatGRegEmblId, 3, &dbiflatGTmpFd);
1140 	    if(svnfield >= 0 && ajStrGetLen(dbiflatGTmpFd))
1141 	    {
1142 		ajStrFmtUpper(&dbiflatGTmpFd);
1143 		ajStrInsertK(&dbiflatGTmpFd, 0, '.');
1144 		ajStrInsertS(&dbiflatGTmpFd, 0, *myid);
1145 		/*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
1146 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1147 
1148 		countfield[svnfield]++;
1149 		if(systemsort)
1150 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1151                                 *myid, dbiflatGTmpFd);
1152 		else
1153 		{
1154 		    fd = ajCharNewS(dbiflatGTmpFd);
1155 		    ajListPushAppend(myfdl[svnfield], fd);
1156 		}
1157 		svndone = ajTrue;
1158 	    }
1159 	    continue;
1160 	}
1161 
1162 	if(lineType == FLATTYPE_ACC && accfield >= 0)
1163 	{
1164 	    while(ajRegExec(dbiflatGRegEmblAcc, dbiflatGTmpLine))
1165 	    {
1166 		ajRegSubI(dbiflatGRegEmblAcc, 1, &dbiflatGTmpFd);
1167 		ajStrFmtUpper(&dbiflatGTmpFd);
1168 		/*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1169 
1170 		if(!tmpacnum)
1171 		    ajStrAssignS(&tmpacnum, dbiflatGTmpFd);
1172 
1173 		if((p=strchr(MAJSTRGETPTR(dbiflatGTmpFd),(int)'-')))
1174 		{
1175 		    q = p;
1176 		    while(isdigit((int)*(--q)));
1177 		    ++q;
1178 		    ajStrAssignSubC(&dbiflatGTmpStr,q,0,(ajint)(p-q-1));
1179 		    ajStrToInt(dbiflatGTmpStr,&lo);
1180 		    fieldwidth = (ajint) (p-q);
1181 		    ajFmtPrintS(&format,"%%S%%0%dd",fieldwidth);
1182 
1183 		    ++p;
1184 		    q = p;
1185 		    while(!isdigit((int)*q))
1186 			++q;
1187 		    sscanf(q,"%d",&hi);
1188 		    ajStrAssignSubC(&prefix,p,0,(ajint)(q-p-1));
1189 
1190 		    if(systemsort)
1191 		    {
1192 			for(i=lo;i<=hi;++i)
1193 			{
1194 			    ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
1195 			    embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
1196 			    countfield[accfield]++;
1197 			    ajFmtPrintF(alistfile[accfield],
1198 					"%S %S\n", *myid, tmpac);
1199 			}
1200 			ajStrDel(&tmpac);
1201 		    }
1202 		    else
1203 		    {
1204 			for(i=lo;i<=hi;++i)
1205 			{
1206 			    ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
1207 			    embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
1208 			    countfield[accfield]++;
1209 			    fd = ajCharNewS(tmpac);
1210 			    ajListPushAppend(myfdl[accfield], fd);
1211 			}
1212 			ajStrDel(&tmpac);
1213 		    }
1214 		    ajStrDel(&format);
1215 		    ajStrDel(&prefix);
1216 		}
1217 		else {
1218 		    embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1219 
1220 		    countfield[accfield]++;
1221 		    if(systemsort)
1222 			ajFmtPrintF(alistfile[accfield],
1223 				    "%S %S\n", *myid, dbiflatGTmpFd);
1224 		    else
1225 		    {
1226 			fd = ajCharNewS(dbiflatGTmpFd);
1227 			ajListPushAppend(myfdl[accfield], fd);
1228 		    }
1229 		}
1230 		ajRegPost(dbiflatGRegEmblAcc, &dbiflatGTmpStr);
1231                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1232 	    }
1233 	    continue;
1234 	}
1235 	else if(lineType == FLATTYPE_DES && desfield >= 0)
1236 	{
1237 	    while(ajRegExec(dbiflatGRegEmblWrd, dbiflatGTmpLine))
1238 	    {
1239 		ajRegSubI(dbiflatGRegEmblWrd, 1, &dbiflatGTmpFd);
1240 		ajStrFmtUpper(&dbiflatGTmpFd);
1241 		/*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1242 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1243 
1244 		countfield[desfield]++;
1245 		if(systemsort)
1246 		    ajFmtPrintF(alistfile[desfield], "%S %S\n",
1247                                 *myid, dbiflatGTmpFd);
1248 		else
1249 		{
1250 		    fd = ajCharNewS(dbiflatGTmpFd);
1251 		    ajListPushAppend(myfdl[desfield], fd);
1252 		}
1253 		ajRegPost(dbiflatGRegEmblWrd, &dbiflatGTmpStr);
1254                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1255 	    }
1256 	    continue;
1257 	}
1258 	else if(lineType == FLATTYPE_VER && svnfield >= 0)
1259 	{
1260 	    while(ajRegExec(dbiflatGRegEmblVer, dbiflatGTmpLine))
1261 	    {
1262 		ajRegSubI(dbiflatGRegEmblVer, 1, &dbiflatGTmpFd);
1263 		ajStrFmtUpper(&dbiflatGTmpFd);
1264 		/*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
1265 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1266 
1267 		countfield[svnfield]++;
1268 		if(systemsort)
1269 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1270                                 *myid, dbiflatGTmpFd);
1271 		else
1272 		{
1273 		    fd = ajCharNewS(dbiflatGTmpFd);
1274 		    ajListPushAppend(myfdl[svnfield], fd);
1275 		}
1276 		ajRegPost(dbiflatGRegEmblVer, &dbiflatGTmpStr);
1277                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1278             }
1279 	    svndone = ajTrue;
1280 	    continue;
1281 	}
1282 	else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1283 	{
1284 	    while(ajRegExec(dbiflatGRegEmblPhr, dbiflatGTmpLine))
1285 	    {
1286 		ajRegSubI(dbiflatGRegEmblPhr, 1, &dbiflatGTmpFd);
1287 		ajRegPost(dbiflatGRegEmblPhr, &dbiflatGTmpStr);
1288                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1289 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1290 		if(!ajStrGetLen(dbiflatGTmpFd))
1291 		    continue;
1292 		ajStrFmtUpper(&dbiflatGTmpFd);
1293 		/*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1294 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1295 
1296 		countfield[keyfield]++;
1297 		if(systemsort)
1298 		    ajFmtPrintF(alistfile[keyfield], "%S %S\n",
1299                                 *myid, dbiflatGTmpFd);
1300 		else
1301 		{
1302 		    fd = ajCharNewS(dbiflatGTmpFd);
1303 		    ajListPushAppend(myfdl[keyfield], fd);
1304 		}
1305 	    }
1306 	    continue;
1307 	}
1308 	else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1309 	{
1310 	    while(ajRegExec(dbiflatGRegEmblTax, dbiflatGTmpLine))
1311 	    {
1312 		ajRegSubI(dbiflatGRegEmblTax, 1, &dbiflatGTmpFd);
1313 		ajRegPost(dbiflatGRegEmblTax, &dbiflatGTmpStr);
1314                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1315 		ajStrFmtUpper(&dbiflatGTmpFd);
1316 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1317 		if(!ajStrGetLen(dbiflatGTmpFd))
1318 		    continue;
1319 		/*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1320 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1321 
1322 		countfield[taxfield]++;
1323 		if(systemsort)
1324 		    ajFmtPrintF(alistfile[taxfield], "%S %S\n",
1325                                 *myid, dbiflatGTmpFd);
1326 		else
1327 		{
1328 		    fd = ajCharNewS(dbiflatGTmpFd);
1329 		    ajListPushAppend(myfdl[taxfield], fd);
1330 		}
1331 	    }
1332 	    continue;
1333 	}
1334     }
1335 
1336     if(!done)
1337 	return ajFalse;
1338 
1339     if(svnfield >= 0 && !svndone && tmpacnum)
1340     {
1341 	ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1342 	embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1343 
1344 	countfield[svnfield]++;
1345 	if(systemsort)
1346 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1347 	else
1348 	{
1349 	    fd = ajCharNewS(dbiflatGTmpFd);
1350 	    ajListPushAppend(myfdl[svnfield], fd);
1351 	}
1352     }
1353 
1354     ajStrDel(&tmpacnum);
1355 
1356     return ajTrue;
1357 }
1358 
1359 
1360 
1361 
1362 /* @funcstatic dbiflat_ParseGenbank *******************************************
1363 **
1364 ** Parse the ID, accession from a Genbank entry
1365 **
1366 ** @param [u] libr [AjPFile] Input database file
1367 ** @param [u] alistfile [AjPFile*] field data files array
1368 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1369 ** @param [w] fields [AjPStr*] Fields required
1370 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1371 ** @param [w] countfield [ajuint*] Number of tokens for each field
1372 ** @param [w] dpos [ajint*] Byte offset
1373 ** @param [w] myid [AjPStr*] ID
1374 ** @param [w] myfdl [AjPList*] Lists of field values
1375 ** @return [AjBool] ajTrue on success.
1376 ** @@
1377 ******************************************************************************/
1378 
dbiflat_ParseGenbank(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1379 static AjBool dbiflat_ParseGenbank(AjPFile libr, AjPFile* alistfile,
1380 				   AjBool systemsort, AjPStr* fields,
1381 				   ajint* maxFieldLen, ajuint* countfield,
1382 				   ajint* dpos, AjPStr* myid,
1383 				   AjPList* myfdl)
1384 {
1385     ajint lineType  = FLATTYPE_OTHER;
1386     AjPStr tmpacnum = NULL;
1387     char* fd;
1388     ajlong ipos = 0;
1389     static ajint numFields;
1390     static ajint accfield = -1;
1391     static ajint desfield = -1;
1392     static ajint keyfield = -1;
1393     static ajint taxfield = -1;
1394     static ajint svnfield = -1;
1395     static AjBool reset = AJTRUE;
1396     AjBool done = ajFalse;
1397     AjBool svndone = ajFalse;
1398 
1399     if(!fields)
1400     {
1401 	reset = ajTrue;
1402 	accfield = svnfield = desfield = keyfield = taxfield = -1;
1403 	return ajFalse;
1404     }
1405 
1406     if(reset)
1407     {
1408 	numFields = 0;
1409 	while(fields[numFields])
1410 	{
1411 	    countfield[numFields]=0;
1412 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1413 		accfield=numFields;
1414 	    else if(ajStrMatchCaseC(fields[numFields], "sv"))
1415 		svnfield=numFields;
1416 	    else if(ajStrMatchCaseC(fields[numFields], "des"))
1417 		desfield=numFields;
1418 	    else if(ajStrMatchCaseC(fields[numFields], "key"))
1419 		keyfield=numFields;
1420 	    else if(ajStrMatchCaseC(fields[numFields], "org"))
1421 		taxfield=numFields;
1422 	    else
1423 		ajWarn("GenBank parsing unknown field '%S' ignored",
1424 		       fields[numFields]);
1425 
1426 	    numFields++;
1427 	}
1428 	reset = ajFalse;
1429     }
1430 
1431     if(!dbiflatGRegGbType)
1432 	dbiflatGRegGbType = ajRegCompC("^(  )?([A-Z]+)");
1433 
1434     if(!dbiflatGRegGbMore)
1435 	dbiflatGRegGbMore = ajRegCompC("^            ");
1436 
1437     if(!dbiflatGRegGbWrd)
1438 	dbiflatGRegGbWrd = ajRegCompC("([A-Za-z0-9_]+)");
1439 
1440     if(!dbiflatGRegGbPhr)
1441 	dbiflatGRegGbPhr = ajRegCompC(" *([^;.\n\r]+)");
1442 
1443     if(!dbiflatGRegGbTax)
1444 	dbiflatGRegGbTax = ajRegCompC(" *([^;.\n\r()]+)");
1445 
1446     if(!dbiflatGRegGbVer)
1447 	dbiflatGRegGbVer = ajRegCompC("([A-Za-z0-9.]+)( +GI:([0-9]+))?");
1448 
1449     if(!dbiflatGRegGbEnd)
1450 	dbiflatGRegGbEnd = ajRegCompC("^//");
1451 
1452     ipos = ajFileResetPos(libr);
1453 
1454     while(ajReadline(libr, &dbiflatGRline))
1455     {
1456 	if(ajRegExec(dbiflatGRegGbEnd, dbiflatGRline))
1457 	{
1458 	    done = ajTrue;
1459 	    break;
1460 	}
1461 
1462 	if(ajRegExec(dbiflatGRegGbType, dbiflatGRline))
1463 	{
1464 	    ajRegSubI(dbiflatGRegGbType, 2, &dbiflatGTypStr);
1465 	    if(ajStrMatchC(dbiflatGTypStr, "LOCUS"))
1466 		lineType = FLATTYPE_ID;
1467 	    else if(ajStrMatchC(dbiflatGTypStr, "VERSION"))
1468 		lineType = FLATTYPE_VER;
1469 	    else if(ajStrMatchC(dbiflatGTypStr, "ACCESSION"))
1470 		lineType = FLATTYPE_ACC;
1471 	    else if(ajStrMatchC(dbiflatGTypStr, "DEFINITION"))
1472 		lineType = FLATTYPE_DES;
1473 	    else if(ajStrMatchC(dbiflatGTypStr, "KEYWORDS"))
1474 		lineType = FLATTYPE_KEY;
1475 	    else if(ajStrMatchC(dbiflatGTypStr, "ORGANISM"))
1476 		lineType = FLATTYPE_TAX;
1477 	    else lineType=FLATTYPE_OTHER;
1478 
1479 	    if(lineType != FLATTYPE_OTHER)
1480 		ajRegPost(dbiflatGRegGbType, &dbiflatGTmpLine);
1481 	    /*ajDebug("++type line %d\n", lineType);*/
1482 	}
1483 	else if(lineType != FLATTYPE_OTHER &&
1484                 ajRegExec(dbiflatGRegGbMore, dbiflatGRline))
1485 	{
1486 	    ajRegPost(dbiflatGRegGbMore, &dbiflatGTmpLine);
1487 	    /*ajDebug("++more line %d\n", lineType);*/
1488 	}
1489 	else
1490 	    lineType = FLATTYPE_OTHER;
1491 
1492 	if(lineType == FLATTYPE_ID)
1493 	{
1494 	    ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine);
1495 	    ajRegSubI(dbiflatGRegGbWrd, 1, myid);
1496 	    *dpos = (ajint) ipos; /* Lossy cast */
1497 	}
1498 
1499 	else if(lineType == FLATTYPE_ACC && accfield >= 0)
1500 	{
1501 	    while(ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine))
1502 	    {
1503 		ajRegSubI(dbiflatGRegGbWrd, 1, &dbiflatGTmpFd);
1504 		ajStrFmtUpper(&dbiflatGTmpFd);
1505 		/*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1506 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1507 
1508 		countfield[accfield]++;
1509 		if(systemsort)
1510 		    ajFmtPrintF(alistfile[accfield], "%S %S\n",
1511                                 *myid, dbiflatGTmpFd);
1512 		else
1513 		{
1514 		    fd = ajCharNewS(dbiflatGTmpFd);
1515 		    ajListPushAppend(myfdl[accfield], fd);
1516 		}
1517 		ajRegPost(dbiflatGRegGbWrd, &dbiflatGTmpStr);
1518                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1519 	    }
1520 	    continue;
1521 	}
1522 
1523 	else if(lineType == FLATTYPE_DES && desfield >= 0)
1524 	{
1525 	    while(ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine))
1526 	    {
1527 	        ajRegSubI(dbiflatGRegGbWrd, 1, &dbiflatGTmpFd);
1528 		ajStrFmtUpper(&dbiflatGTmpFd);
1529 		/*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1530 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1531 
1532 		countfield[desfield]++;
1533 		if(systemsort)
1534 		    ajFmtPrintF(alistfile[desfield],
1535 				"%S %S\n", *myid, dbiflatGTmpFd);
1536 		else
1537 		{
1538 		    fd = ajCharNewS(dbiflatGTmpFd);
1539 		    ajListPushAppend(myfdl[desfield], fd);
1540 		}
1541 		ajRegPost(dbiflatGRegGbWrd, &dbiflatGTmpStr);
1542                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1543 	    }
1544 	    continue;
1545 	}
1546 
1547 	else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1548 	{
1549 	    while(ajRegExec(dbiflatGRegGbPhr, dbiflatGTmpLine))
1550 	    {
1551 	        ajRegSubI(dbiflatGRegGbPhr, 1, &dbiflatGTmpFd);
1552 		ajRegPost(dbiflatGRegGbPhr, &dbiflatGTmpStr);
1553                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1554 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1555 		if(!ajStrGetLen(dbiflatGTmpFd))
1556 		    continue;
1557 		ajStrFmtUpper(&dbiflatGTmpFd);
1558 		/*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1559 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1560 
1561 		countfield[keyfield]++;
1562 		if(systemsort)
1563 		    ajFmtPrintF(alistfile[keyfield],
1564 				"%S %S\n", *myid, dbiflatGTmpFd);
1565 		else
1566 		{
1567 		    fd = ajCharNewS(dbiflatGTmpFd);
1568 		    ajListPushAppend(myfdl[keyfield], fd);
1569 		}
1570 	    }
1571 	    continue;
1572 	}
1573 
1574 	else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1575 	{
1576 	    while(ajRegExec(dbiflatGRegGbTax, dbiflatGTmpLine))
1577 	    {
1578 	        ajRegSubI(dbiflatGRegGbTax, 1, &dbiflatGTmpFd);
1579 		ajRegPost(dbiflatGRegGbTax, &dbiflatGTmpStr);
1580                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1581 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1582 		if(!ajStrGetLen(dbiflatGTmpFd))
1583 		    continue;
1584 		ajStrFmtUpper(&dbiflatGTmpFd);
1585 		/*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1586 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1587 
1588 		countfield[taxfield]++;
1589 		if(systemsort)
1590 		    ajFmtPrintF(alistfile[taxfield],
1591 				"%S %S\n", *myid, dbiflatGTmpFd);
1592 		else
1593 		{
1594 		    fd = ajCharNewS(dbiflatGTmpFd);
1595 		    ajListPushAppend(myfdl[taxfield], fd);
1596 		}
1597 	    }
1598 	    continue;
1599 	}
1600 
1601 	else if(lineType == FLATTYPE_VER && svnfield >= 0)
1602 	{
1603 	    if(ajRegExec(dbiflatGRegGbVer, dbiflatGTmpLine))
1604 	    {
1605 		ajRegSubI(dbiflatGRegGbVer, 1, &dbiflatGTmpFd);
1606 		ajStrFmtUpper(&dbiflatGTmpFd);
1607 		/*ajDebug("++ver '%S'\n", dbiflatGTmpFd);*/
1608 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1609 
1610 		if(systemsort)
1611 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1612                                 *myid, dbiflatGTmpFd);
1613 		else
1614 		{
1615 		    fd = ajCharNewS(dbiflatGTmpFd);
1616 		    ajListPushAppend(myfdl[svnfield], fd);
1617 		}
1618 		svndone = ajTrue;
1619 
1620 		ajRegSubI(dbiflatGRegGbVer, 3, &dbiflatGTmpFd);
1621 		if(!ajStrGetLen(dbiflatGTmpFd))
1622 		    continue;
1623 		ajStrFmtUpper(&dbiflatGTmpFd);
1624 		/*ajDebug("++ver gi: '%S'\n", dbiflatGTmpFd);*/
1625 
1626 		countfield[svnfield]++;
1627 		if(systemsort)
1628 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1629                                 *myid, dbiflatGTmpFd);
1630 		else
1631 		{
1632 		    fd = ajCharNewS(dbiflatGTmpFd);
1633 		    ajListPushAppend(myfdl[svnfield], fd);
1634 		}
1635 	    }
1636 	    continue;
1637 	}
1638 
1639 	ipos = ajFileResetPos(libr);
1640     }
1641 
1642     if(!done)
1643 	return ajFalse;
1644 
1645     if(svnfield >= 0 && !svndone && tmpacnum)
1646     {
1647 	ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1648 	embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1649 
1650 	countfield[svnfield]++;
1651 	if(systemsort)
1652 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1653 	else
1654 	{
1655 	    fd = ajCharNewS(dbiflatGTmpFd);
1656 	    ajListPushAppend(myfdl[svnfield], fd);
1657 	}
1658     }
1659 
1660     ajStrDel(&tmpacnum);
1661 
1662     return ajTrue;
1663 }
1664 
1665 
1666 
1667 
1668 /* @funcstatic dbiflat_ParseRefseq ********************************************
1669 **
1670 ** Parse the ID, accession from an NCBI REFSEQ entry
1671 **
1672 ** @param [u] libr [AjPFile] Input database file
1673 ** @param [u] alistfile [AjPFile*] field data files array
1674 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1675 ** @param [w] fields [AjPStr*] Fields required
1676 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1677 ** @param [w] countfield [ajuint*] Number of tokens for each field
1678 ** @param [w] dpos [ajint*] Byte offset
1679 ** @param [w] myid [AjPStr*] ID
1680 ** @param [w] myfdl [AjPList*] Lists of field values
1681 ** @return [AjBool] ajTrue on success.
1682 ** @@
1683 ******************************************************************************/
1684 
dbiflat_ParseRefseq(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1685 static AjBool dbiflat_ParseRefseq(AjPFile libr, AjPFile* alistfile,
1686 				  AjBool systemsort, AjPStr* fields,
1687 				  ajint* maxFieldLen, ajuint* countfield,
1688 				  ajint* dpos, AjPStr* myid,
1689 				  AjPList* myfdl)
1690 {
1691     ajint lineType = FLATTYPE_OTHER;
1692     AjPStr tmpacnum = NULL;
1693     char* fd;
1694     ajlong ipos = 0;
1695     static ajint numFields;
1696     static ajint accfield = -1;
1697     static ajint desfield = -1;
1698     static ajint keyfield = -1;
1699     static ajint taxfield = -1;
1700     static ajint svnfield = -1;
1701     static AjBool reset = AJTRUE;
1702     AjBool done = ajFalse;
1703     AjBool svndone = ajFalse;
1704 
1705     if(!fields)
1706     {
1707 	reset = ajTrue;
1708 	accfield = svnfield = desfield = keyfield = taxfield = -1;
1709 	return ajFalse;
1710     }
1711 
1712     if(reset)
1713     {
1714 	numFields = 0;
1715 	while(fields[numFields])
1716 	{
1717 	    countfield[numFields]=0;
1718 	    if(ajStrMatchCaseC(fields[numFields], "acc"))
1719 		accfield=numFields;
1720 	    else if(ajStrMatchCaseC(fields[numFields], "sv"))
1721 		svnfield=numFields;
1722 	    else if(ajStrMatchCaseC(fields[numFields], "des"))
1723 		desfield=numFields;
1724 	    else if(ajStrMatchCaseC(fields[numFields], "key"))
1725 		keyfield=numFields;
1726 	    else if(ajStrMatchCaseC(fields[numFields], "org"))
1727 		taxfield=numFields;
1728 	    else
1729 		ajWarn("GenBank parsing unknown field '%S' ignored",
1730 		       fields[numFields]);
1731 
1732 	    numFields++;
1733 	}
1734 	reset = ajFalse;
1735     }
1736 
1737     /*
1738     ** These are almost the same as GenBank, but with some exceptions noted
1739     */
1740 
1741     if(!dbiflatGRegRefseqTyp)
1742 	dbiflatGRegRefseqTyp = ajRegCompC("^(  )?([A-Z]+)");
1743 
1744     if(!dbiflatGRegRefseqMore)
1745 	dbiflatGRegRefseqMore = ajRegCompC("^            ");
1746 
1747     if(!dbiflatGRegRefseqWrd)
1748 	dbiflatGRegRefseqWrd = ajRegCompC("([A-Za-z0-9_]+)");
1749 
1750     if(!dbiflatGRegRefseqId) /* funny characters in IDs */
1751 	dbiflatGRegRefseqId = ajRegCompC("([^ \t\r\n]+)");
1752 
1753     if(!dbiflatGRegRefseqPhr)
1754 	dbiflatGRegRefseqPhr = ajRegCompC(" *([^;.\n\r]+)");
1755 
1756     if(!dbiflatGRegRefseqTax)
1757 	dbiflatGRegRefseqTax = ajRegCompC(" *([^;.\n\r()]+)");
1758 
1759     if(!dbiflatGRegRefseqVer) /* allow '_' in accession/version */
1760 	dbiflatGRegRefseqVer = ajRegCompC("([A-Za-z0-9_.]+)( +GI:([0-9]+))?");
1761 
1762     if(!dbiflatGRegRefseqEnd)
1763 	dbiflatGRegRefseqEnd = ajRegCompC("^//");
1764 
1765     ipos = ajFileResetPos(libr);
1766 
1767     while(ajReadline(libr, &dbiflatGRline))
1768     {
1769 	if(ajRegExec(dbiflatGRegRefseqEnd, dbiflatGRline))
1770 	{
1771 	    done = ajTrue;
1772 	    break;
1773 	}
1774 
1775 	if(ajRegExec(dbiflatGRegRefseqTyp, dbiflatGRline))
1776 	{
1777 	    ajRegSubI(dbiflatGRegRefseqTyp, 2, &dbiflatGTypStr);
1778 	    if(ajStrMatchC(dbiflatGTypStr, "LOCUS"))
1779 		lineType = FLATTYPE_ID;
1780 	    else if(ajStrMatchC(dbiflatGTypStr, "VERSION"))
1781 		lineType = FLATTYPE_VER;
1782 	    else if(ajStrMatchC(dbiflatGTypStr, "ACCESSION"))
1783 		lineType = FLATTYPE_ACC;
1784 	    else if(ajStrMatchC(dbiflatGTypStr, "DEFINITION"))
1785 		lineType = FLATTYPE_DES;
1786 	    else if(ajStrMatchC(dbiflatGTypStr, "KEYWORDS"))
1787 		lineType = FLATTYPE_KEY;
1788 	    else if(ajStrMatchC(dbiflatGTypStr, "ORGANISM"))
1789 		lineType = FLATTYPE_TAX;
1790 	    else
1791 		lineType=FLATTYPE_OTHER;
1792 
1793 	    if(lineType != FLATTYPE_OTHER)
1794 		ajRegPost(dbiflatGRegRefseqTyp, &dbiflatGTmpLine);
1795 	    /*ajDebug("++type line %d\n", lineType);*/
1796 	}
1797 	else if(lineType != FLATTYPE_OTHER &&
1798                 ajRegExec(dbiflatGRegRefseqMore, dbiflatGRline))
1799 	{
1800 	    ajRegPost(dbiflatGRegRefseqMore, &dbiflatGTmpLine);
1801 	    /*ajDebug("++more line %d\n", lineType);*/
1802 	}
1803 	else
1804 	    lineType = FLATTYPE_OTHER;
1805 
1806 	if(lineType == FLATTYPE_ID)    /* use REFSEQ-specific idexp */
1807 	{
1808 	    ajRegExec(dbiflatGRegRefseqId, dbiflatGTmpLine);
1809 	    ajRegSubI(dbiflatGRegRefseqId, 1, myid);
1810 	    ajStrFmtUpper(myid);
1811 	    *dpos = (ajint) ipos; /* Lossy cast */
1812 	}
1813 
1814 	else if(lineType == FLATTYPE_ACC && accfield >= 0)
1815 	{
1816 	    while(ajRegExec(dbiflatGRegRefseqWrd, dbiflatGTmpLine))
1817             /* should be OK */
1818 	    {
1819 		ajRegSubI(dbiflatGRegRefseqWrd, 1, &dbiflatGTmpFd);
1820 		ajStrFmtUpper(&dbiflatGTmpFd);
1821 		/*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1822 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1823 
1824 		countfield[accfield]++;
1825 		if(systemsort)
1826 		    ajFmtPrintF(alistfile[accfield], "%S %S\n",
1827                                 *myid, dbiflatGTmpFd);
1828 		else
1829 		{
1830 		    fd = ajCharNewS(dbiflatGTmpFd);
1831 		    ajListPushAppend(myfdl[accfield], fd);
1832 		}
1833 		ajRegPost(dbiflatGRegRefseqWrd, &dbiflatGTmpStr);
1834                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1835 	    }
1836 	    continue;
1837 	}
1838 	else if(lineType == FLATTYPE_DES && desfield >= 0)
1839 	{
1840 	    while(ajRegExec(dbiflatGRegRefseqWrd, dbiflatGTmpLine))
1841 	    {
1842 	        ajRegSubI(dbiflatGRegRefseqWrd, 1, &dbiflatGTmpFd);
1843 		ajStrFmtUpper(&dbiflatGTmpFd);
1844 		/*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1845 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1846 
1847 		countfield[desfield]++;
1848 		if(systemsort)
1849 		    ajFmtPrintF(alistfile[desfield],
1850 				"%S %S\n", *myid, dbiflatGTmpFd);
1851 		else
1852 		{
1853 		    fd = ajCharNewS(dbiflatGTmpFd);
1854 		    ajListPushAppend(myfdl[desfield], fd);
1855 		}
1856 		ajRegPost(dbiflatGRegRefseqWrd, &dbiflatGTmpStr);
1857                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1858 	    }
1859 	    continue;
1860 	}
1861 
1862 	else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1863 	{
1864 	    while(ajRegExec(dbiflatGRegRefseqPhr, dbiflatGTmpLine))
1865 	    {
1866 	        ajRegSubI(dbiflatGRegRefseqPhr, 1, &dbiflatGTmpFd);
1867 		ajRegPost(dbiflatGRegRefseqPhr, &dbiflatGTmpStr);
1868                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1869 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1870 		if(!ajStrGetLen(dbiflatGTmpFd))
1871 		    continue;
1872 		ajStrFmtUpper(&dbiflatGTmpFd);
1873 		/*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1874 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1875 
1876 		countfield[keyfield]++;
1877 		if(systemsort)
1878 		    ajFmtPrintF(alistfile[keyfield],
1879 				"%S %S\n", *myid, dbiflatGTmpFd);
1880 		else
1881 		{
1882 		    fd = ajCharNewS(dbiflatGTmpFd);
1883 		    ajListPushAppend(myfdl[keyfield], fd);
1884 		}
1885 	    }
1886 	    continue;
1887 	}
1888 	else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1889 	{
1890 	    while(ajRegExec(dbiflatGRegRefseqTax, dbiflatGTmpLine))
1891 	    {
1892 	        ajRegSubI(dbiflatGRegRefseqTax, 1, &dbiflatGTmpFd);
1893 		ajRegPost(dbiflatGRegRefseqTax, &dbiflatGTmpStr);
1894                 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1895 		ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1896 		if(!ajStrGetLen(dbiflatGTmpFd))
1897 		    continue;
1898 		ajStrFmtUpper(&dbiflatGTmpFd);
1899 		/*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1900 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1901 
1902 		countfield[taxfield]++;
1903 		if(systemsort)
1904 		    ajFmtPrintF(alistfile[taxfield],
1905 				"%S %S\n", *myid, dbiflatGTmpFd);
1906 		else
1907 		{
1908 		    fd = ajCharNewS(dbiflatGTmpFd);
1909 		    ajListPushAppend(myfdl[taxfield], fd);
1910 		}
1911 	    }
1912 	    continue;
1913 	}
1914 	else if(lineType == FLATTYPE_VER && svnfield >= 0)
1915 	{			       /* special verexp for REFSEQ */
1916 	    if(ajRegExec(dbiflatGRegRefseqVer, dbiflatGTmpLine))
1917 	    {
1918 		ajRegSubI(dbiflatGRegRefseqVer, 1, &dbiflatGTmpFd);
1919 		ajStrFmtUpper(&dbiflatGTmpFd);
1920 		/*ajDebug("++ver '%S'\n", dbiflatGTmpFd);*/
1921 		embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1922 
1923 		countfield[svnfield]++;
1924 		if(systemsort)
1925 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1926                                 *myid, dbiflatGTmpFd);
1927 		else
1928 		{
1929 		    fd = ajCharNewS(dbiflatGTmpFd);
1930 		    ajListPushAppend(myfdl[svnfield], fd);
1931 		}
1932 		svndone = ajTrue;
1933 
1934 		ajRegSubI(dbiflatGRegRefseqVer, 3, &dbiflatGTmpFd);
1935 		if(!ajStrGetLen(dbiflatGTmpFd)) continue;
1936 		ajStrFmtUpper(&dbiflatGTmpFd);
1937 		/*ajDebug("++ver gi: '%S'\n", dbiflatGTmpFd);*/
1938 
1939 		if(systemsort)
1940 		    ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1941                                 *myid, dbiflatGTmpFd);
1942 		else
1943 		{
1944 		    fd = ajCharNewS(dbiflatGTmpFd);
1945 		    ajListPushAppend(myfdl[svnfield], fd);
1946 		}
1947 	    }
1948 	    continue;
1949 	}
1950 
1951 	ipos = ajFileResetPos(libr);
1952     }
1953 
1954     if(!done)
1955 	return ajFalse;
1956 
1957     if(svnfield >= 0 && !svndone && tmpacnum)
1958     {
1959 	ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1960 	embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1961 
1962 	countfield[svnfield]++;
1963 	if(systemsort)
1964 	    ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1965 	else
1966 	{
1967 	    fd = ajCharNewS(dbiflatGTmpFd);
1968 	    ajListPushAppend(myfdl[svnfield], fd);
1969 	}
1970     }
1971 
1972     ajStrDel(&tmpacnum);
1973 
1974     return ajTrue;
1975 }
1976