1 /* @source dbiflat application
2 **
3 ** Index flatfile databases
4 **
5 ** @author Copyright (C) Peter Rice, Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 /******************************************************************************
24 **
25 ** EMBOSS/Staden/EMBLCD indexing
26 **
27 ** This version reads a flat file database,
28 ** and writes entryname and field (e.g. accession) index files.
29 **
30 ** It needs to know the format in order to
31 ** parse the entryname and accession number.
32 **
33 ** To save memory, it is also helpful to know the maximum number of
34 ** entries in the database and the maximum entryname length so that
35 ** space can be preallocated for storage.
36 **
37 ** Entry names and accession numbers are held in list structures,
38 ** then converted to arrays and sorted.
39 **
40 ** Multiple input files are allowed.
41 **
42 ** EMBLCD and Staden index files use different names but have essentially
43 ** the same contents.
44 **
45 ******************************************************************************/
46
47 #include "emboss.h"
48
49 #define FLATTYPE_OTHER 0
50 #define FLATTYPE_ID 1
51 #define FLATTYPE_ACC 2
52 #define FLATTYPE_DES 3
53 #define FLATTYPE_KEY 4
54 #define FLATTYPE_TAX 5
55 #define FLATTYPE_VER 6
56
/* Definition of global variables */
58
59 static AjPStr dbiflatGRline = NULL;
60 static AjPStr dbiflatGTmpId = NULL;
61 static AjPStr dbiflatGTmpStr = NULL;
62 static AjPStr dbiflatGTmpLine = NULL;
63 static AjPStr dbiflatGTmpFd = NULL;
64 static AjPStr dbiflatGTypStr = NULL;
65
66 static AjPRegexp dbiflatGRegEmblType = NULL;
67 static AjPRegexp dbiflatGRegEmblId = NULL;
68 static AjPRegexp dbiflatGRegEmblAcc = NULL;
69 static AjPRegexp dbiflatGRegEmblWrd = NULL;
70 static AjPRegexp dbiflatGRegEmblVer = NULL;
71 static AjPRegexp dbiflatGRegEmblPhr = NULL;
72 static AjPRegexp dbiflatGRegEmblTax = NULL;
73 static AjPRegexp dbiflatGRegEmblEnd = NULL;
74
75 static AjPRegexp dbiflatGRegGbType = NULL;
76 static AjPRegexp dbiflatGRegGbMore = NULL;
77 static AjPRegexp dbiflatGRegGbWrd = NULL;
78 static AjPRegexp dbiflatGRegGbPhr = NULL;
79 static AjPRegexp dbiflatGRegGbTax = NULL;
80 static AjPRegexp dbiflatGRegGbVer = NULL;
81 static AjPRegexp dbiflatGRegGbEnd = NULL;
82
83 static AjPRegexp dbiflatGRegRefseqTyp = NULL;
84 static AjPRegexp dbiflatGRegRefseqMore = NULL;
85 static AjPRegexp dbiflatGRegRefseqId = NULL;
86 static AjPRegexp dbiflatGRegRefseqWrd = NULL;
87 static AjPRegexp dbiflatGRegRefseqPhr = NULL;
88 static AjPRegexp dbiflatGRegRefseqTax = NULL;
89 static AjPRegexp dbiflatGRegRefseqVer = NULL;
90 static AjPRegexp dbiflatGRegRefseqEnd = NULL;
91
92 static EmbPEntry dbiflatGEntry = NULL;
93
94 static AjPList* dbiflatGFdl = NULL;
95
96 static AjBool dbiflat_ParseSwiss(AjPFile libr, AjPFile* alistfile,
97 AjBool systemsort, AjPStr* fields,
98 ajint* maxFieldLen, ajuint* countfield,
99 ajint *dpos, AjPStr* myid, AjPList* acl);
100 static AjBool dbiflat_ParseEmbl(AjPFile libr, AjPFile* alistfile,
101 AjBool systemsort, AjPStr* fields,
102 ajint* maxFieldLen, ajuint* countfield,
103 ajint *dpos, AjPStr* myid, AjPList* acl);
104 static AjBool dbiflat_ParseGenbank(AjPFile libr, AjPFile* alistfile,
105 AjBool systemsort, AjPStr* fields,
106 ajint* maxFieldLen, ajuint* countfield,
107 ajint *dpos, AjPStr* myid, AjPList* acl);
108 static AjBool dbiflat_ParseRefseq(AjPFile libr, AjPFile* alistfile,
109 AjBool systemsort, AjPStr* fields,
110 ajint* maxFieldLen, ajuint* countfield,
111 ajint *dpos, AjPStr* myid, AjPList* acl);
112
113
114
115
116 /* @datastatic DbiflatPParser *************************************************
117 **
118 ** Parser definition structure
119 **
120 ** @alias DbiflatSParser
121 ** @alias DbiflatOParser
122 **
123 ** @attr Name [const char*] Parser name
124 ** @attr Parser [AjBool function] Parser function
125 ** @@
126 ******************************************************************************/
127
128 typedef struct DbiflatSParser
129 {
130 const char* Name;
131 AjBool (*Parser) (AjPFile libr, AjPFile* alistfile,
132 AjBool systemsort, AjPStr* fields,
133 ajint* maxFieldLen, ajuint* countfield,
134 ajint *dpos, AjPStr* myid, AjPList* acl);
135 } DbiflatOParser;
136 #define DbiflatPParser DbiflatOParser*
137
138
139
140
141 static DbiflatOParser parser[] =
142 {
143 {"EMBL", dbiflat_ParseEmbl},
144 {"SWISS", dbiflat_ParseSwiss},
145 {"GB", dbiflat_ParseGenbank},
146 {"REFSEQ", dbiflat_ParseRefseq},
147 {NULL, NULL}
148 };
149
150
151
152 static EmbPEntry dbiflat_NextFlatEntry(AjPFile libr, ajuint ifile,
153 const AjPStr idformat,
154 AjBool systemsort,
155 AjPStr* fields, ajint* maxFieldLen,
156 ajuint* maxidlen, ajuint* countfield,
157 AjPFile elistfile, AjPFile* alistfile);
158
159
160
161
162
163 /* @prog dbiflat **************************************************************
164 **
165 ** Index a flat file database
166 **
167 ******************************************************************************/
168
main(int argc,char ** argv)169 int main(int argc, char **argv)
170 {
171
172 AjPList idlist;
173 AjPList* fieldList = NULL;
174
175 AjBool systemsort;
176 AjBool cleanup;
177
178 ajuint maxindex;
179 ajuint maxidlen = 0;
180 ajuint maxlen;
181
182 AjPFile elistfile = NULL;
183 AjPFile* alistfile = NULL;
184
185 AjPStr dbname = NULL;
186 AjPStr release = NULL;
187 AjPStr datestr = NULL;
188 AjPStr sortopt = NULL;
189 void **entryIds = NULL;
190
191 AjPStr directory;
192 AjPStr indexdir;
193 AjPStr filename;
194 AjPStr exclude;
195 AjPStr curfilename = NULL;
196 AjPFile libr = NULL;
197 AjPStr idformat = NULL;
198
199 EmbPEntry entry;
200
201 ajuint idCount = 0;
202 ajuint idDone;
203 AjPList listInputFiles = NULL;
204 void ** inputFiles = NULL;
205 ajuint nfiles;
206 ajuint ifile;
207
208 ajuint filesize;
209 short recsize;
210 ajuint maxfilelen = 20;
211 char date[4] =
212 {
213 0,0,0,0
214 };
215
216 AjPStr tmpfname = NULL;
217 AjPStr* fields = NULL;
218
219 AjPFile entFile = NULL;
220
221 AjPStr* divfiles = NULL;
222 ajint* maxFieldLen = NULL;
223
224 ajuint ifield = 0;
225 ajuint nfields = 0;
226
227 AjPFile logfile = NULL;
228 ajuint* countField = NULL;
229 ajuint* fieldTot = NULL;
230 ajuint idCountFile = 0;
231 ajuint i;
232
233 embInit("dbiflat", argc, argv);
234
235 idformat = ajAcdGetListSingle("idformat");
236 fields = ajAcdGetList("fields");
237 directory = ajAcdGetDirectoryName("directory");
238 indexdir = ajAcdGetOutdirName("indexoutdir");
239 filename = ajAcdGetString("filenames");
240 exclude = ajAcdGetString("exclude");
241 dbname = ajAcdGetString("dbname");
242 release = ajAcdGetString("release");
243 datestr = ajAcdGetString("date");
244 systemsort = ajAcdGetBoolean("systemsort");
245 cleanup = ajAcdGetBoolean("cleanup");
246 sortopt = ajAcdGetString("sortoptions");
247 maxindex = ajAcdGetInt("maxindex");
248 logfile = ajAcdGetOutfile("outfile");
249
250 while(fields[nfields]) /* array ends with a NULL */
251 nfields++;
252
253 if(nfields)
254 {
255 AJCNEW(maxFieldLen, nfields);
256 AJCNEW0(countField, nfields);
257 AJCNEW0(fieldTot, nfields);
258 for(ifield=0; ifield < nfields; ifield++)
259 maxFieldLen[ifield] = (ajint)maxindex * -1; /* -maxindex illegal */
260
261 if(systemsort)
262 AJCNEW(alistfile, nfields);
263 else
264 {
265 AJCNEW(fieldList, nfields);
266 for(ifield=0; ifield < nfields; ifield++)
267 fieldList[ifield] = ajListNew();
268 }
269 }
270
271 if(ajStrMatchC(datestr, "00/00/00"))
272 ajFmtPrintS(&datestr, "%D", ajTimeRefTodayFmt("dbindex"));
273
274 ajStrRemoveWhite(&dbname); /* used for temp filenames */
275 embDbiDateSet(datestr, date);
276 idlist = ajListNew();
277
278 ajDebug("reading '%S/%S'\n", directory, filename);
279 ajDebug("writing '%S/'\n", indexdir);
280
281 listInputFiles = embDbiFileListExc(directory, filename, exclude);
282 ajListSort(listInputFiles, &ajStrVcmp);
283 nfiles = (ajuint) ajListToarray(listInputFiles, &inputFiles);
284 if(!nfiles)
285 ajDie("No input files in '%S' matched filename '%S'",
286 directory, filename);
287
288 embDbiLogHeader(logfile, dbname, release, datestr,
289 indexdir, maxindex);
290
291 embDbiLogFields(logfile, fields, nfields);
292 embDbiLogSource(logfile, directory, filename, exclude,
293 (AjPStr*) inputFiles, nfiles);
294 embDbiLogCmdline(logfile);
295
296 AJCNEW0(divfiles, nfiles);
297
298 /* process each input file, one at a time */
299 for(ifile=0; ifile < nfiles; ifile++)
300 {
301 ajStrAssignS(&curfilename, (AjPStr) inputFiles[ifile]);
302 embDbiFlatOpenlib(curfilename, &libr);
303 ajFilenameTrimPath(&curfilename);
304 if(ajStrGetLen(curfilename) >= maxfilelen)
305 maxfilelen = ajStrGetLen(curfilename) + 1;
306
307 ajDebug("processing file '%F' ...\n", libr);
308 ajStrAssignS(&divfiles[ifile], curfilename);
309
310 if(systemsort) /* elistfile for entries, alist for fields */
311 elistfile = embDbiSortOpen(alistfile, ifile,
312 dbname, fields, nfields);
313
314 idCountFile = 0;
315 for(i=0;i<nfields;i++)
316 countField[i] = 0;
317 while((entry=dbiflat_NextFlatEntry(libr, ifile, idformat,
318 systemsort, fields, maxFieldLen,
319 &maxidlen, countField,
320 elistfile, alistfile)))
321 {
322 idCountFile++;
323
324 if(!systemsort) /* save the entry data in lists */
325 {
326 embDbiMemEntry(idlist, fieldList, nfields, entry, ifile);
327 entry = NULL;
328 }
329 }
330 idCount += idCountFile;
331 if(systemsort)
332 {
333 embDbiSortClose(&elistfile, alistfile, nfields);
334 AJFREE(entry);
335 }
336 else
337 {
338 embDbiEntryDel(&dbiflatGEntry);
339 }
340 embDbiLogFile(logfile, curfilename, idCountFile, fields,
341 countField, nfields);
342 }
343
344
345 embDbiWriteDivision(indexdir, dbname, release, date,
346 maxfilelen, nfiles, divfiles, NULL);
347
348 /* Write the entryname.idx index */
349 ajStrAssignC(&tmpfname, "entrynam.idx");
350 entFile = ajFileNewOutNamePathS(tmpfname, indexdir);
351
352 recsize = maxidlen+10;
353 filesize = 300 + (idCount*(ajint)recsize);
354 embDbiHeader(entFile, filesize, idCount, recsize, dbname, release, date);
355
356 if(systemsort)
357 idDone = embDbiSortWriteEntry(entFile, maxidlen,
358 dbname, nfiles, cleanup, sortopt);
359 else /* save entries in entryIds array */
360 {
361 idDone = embDbiMemWriteEntry(entFile, maxidlen,
362 idlist, &entryIds);
363 if(idDone != idCount)
364 ajFatal("Duplicates not allowed for in-memory processing");
365 }
366
367 embDbiHeaderSize(entFile, 300+(idDone*(ajint)recsize), idDone);
368 ajFileClose(&entFile);
369
370 /* Write the fields index files */
371 for(ifield=0; ifield < nfields; ifield++)
372 {
373 if(maxindex)
374 maxlen = maxindex;
375 {
376 if(maxFieldLen[ifield] >= 0)
377 maxlen = maxFieldLen[ifield];
378 else
379 maxlen = - maxFieldLen[ifield];
380 }
381
382 if(systemsort)
383 fieldTot[ifield] = embDbiSortWriteFields(dbname, release,
384 date, indexdir,
385 fields[ifield], maxlen,
386 nfiles, idCount,
387 cleanup, sortopt);
388 else
389 fieldTot[ifield] = embDbiMemWriteFields(dbname, release,
390 date, indexdir,
391 fields[ifield], maxlen,
392 fieldList[ifield],
393 entryIds);
394 }
395
396 embDbiLogFinal(logfile,maxindex, maxFieldLen, fields, fieldTot,
397 nfields, nfiles, idDone, idCount);
398
399 if(systemsort)
400 {
401 embDbiRmEntryFile(dbname, cleanup);
402 }
403
404 ajStrDel(&dbname);
405 ajStrDel(&release);
406 ajStrDel(&datestr);
407 ajStrDel(&sortopt);
408 ajStrDel(&filename);
409 ajStrDel(&exclude);
410 ajStrDel(&directory);
411 ajStrDel(&indexdir);
412 ajFileClose(&libr);
413 ajFileClose(&logfile);
414 ajStrDel(&idformat);
415 ajStrDelarray(&fields);
416
417 ajStrDel(&tmpfname);
418 ajFileClose(&elistfile);
419
420
421 for(i=0;i<nfields;i++)
422 {
423 if(systemsort)
424 {
425 ajFileClose(&alistfile[i]);
426 }
427 else
428 {
429 ajListMap(fieldList[i], &embDbiFieldDelMap, NULL);
430 ajListFree(&fieldList[i]);
431 }
432 }
433
434 AJFREE(alistfile);
435 AJFREE(fieldList);
436 AJFREE(maxFieldLen);
437 AJFREE(countField);
438 AJFREE(fieldTot);
439
440 for(i=0;i<nfiles;i++)
441 {
442 ajStrDel(&divfiles[i]);
443 }
444 AJFREE(divfiles);
445 AJFREE(inputFiles);
446
447 ajRegFree(&dbiflatGRegEmblType);
448 ajRegFree(&dbiflatGRegEmblId);
449 ajRegFree(&dbiflatGRegEmblAcc);
450 ajRegFree(&dbiflatGRegEmblWrd);
451 ajRegFree(&dbiflatGRegEmblVer);
452 ajRegFree(&dbiflatGRegEmblPhr);
453 ajRegFree(&dbiflatGRegEmblTax);
454 ajRegFree(&dbiflatGRegEmblEnd);
455
456 ajRegFree(&dbiflatGRegGbType);
457 ajRegFree(&dbiflatGRegGbMore);
458 ajRegFree(&dbiflatGRegGbWrd);
459 ajRegFree(&dbiflatGRegGbPhr);
460 ajRegFree(&dbiflatGRegGbTax);
461 ajRegFree(&dbiflatGRegGbVer);
462 ajRegFree(&dbiflatGRegGbEnd);
463
464 ajRegFree(&dbiflatGRegRefseqTyp);
465 ajRegFree(&dbiflatGRegRefseqMore);
466 ajRegFree(&dbiflatGRegRefseqId);
467 ajRegFree(&dbiflatGRegRefseqWrd);
468 ajRegFree(&dbiflatGRegRefseqTax);
469 ajRegFree(&dbiflatGRegRefseqVer);
470 ajRegFree(&dbiflatGRegRefseqEnd);
471
472 embDbiEntryDel(&dbiflatGEntry);
473
474 ajStrDel(&dbiflatGRline);
475 ajStrDel(&dbiflatGTmpFd);
476 ajStrDel(&dbiflatGTmpLine);
477 ajStrDel(&dbiflatGTmpStr);
478 ajStrDel(&dbiflatGTypStr);
479 ajStrDel(&dbiflatGTmpId);
480
481 if(dbiflatGFdl)
482 {
483 for(i=0; i < nfields; i++)
484 ajListFree(&dbiflatGFdl[i]);
485 AJFREE(dbiflatGFdl);
486 }
487
488 ajListMap(idlist, &embDbiEntryDelMap, NULL);
489 ajListFree(&idlist);
490 ajListstrFreeData(&listInputFiles);
491 AJFREE(entryIds);
492 ajStrDel(&curfilename);
493
494 embExit();
495
496 return 0;
497 }
498
499
500
501
502 /* @funcstatic dbiflat_NextFlatEntry ******************************************
503 **
504 ** Returns next database entry as an EmbPEntry object
505 **
506 ** @param [u] libr [AjPFile] Database file
507 ** @param [r] ifile [ajuint] File number.
508 ** @param [r] idformat [const AjPStr] Format to be used
509 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
510 ** @param [u] fields [AjPStr*] Fields to be indexed
511 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
512 ** @param [w] maxidlen [ajuint*] Maximum entry ID length
513 ** @param [w] countfield [ajuint*] Number of tokens for each field
514 ** @param [u] elistfile [AjPFile] entry file
515 ** @param [u] alistfile [AjPFile*] field data files array
516 ** @return [EmbPEntry] Entry data object.
517 ** @@
518 ******************************************************************************/
519
dbiflat_NextFlatEntry(AjPFile libr,ajuint ifile,const AjPStr idformat,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * maxidlen,ajuint * countfield,AjPFile elistfile,AjPFile * alistfile)520 static EmbPEntry dbiflat_NextFlatEntry(AjPFile libr, ajuint ifile,
521 const AjPStr idformat,
522 AjBool systemsort,
523 AjPStr* fields, ajint* maxFieldLen,
524 ajuint* maxidlen, ajuint* countfield,
525 AjPFile elistfile, AjPFile* alistfile)
526 {
527 ajint ir;
528 ajint is = 0;
529 char* token;
530 ajint i;
531 static ajint called = 0;
532 static ajint iparser = -1;
533 static ajint nfields;
534 ajint ifield;
535
536 if(!called)
537 {
538 for(i=0; parser[i].Name; i++)
539 if(ajStrMatchC(idformat, parser[i].Name))
540 {
541 iparser = i;
542 break;
543 }
544
545 if(iparser < 0)
546 ajFatal("idformat '%S' unknown", idformat);
547 }
548
549 if(!dbiflatGFdl)
550 {
551 nfields = 0;
552 while(fields[nfields])
553 nfields++;
554 if(nfields)
555 AJCNEW(dbiflatGFdl, nfields);
556 for(i=0; i < nfields; i++)
557 dbiflatGFdl[i] = ajListNew();
558 }
559
560 if(!dbiflatGEntry || !systemsort)
561 dbiflatGEntry = embDbiEntryNew(nfields);
562
563 if(!(*parser[iparser].Parser)(libr, alistfile, systemsort, fields,
564 maxFieldLen, countfield, &ir,
565 &dbiflatGTmpId, dbiflatGFdl))
566 return NULL;
567
568 /* dbiflatGTmpId to ret->entry */
569 if(ajStrGetLen(dbiflatGTmpId) > *maxidlen)
570 *maxidlen = ajStrGetLen(dbiflatGTmpId);
571
572 if(systemsort)
573 ajFmtPrintF(elistfile, "%S %d %d %d\n",
574 dbiflatGTmpId, ir, is, ifile+1);
575 else
576 {
577 dbiflatGEntry->entry = ajCharNewS(dbiflatGTmpId);
578 dbiflatGEntry->rpos = ir;
579 dbiflatGEntry->spos = is;
580 dbiflatGEntry->filenum = ifile+1;
581
582 /* field tokens as list, then move to dbiflatGEntry->field */
583 for(ifield=0; ifield < nfields; ifield++)
584 {
585 dbiflatGEntry->nfield[ifield] =
586 (ajuint) ajListGetLength(dbiflatGFdl[ifield]);
587
588 if(dbiflatGEntry->nfield[ifield])
589 {
590 AJCNEW(dbiflatGEntry->field[ifield],
591 dbiflatGEntry->nfield[ifield]);
592
593 i = 0;
594 while(ajListPop(dbiflatGFdl[ifield],(void**) &token))
595 dbiflatGEntry->field[ifield][i++] = token;
596 }
597 else
598 dbiflatGEntry->field[ifield] = NULL;
599 }
600 }
601
602 return dbiflatGEntry;
603 }
604
605
606
607
608 /* @funcstatic dbiflat_ParseSwiss *********************************************
609 **
610 ** Parse the ID, accession from a SwissProt or UniProtKB entry.
611 **
612 ** Reads to the end of the entry and then returns.
613 **
614 ** @param [u] libr [AjPFile] Input database file
615 ** @param [u] alistfile [AjPFile*] field data files array
616 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
617 ** @param [w] fields [AjPStr*] Fields required
618 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
619 ** @param [w] countfield [ajuint*] Number of tokens for each field
620 ** @param [w] dpos [ajint*] Byte offset
621 ** @param [w] myid [AjPStr*] ID
622 ** @param [w] myfdl [AjPList*] Lists of field values
623 ** @return [AjBool] ajTrue on success.
624 ** @@
625 ******************************************************************************/
626
dbiflat_ParseSwiss(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)627 static AjBool dbiflat_ParseSwiss(AjPFile libr, AjPFile* alistfile,
628 AjBool systemsort, AjPStr* fields,
629 ajint* maxFieldLen, ajuint* countfield,
630 ajint* dpos, AjPStr* myid,
631 AjPList* myfdl)
632 {
633 AjPStr tmpacnum = NULL;
634 char* fd;
635 ajint lineType;
636 static ajint numFields;
637 static ajint accfield = -1;
638 static ajint desfield = -1;
639 static ajint keyfield = -1;
640 static ajint taxfield = -1;
641 static ajint svnfield = -1;
642 static AjBool reset = AJTRUE;
643 AjBool svndone = ajFalse;
644 AjBool done = ajFalse;
645 ajint i;
646 ajint lo;
647 ajint hi;
648 ajint fieldwidth;
649 AjPStr tmpac = NULL;
650 AjPStr format = NULL;
651 AjPStr prefix = NULL;
652 const char* p;
653 const char* q;
654 const char* swissprefix[] = {
655 "RecName: ", "AltName: ", "SubName: ",
656 "Includes:", "Contains:", "Flags: ",
657 "Full=", "Short=", "EC=",
658 "Allergen=", "Biotech=", "CD_antigen=", "INN=",
659 NULL
660 };
661 ajuint j;
662
663 if(!fields)
664 {
665 reset = ajTrue;
666 accfield = svnfield = desfield = keyfield = taxfield = -1;
667 return ajFalse;
668 }
669
670 if(reset)
671 {
672 numFields = 0;
673 while(fields[numFields])
674 {
675 countfield[numFields]=0;
676 if(ajStrMatchCaseC(fields[numFields], "acc"))
677 accfield=numFields;
678 else if(ajStrMatchCaseC(fields[numFields], "sv"))
679 svnfield=numFields;
680 else if(ajStrMatchCaseC(fields[numFields], "des"))
681 desfield=numFields;
682 else if(ajStrMatchCaseC(fields[numFields], "key"))
683 keyfield=numFields;
684 else if(ajStrMatchCaseC(fields[numFields], "org"))
685 taxfield=numFields;
686 else
687 ajWarn("EMBL parsing unknown field '%S' ignored",
688 fields[numFields]);
689 numFields++;
690 }
691
692 reset = ajFalse;
693 }
694
695 if(!dbiflatGRegEmblType)
696 dbiflatGRegEmblType = ajRegCompC("^([A-Z][A-Z]) +");
697
698 if(!dbiflatGRegEmblAcc)
699 dbiflatGRegEmblAcc = ajRegCompC("([A-Za-z0-9-]+)");
700
701 if(!dbiflatGRegEmblWrd)
702 dbiflatGRegEmblWrd = ajRegCompC("([A-Za-z0-9_]+)");
703
704 if(!dbiflatGRegEmblVer)
705 dbiflatGRegEmblVer = ajRegCompC("([A-Za-z0-9_.]+)");
706
707 if(!dbiflatGRegEmblPhr)
708 dbiflatGRegEmblPhr = ajRegCompC(" *([^;.\n\r]+)");
709
710 if(!dbiflatGRegEmblTax)
711 dbiflatGRegEmblTax = ajRegCompC(" *([^;.\n\r()]+)");
712
713 if(!dbiflatGRegEmblId)
714 dbiflatGRegEmblId = ajRegCompC("^ID ([^\\s;]+)(;\\s+SV\\s+(\\d+))?");
715
716 if(!dbiflatGRegEmblEnd)
717 dbiflatGRegEmblEnd = ajRegCompC("^//");
718
719 *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */
720
721 while(ajReadline(libr, &dbiflatGRline))
722 {
723 if(ajRegExec(dbiflatGRegEmblEnd, dbiflatGRline))
724 {
725 done = ajTrue;
726 break;
727 }
728
729 if(ajRegExec(dbiflatGRegEmblType, dbiflatGRline))
730 {
731 ajRegSubI(dbiflatGRegEmblType, 1, &dbiflatGTypStr);
732 if(ajStrMatchC(dbiflatGTypStr, "ID"))
733 lineType = FLATTYPE_ID;
734 else if(ajStrMatchC(dbiflatGTypStr, "SV") ||
735 ajStrMatchC(dbiflatGTypStr, "IV")) /* emblcds database */
736 lineType = FLATTYPE_VER;
737 else if(ajStrMatchC(dbiflatGTypStr, "AC") ||
738 ajStrMatchC(dbiflatGTypStr, "PA")) /* emblcds database */
739 lineType = FLATTYPE_ACC;
740 else if(ajStrMatchC(dbiflatGTypStr, "DE"))
741 lineType = FLATTYPE_DES;
742 else if(ajStrMatchC(dbiflatGTypStr, "KW"))
743 lineType = FLATTYPE_KEY;
744 else if(ajStrMatchC(dbiflatGTypStr, "OS"))
745 lineType = FLATTYPE_TAX;
746 else if(ajStrMatchC(dbiflatGTypStr, "OC"))
747 lineType = FLATTYPE_TAX;
748 else
749 lineType=FLATTYPE_OTHER;
750
751 if(lineType != FLATTYPE_OTHER)
752 ajRegPost(dbiflatGRegEmblType, &dbiflatGTmpLine);
753 }
754 else
755 lineType = FLATTYPE_OTHER;
756
757 if(lineType == FLATTYPE_ID)
758 {
759 ajRegExec(dbiflatGRegEmblId, dbiflatGRline);
760 ajRegSubI(dbiflatGRegEmblId, 1, myid);
761 ajStrFmtUpper(myid);
762 ajDebug("++id '%S'\n", *myid);
763 ajRegSubI(dbiflatGRegEmblId, 3, &dbiflatGTmpFd);
764 if(svnfield >= 0 && ajStrGetLen(dbiflatGTmpFd))
765 {
766 ajStrFmtUpper(&dbiflatGTmpFd);
767 ajStrInsertK(&dbiflatGTmpFd, 0, '.');
768 ajStrInsertS(&dbiflatGTmpFd, 0, *myid);
769 /*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
770 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
771
772 countfield[svnfield]++;
773 if(systemsort)
774 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
775 *myid, dbiflatGTmpFd);
776 else
777 {
778 fd = ajCharNewS(dbiflatGTmpFd);
779 ajListPushAppend(myfdl[svnfield], fd);
780 }
781 svndone = ajTrue;
782 }
783 continue;
784 }
785
786 if(lineType == FLATTYPE_ACC && accfield >= 0)
787 {
788 while(ajRegExec(dbiflatGRegEmblAcc, dbiflatGTmpLine))
789 {
790 ajRegSubI(dbiflatGRegEmblAcc, 1, &dbiflatGTmpFd);
791 ajStrFmtUpper(&dbiflatGTmpFd);
792 /*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
793
794 if(!tmpacnum)
795 ajStrAssignS(&tmpacnum, dbiflatGTmpFd);
796
797 if((p=strchr(MAJSTRGETPTR(dbiflatGTmpFd),(int)'-')))
798 {
799 q = p;
800 while(isdigit((int)*(--q)));
801 ++q;
802 ajStrAssignSubC(&dbiflatGTmpStr,q,0,(ajint)(p-q-1));
803 ajStrToInt(dbiflatGTmpStr,&lo);
804 fieldwidth = (ajint) (p-q);
805 ajFmtPrintS(&format,"%%S%%0%dd",fieldwidth);
806
807 ++p;
808 q = p;
809 while(!isdigit((int)*q))
810 ++q;
811 sscanf(q,"%d",&hi);
812 ajStrAssignSubC(&prefix,p,0,(ajint)(q-p-1));
813
814 if(systemsort)
815 {
816 for(i=lo;i<=hi;++i)
817 {
818 ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
819 embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
820 countfield[accfield]++;
821 ajFmtPrintF(alistfile[accfield],
822 "%S %S\n", *myid, tmpac);
823 }
824 ajStrDel(&tmpac);
825 }
826 else
827 {
828 for(i=lo;i<=hi;++i)
829 {
830 ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
831 embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
832 countfield[accfield]++;
833 fd = ajCharNewS(tmpac);
834 ajListPushAppend(myfdl[accfield], fd);
835 }
836 ajStrDel(&tmpac);
837 }
838 ajStrDel(&format);
839 ajStrDel(&prefix);
840 }
841 else {
842 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
843
844 countfield[accfield]++;
845 if(systemsort)
846 ajFmtPrintF(alistfile[accfield],
847 "%S %S\n", *myid, dbiflatGTmpFd);
848 else
849 {
850 fd = ajCharNewS(dbiflatGTmpFd);
851 ajListPushAppend(myfdl[accfield], fd);
852 }
853 }
854 ajRegPost(dbiflatGRegEmblAcc, &dbiflatGTmpStr);
855 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
856 }
857 continue;
858 }
859 else if(lineType == FLATTYPE_DES && desfield >= 0)
860 {
861 ajStrTrimWhiteStart(&dbiflatGTmpLine);
862 for(j=0; swissprefix[j]; j++)
863 {
864 if(ajStrPrefixC(dbiflatGTmpLine, swissprefix[j]))
865 ajStrCutStart(&dbiflatGTmpLine, strlen(swissprefix[j]));
866 }
867 while(ajRegExec(dbiflatGRegEmblWrd, dbiflatGTmpLine))
868 {
869 ajRegSubI(dbiflatGRegEmblWrd, 1, &dbiflatGTmpFd);
870 ajStrFmtUpper(&dbiflatGTmpFd);
871 /*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
872 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
873
874 countfield[desfield]++;
875 if(systemsort)
876 ajFmtPrintF(alistfile[desfield], "%S %S\n",
877 *myid, dbiflatGTmpFd);
878 else
879 {
880 fd = ajCharNewS(dbiflatGTmpFd);
881 ajListPushAppend(myfdl[desfield], fd);
882 }
883 ajRegPost(dbiflatGRegEmblWrd, &dbiflatGTmpStr);
884 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
885 }
886 continue;
887 }
888 else if(lineType == FLATTYPE_VER && svnfield >= 0)
889 {
890 while(ajRegExec(dbiflatGRegEmblVer, dbiflatGTmpLine))
891 {
892 ajRegSubI(dbiflatGRegEmblVer, 1, &dbiflatGTmpFd);
893 ajStrFmtUpper(&dbiflatGTmpFd);
894 /*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
895 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
896
897 countfield[svnfield]++;
898 if(systemsort)
899 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
900 *myid, dbiflatGTmpFd);
901 else
902 {
903 fd = ajCharNewS(dbiflatGTmpFd);
904 ajListPushAppend(myfdl[svnfield], fd);
905 }
906 ajRegPost(dbiflatGRegEmblVer, &dbiflatGTmpStr);
907 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
908 }
909 svndone = ajTrue;
910 continue;
911 }
912 else if(lineType == FLATTYPE_KEY && keyfield >= 0)
913 {
914 while(ajRegExec(dbiflatGRegEmblPhr, dbiflatGTmpLine))
915 {
916 ajRegSubI(dbiflatGRegEmblPhr, 1, &dbiflatGTmpFd);
917 ajRegPost(dbiflatGRegEmblPhr, &dbiflatGTmpStr);
918 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
919 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
920 if(!ajStrGetLen(dbiflatGTmpFd))
921 continue;
922 ajStrFmtUpper(&dbiflatGTmpFd);
923 /*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
924 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
925
926 countfield[keyfield]++;
927 if(systemsort)
928 ajFmtPrintF(alistfile[keyfield], "%S %S\n",
929 *myid, dbiflatGTmpFd);
930 else
931 {
932 fd = ajCharNewS(dbiflatGTmpFd);
933 ajListPushAppend(myfdl[keyfield], fd);
934 }
935 }
936 continue;
937 }
938 else if(lineType == FLATTYPE_TAX && taxfield >= 0)
939 {
940 while(ajRegExec(dbiflatGRegEmblTax, dbiflatGTmpLine))
941 {
942 ajRegSubI(dbiflatGRegEmblTax, 1, &dbiflatGTmpFd);
943 ajRegPost(dbiflatGRegEmblTax, &dbiflatGTmpStr);
944 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
945 ajStrFmtUpper(&dbiflatGTmpFd);
946 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
947 if(!ajStrGetLen(dbiflatGTmpFd))
948 continue;
949 /*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
950 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
951
952 countfield[taxfield]++;
953 if(systemsort)
954 ajFmtPrintF(alistfile[taxfield], "%S %S\n",
955 *myid, dbiflatGTmpFd);
956 else
957 {
958 fd = ajCharNewS(dbiflatGTmpFd);
959 ajListPushAppend(myfdl[taxfield], fd);
960 }
961 }
962 continue;
963 }
964 }
965
966 if(!done)
967 return ajFalse;
968
969 if(svnfield >= 0 && !svndone && tmpacnum)
970 {
971 ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
972 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
973
974 countfield[svnfield]++;
975 if(systemsort)
976 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
977 else
978 {
979 fd = ajCharNewS(dbiflatGTmpFd);
980 ajListPushAppend(myfdl[svnfield], fd);
981 }
982 }
983
984 ajStrDel(&tmpacnum);
985
986 return ajTrue;
987 }
988
989
990
991
992 /* @funcstatic dbiflat_ParseEmbl **********************************************
993 **
994 ** Parse the ID, accession from an EMBL entry.
995 **
996 ** Reads to the end of the entry and then returns.
997 **
998 ** @param [u] libr [AjPFile] Input database file
999 ** @param [u] alistfile [AjPFile*] field data files array
1000 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1001 ** @param [w] fields [AjPStr*] Fields required
1002 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1003 ** @param [w] countfield [ajuint*] Number of tokens for each field
1004 ** @param [w] dpos [ajint*] Byte offset
1005 ** @param [w] myid [AjPStr*] ID
1006 ** @param [w] myfdl [AjPList*] Lists of field values
1007 ** @return [AjBool] ajTrue on success.
1008 ** @@
1009 ******************************************************************************/
1010
dbiflat_ParseEmbl(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1011 static AjBool dbiflat_ParseEmbl(AjPFile libr, AjPFile* alistfile,
1012 AjBool systemsort, AjPStr* fields,
1013 ajint* maxFieldLen, ajuint* countfield,
1014 ajint* dpos, AjPStr* myid,
1015 AjPList* myfdl)
1016 {
1017 AjPStr tmpacnum = NULL;
1018 char* fd;
1019 ajint lineType;
1020 static ajint numFields;
1021 static ajint accfield = -1;
1022 static ajint desfield = -1;
1023 static ajint keyfield = -1;
1024 static ajint taxfield = -1;
1025 static ajint svnfield = -1;
1026 static AjBool reset = AJTRUE;
1027 AjBool svndone = ajFalse;
1028 AjBool done = ajFalse;
1029 ajint i = 0;
1030 ajint lo;
1031 ajint hi;
1032 ajint fieldwidth;
1033 AjPStr tmpac = NULL;
1034 AjPStr format = NULL;
1035 AjPStr prefix = NULL;
1036 const char* p;
1037 const char* q;
1038
1039 if(!fields)
1040 {
1041 reset = ajTrue;
1042 accfield = svnfield = desfield = keyfield = taxfield = -1;
1043 return ajFalse;
1044 }
1045
1046 if(reset)
1047 {
1048 numFields = 0;
1049 while(fields[numFields])
1050 {
1051 countfield[numFields]=0;
1052 if(ajStrMatchCaseC(fields[numFields], "acc"))
1053 accfield=numFields;
1054 else if(ajStrMatchCaseC(fields[numFields], "sv"))
1055 svnfield=numFields;
1056 else if(ajStrMatchCaseC(fields[numFields], "des"))
1057 desfield=numFields;
1058 else if(ajStrMatchCaseC(fields[numFields], "key"))
1059 keyfield=numFields;
1060 else if(ajStrMatchCaseC(fields[numFields], "org"))
1061 taxfield=numFields;
1062 else
1063 ajWarn("EMBL parsing unknown field '%S' ignored",
1064 fields[numFields]);
1065 numFields++;
1066 }
1067
1068 reset = ajFalse;
1069 }
1070
1071 if(!dbiflatGRegEmblType)
1072 dbiflatGRegEmblType = ajRegCompC("^([A-Z][A-Z]) +");
1073
1074 if(!dbiflatGRegEmblAcc)
1075 dbiflatGRegEmblAcc = ajRegCompC("([A-Za-z0-9-]+)");
1076
1077 if(!dbiflatGRegEmblWrd)
1078 dbiflatGRegEmblWrd = ajRegCompC("([A-Za-z0-9_]+)");
1079
1080 if(!dbiflatGRegEmblVer)
1081 dbiflatGRegEmblVer = ajRegCompC("([A-Za-z0-9_.]+)");
1082
1083 if(!dbiflatGRegEmblPhr)
1084 dbiflatGRegEmblPhr = ajRegCompC(" *([^;.\n\r]+)");
1085
1086 if(!dbiflatGRegEmblTax)
1087 dbiflatGRegEmblTax = ajRegCompC(" *([^;.\n\r()]+)");
1088
1089 if(!dbiflatGRegEmblId)
1090 dbiflatGRegEmblId = ajRegCompC("^ID ([^\\s;]+)(;\\s+SV\\s+(\\d+))?");
1091
1092 if(!dbiflatGRegEmblEnd)
1093 dbiflatGRegEmblEnd = ajRegCompC("^//");
1094
1095 *dpos = (ajint) ajFileResetPos(libr); /* Lossy cast */
1096
1097 while(ajReadline(libr, &dbiflatGRline))
1098 {
1099 if(ajRegExec(dbiflatGRegEmblEnd, dbiflatGRline))
1100 {
1101 done = ajTrue;
1102 break;
1103 }
1104
1105 if(ajRegExec(dbiflatGRegEmblType, dbiflatGRline))
1106 {
1107 ajRegSubI(dbiflatGRegEmblType, 1, &dbiflatGTypStr);
1108 if(ajStrMatchC(dbiflatGTypStr, "ID"))
1109 lineType = FLATTYPE_ID;
1110 else if(ajStrMatchC(dbiflatGTypStr, "SV") ||
1111 ajStrMatchC(dbiflatGTypStr, "IV")) /* emblcds database */
1112 lineType = FLATTYPE_VER;
1113 else if(ajStrMatchC(dbiflatGTypStr, "AC") ||
1114 ajStrMatchC(dbiflatGTypStr, "PA")) /* emblcds database */
1115 lineType = FLATTYPE_ACC;
1116 else if(ajStrMatchC(dbiflatGTypStr, "DE"))
1117 lineType = FLATTYPE_DES;
1118 else if(ajStrMatchC(dbiflatGTypStr, "KW"))
1119 lineType = FLATTYPE_KEY;
1120 else if(ajStrMatchC(dbiflatGTypStr, "OS"))
1121 lineType = FLATTYPE_TAX;
1122 else if(ajStrMatchC(dbiflatGTypStr, "OC"))
1123 lineType = FLATTYPE_TAX;
1124 else
1125 lineType=FLATTYPE_OTHER;
1126
1127 if(lineType != FLATTYPE_OTHER)
1128 ajRegPost(dbiflatGRegEmblType, &dbiflatGTmpLine);
1129 }
1130 else
1131 lineType = FLATTYPE_OTHER;
1132
1133 if(lineType == FLATTYPE_ID)
1134 {
1135 ajRegExec(dbiflatGRegEmblId, dbiflatGRline);
1136 ajRegSubI(dbiflatGRegEmblId, 1, myid);
1137 ajStrFmtUpper(myid);
1138 ajDebug("++id '%S'\n", *myid);
1139 ajRegSubI(dbiflatGRegEmblId, 3, &dbiflatGTmpFd);
1140 if(svnfield >= 0 && ajStrGetLen(dbiflatGTmpFd))
1141 {
1142 ajStrFmtUpper(&dbiflatGTmpFd);
1143 ajStrInsertK(&dbiflatGTmpFd, 0, '.');
1144 ajStrInsertS(&dbiflatGTmpFd, 0, *myid);
1145 /*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
1146 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1147
1148 countfield[svnfield]++;
1149 if(systemsort)
1150 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1151 *myid, dbiflatGTmpFd);
1152 else
1153 {
1154 fd = ajCharNewS(dbiflatGTmpFd);
1155 ajListPushAppend(myfdl[svnfield], fd);
1156 }
1157 svndone = ajTrue;
1158 }
1159 continue;
1160 }
1161
1162 if(lineType == FLATTYPE_ACC && accfield >= 0)
1163 {
1164 while(ajRegExec(dbiflatGRegEmblAcc, dbiflatGTmpLine))
1165 {
1166 ajRegSubI(dbiflatGRegEmblAcc, 1, &dbiflatGTmpFd);
1167 ajStrFmtUpper(&dbiflatGTmpFd);
1168 /*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1169
1170 if(!tmpacnum)
1171 ajStrAssignS(&tmpacnum, dbiflatGTmpFd);
1172
1173 if((p=strchr(MAJSTRGETPTR(dbiflatGTmpFd),(int)'-')))
1174 {
1175 q = p;
1176 while(isdigit((int)*(--q)));
1177 ++q;
1178 ajStrAssignSubC(&dbiflatGTmpStr,q,0,(ajint)(p-q-1));
1179 ajStrToInt(dbiflatGTmpStr,&lo);
1180 fieldwidth = (ajint) (p-q);
1181 ajFmtPrintS(&format,"%%S%%0%dd",fieldwidth);
1182
1183 ++p;
1184 q = p;
1185 while(!isdigit((int)*q))
1186 ++q;
1187 sscanf(q,"%d",&hi);
1188 ajStrAssignSubC(&prefix,p,0,(ajint)(q-p-1));
1189
1190 if(systemsort)
1191 {
1192 for(i=lo;i<=hi;++i)
1193 {
1194 ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
1195 embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
1196 countfield[accfield]++;
1197 ajFmtPrintF(alistfile[accfield],
1198 "%S %S\n", *myid, tmpac);
1199 }
1200 ajStrDel(&tmpac);
1201 }
1202 else
1203 {
1204 for(i=lo;i<=hi;++i)
1205 {
1206 ajFmtPrintS(&tmpac,MAJSTRGETPTR(format),prefix,i);
1207 embDbiMaxlen(&tmpac, &maxFieldLen[accfield]);
1208 countfield[accfield]++;
1209 fd = ajCharNewS(tmpac);
1210 ajListPushAppend(myfdl[accfield], fd);
1211 }
1212 ajStrDel(&tmpac);
1213 }
1214 ajStrDel(&format);
1215 ajStrDel(&prefix);
1216 }
1217 else {
1218 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1219
1220 countfield[accfield]++;
1221 if(systemsort)
1222 ajFmtPrintF(alistfile[accfield],
1223 "%S %S\n", *myid, dbiflatGTmpFd);
1224 else
1225 {
1226 fd = ajCharNewS(dbiflatGTmpFd);
1227 ajListPushAppend(myfdl[accfield], fd);
1228 }
1229 }
1230 ajRegPost(dbiflatGRegEmblAcc, &dbiflatGTmpStr);
1231 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1232 }
1233 continue;
1234 }
1235 else if(lineType == FLATTYPE_DES && desfield >= 0)
1236 {
1237 while(ajRegExec(dbiflatGRegEmblWrd, dbiflatGTmpLine))
1238 {
1239 ajRegSubI(dbiflatGRegEmblWrd, 1, &dbiflatGTmpFd);
1240 ajStrFmtUpper(&dbiflatGTmpFd);
1241 /*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1242 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1243
1244 countfield[desfield]++;
1245 if(systemsort)
1246 ajFmtPrintF(alistfile[desfield], "%S %S\n",
1247 *myid, dbiflatGTmpFd);
1248 else
1249 {
1250 fd = ajCharNewS(dbiflatGTmpFd);
1251 ajListPushAppend(myfdl[desfield], fd);
1252 }
1253 ajRegPost(dbiflatGRegEmblWrd, &dbiflatGTmpStr);
1254 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1255 }
1256 continue;
1257 }
1258 else if(lineType == FLATTYPE_VER && svnfield >= 0)
1259 {
1260 while(ajRegExec(dbiflatGRegEmblVer, dbiflatGTmpLine))
1261 {
1262 ajRegSubI(dbiflatGRegEmblVer, 1, &dbiflatGTmpFd);
1263 ajStrFmtUpper(&dbiflatGTmpFd);
1264 /*ajDebug("++sv '%S'\n", dbiflatGTmpFd);*/
1265 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1266
1267 countfield[svnfield]++;
1268 if(systemsort)
1269 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1270 *myid, dbiflatGTmpFd);
1271 else
1272 {
1273 fd = ajCharNewS(dbiflatGTmpFd);
1274 ajListPushAppend(myfdl[svnfield], fd);
1275 }
1276 ajRegPost(dbiflatGRegEmblVer, &dbiflatGTmpStr);
1277 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1278 }
1279 svndone = ajTrue;
1280 continue;
1281 }
1282 else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1283 {
1284 while(ajRegExec(dbiflatGRegEmblPhr, dbiflatGTmpLine))
1285 {
1286 ajRegSubI(dbiflatGRegEmblPhr, 1, &dbiflatGTmpFd);
1287 ajRegPost(dbiflatGRegEmblPhr, &dbiflatGTmpStr);
1288 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1289 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1290 if(!ajStrGetLen(dbiflatGTmpFd))
1291 continue;
1292 ajStrFmtUpper(&dbiflatGTmpFd);
1293 /*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1294 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1295
1296 countfield[keyfield]++;
1297 if(systemsort)
1298 ajFmtPrintF(alistfile[keyfield], "%S %S\n",
1299 *myid, dbiflatGTmpFd);
1300 else
1301 {
1302 fd = ajCharNewS(dbiflatGTmpFd);
1303 ajListPushAppend(myfdl[keyfield], fd);
1304 }
1305 }
1306 continue;
1307 }
1308 else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1309 {
1310 while(ajRegExec(dbiflatGRegEmblTax, dbiflatGTmpLine))
1311 {
1312 ajRegSubI(dbiflatGRegEmblTax, 1, &dbiflatGTmpFd);
1313 ajRegPost(dbiflatGRegEmblTax, &dbiflatGTmpStr);
1314 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1315 ajStrFmtUpper(&dbiflatGTmpFd);
1316 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1317 if(!ajStrGetLen(dbiflatGTmpFd))
1318 continue;
1319 /*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1320 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1321
1322 countfield[taxfield]++;
1323 if(systemsort)
1324 ajFmtPrintF(alistfile[taxfield], "%S %S\n",
1325 *myid, dbiflatGTmpFd);
1326 else
1327 {
1328 fd = ajCharNewS(dbiflatGTmpFd);
1329 ajListPushAppend(myfdl[taxfield], fd);
1330 }
1331 }
1332 continue;
1333 }
1334 }
1335
1336 if(!done)
1337 return ajFalse;
1338
1339 if(svnfield >= 0 && !svndone && tmpacnum)
1340 {
1341 ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1342 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1343
1344 countfield[svnfield]++;
1345 if(systemsort)
1346 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1347 else
1348 {
1349 fd = ajCharNewS(dbiflatGTmpFd);
1350 ajListPushAppend(myfdl[svnfield], fd);
1351 }
1352 }
1353
1354 ajStrDel(&tmpacnum);
1355
1356 return ajTrue;
1357 }
1358
1359
1360
1361
1362 /* @funcstatic dbiflat_ParseGenbank *******************************************
1363 **
1364 ** Parse the ID, accession from a Genbank entry
1365 **
1366 ** @param [u] libr [AjPFile] Input database file
1367 ** @param [u] alistfile [AjPFile*] field data files array
1368 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1369 ** @param [w] fields [AjPStr*] Fields required
1370 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1371 ** @param [w] countfield [ajuint*] Number of tokens for each field
1372 ** @param [w] dpos [ajint*] Byte offset
1373 ** @param [w] myid [AjPStr*] ID
1374 ** @param [w] myfdl [AjPList*] Lists of field values
1375 ** @return [AjBool] ajTrue on success.
1376 ** @@
1377 ******************************************************************************/
1378
dbiflat_ParseGenbank(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1379 static AjBool dbiflat_ParseGenbank(AjPFile libr, AjPFile* alistfile,
1380 AjBool systemsort, AjPStr* fields,
1381 ajint* maxFieldLen, ajuint* countfield,
1382 ajint* dpos, AjPStr* myid,
1383 AjPList* myfdl)
1384 {
1385 ajint lineType = FLATTYPE_OTHER;
1386 AjPStr tmpacnum = NULL;
1387 char* fd;
1388 ajlong ipos = 0;
1389 static ajint numFields;
1390 static ajint accfield = -1;
1391 static ajint desfield = -1;
1392 static ajint keyfield = -1;
1393 static ajint taxfield = -1;
1394 static ajint svnfield = -1;
1395 static AjBool reset = AJTRUE;
1396 AjBool done = ajFalse;
1397 AjBool svndone = ajFalse;
1398
1399 if(!fields)
1400 {
1401 reset = ajTrue;
1402 accfield = svnfield = desfield = keyfield = taxfield = -1;
1403 return ajFalse;
1404 }
1405
1406 if(reset)
1407 {
1408 numFields = 0;
1409 while(fields[numFields])
1410 {
1411 countfield[numFields]=0;
1412 if(ajStrMatchCaseC(fields[numFields], "acc"))
1413 accfield=numFields;
1414 else if(ajStrMatchCaseC(fields[numFields], "sv"))
1415 svnfield=numFields;
1416 else if(ajStrMatchCaseC(fields[numFields], "des"))
1417 desfield=numFields;
1418 else if(ajStrMatchCaseC(fields[numFields], "key"))
1419 keyfield=numFields;
1420 else if(ajStrMatchCaseC(fields[numFields], "org"))
1421 taxfield=numFields;
1422 else
1423 ajWarn("GenBank parsing unknown field '%S' ignored",
1424 fields[numFields]);
1425
1426 numFields++;
1427 }
1428 reset = ajFalse;
1429 }
1430
1431 if(!dbiflatGRegGbType)
1432 dbiflatGRegGbType = ajRegCompC("^( )?([A-Z]+)");
1433
1434 if(!dbiflatGRegGbMore)
1435 dbiflatGRegGbMore = ajRegCompC("^ ");
1436
1437 if(!dbiflatGRegGbWrd)
1438 dbiflatGRegGbWrd = ajRegCompC("([A-Za-z0-9_]+)");
1439
1440 if(!dbiflatGRegGbPhr)
1441 dbiflatGRegGbPhr = ajRegCompC(" *([^;.\n\r]+)");
1442
1443 if(!dbiflatGRegGbTax)
1444 dbiflatGRegGbTax = ajRegCompC(" *([^;.\n\r()]+)");
1445
1446 if(!dbiflatGRegGbVer)
1447 dbiflatGRegGbVer = ajRegCompC("([A-Za-z0-9.]+)( +GI:([0-9]+))?");
1448
1449 if(!dbiflatGRegGbEnd)
1450 dbiflatGRegGbEnd = ajRegCompC("^//");
1451
1452 ipos = ajFileResetPos(libr);
1453
1454 while(ajReadline(libr, &dbiflatGRline))
1455 {
1456 if(ajRegExec(dbiflatGRegGbEnd, dbiflatGRline))
1457 {
1458 done = ajTrue;
1459 break;
1460 }
1461
1462 if(ajRegExec(dbiflatGRegGbType, dbiflatGRline))
1463 {
1464 ajRegSubI(dbiflatGRegGbType, 2, &dbiflatGTypStr);
1465 if(ajStrMatchC(dbiflatGTypStr, "LOCUS"))
1466 lineType = FLATTYPE_ID;
1467 else if(ajStrMatchC(dbiflatGTypStr, "VERSION"))
1468 lineType = FLATTYPE_VER;
1469 else if(ajStrMatchC(dbiflatGTypStr, "ACCESSION"))
1470 lineType = FLATTYPE_ACC;
1471 else if(ajStrMatchC(dbiflatGTypStr, "DEFINITION"))
1472 lineType = FLATTYPE_DES;
1473 else if(ajStrMatchC(dbiflatGTypStr, "KEYWORDS"))
1474 lineType = FLATTYPE_KEY;
1475 else if(ajStrMatchC(dbiflatGTypStr, "ORGANISM"))
1476 lineType = FLATTYPE_TAX;
1477 else lineType=FLATTYPE_OTHER;
1478
1479 if(lineType != FLATTYPE_OTHER)
1480 ajRegPost(dbiflatGRegGbType, &dbiflatGTmpLine);
1481 /*ajDebug("++type line %d\n", lineType);*/
1482 }
1483 else if(lineType != FLATTYPE_OTHER &&
1484 ajRegExec(dbiflatGRegGbMore, dbiflatGRline))
1485 {
1486 ajRegPost(dbiflatGRegGbMore, &dbiflatGTmpLine);
1487 /*ajDebug("++more line %d\n", lineType);*/
1488 }
1489 else
1490 lineType = FLATTYPE_OTHER;
1491
1492 if(lineType == FLATTYPE_ID)
1493 {
1494 ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine);
1495 ajRegSubI(dbiflatGRegGbWrd, 1, myid);
1496 *dpos = (ajint) ipos; /* Lossy cast */
1497 }
1498
1499 else if(lineType == FLATTYPE_ACC && accfield >= 0)
1500 {
1501 while(ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine))
1502 {
1503 ajRegSubI(dbiflatGRegGbWrd, 1, &dbiflatGTmpFd);
1504 ajStrFmtUpper(&dbiflatGTmpFd);
1505 /*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1506 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1507
1508 countfield[accfield]++;
1509 if(systemsort)
1510 ajFmtPrintF(alistfile[accfield], "%S %S\n",
1511 *myid, dbiflatGTmpFd);
1512 else
1513 {
1514 fd = ajCharNewS(dbiflatGTmpFd);
1515 ajListPushAppend(myfdl[accfield], fd);
1516 }
1517 ajRegPost(dbiflatGRegGbWrd, &dbiflatGTmpStr);
1518 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1519 }
1520 continue;
1521 }
1522
1523 else if(lineType == FLATTYPE_DES && desfield >= 0)
1524 {
1525 while(ajRegExec(dbiflatGRegGbWrd, dbiflatGTmpLine))
1526 {
1527 ajRegSubI(dbiflatGRegGbWrd, 1, &dbiflatGTmpFd);
1528 ajStrFmtUpper(&dbiflatGTmpFd);
1529 /*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1530 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1531
1532 countfield[desfield]++;
1533 if(systemsort)
1534 ajFmtPrintF(alistfile[desfield],
1535 "%S %S\n", *myid, dbiflatGTmpFd);
1536 else
1537 {
1538 fd = ajCharNewS(dbiflatGTmpFd);
1539 ajListPushAppend(myfdl[desfield], fd);
1540 }
1541 ajRegPost(dbiflatGRegGbWrd, &dbiflatGTmpStr);
1542 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1543 }
1544 continue;
1545 }
1546
1547 else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1548 {
1549 while(ajRegExec(dbiflatGRegGbPhr, dbiflatGTmpLine))
1550 {
1551 ajRegSubI(dbiflatGRegGbPhr, 1, &dbiflatGTmpFd);
1552 ajRegPost(dbiflatGRegGbPhr, &dbiflatGTmpStr);
1553 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1554 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1555 if(!ajStrGetLen(dbiflatGTmpFd))
1556 continue;
1557 ajStrFmtUpper(&dbiflatGTmpFd);
1558 /*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1559 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1560
1561 countfield[keyfield]++;
1562 if(systemsort)
1563 ajFmtPrintF(alistfile[keyfield],
1564 "%S %S\n", *myid, dbiflatGTmpFd);
1565 else
1566 {
1567 fd = ajCharNewS(dbiflatGTmpFd);
1568 ajListPushAppend(myfdl[keyfield], fd);
1569 }
1570 }
1571 continue;
1572 }
1573
1574 else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1575 {
1576 while(ajRegExec(dbiflatGRegGbTax, dbiflatGTmpLine))
1577 {
1578 ajRegSubI(dbiflatGRegGbTax, 1, &dbiflatGTmpFd);
1579 ajRegPost(dbiflatGRegGbTax, &dbiflatGTmpStr);
1580 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1581 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1582 if(!ajStrGetLen(dbiflatGTmpFd))
1583 continue;
1584 ajStrFmtUpper(&dbiflatGTmpFd);
1585 /*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1586 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1587
1588 countfield[taxfield]++;
1589 if(systemsort)
1590 ajFmtPrintF(alistfile[taxfield],
1591 "%S %S\n", *myid, dbiflatGTmpFd);
1592 else
1593 {
1594 fd = ajCharNewS(dbiflatGTmpFd);
1595 ajListPushAppend(myfdl[taxfield], fd);
1596 }
1597 }
1598 continue;
1599 }
1600
1601 else if(lineType == FLATTYPE_VER && svnfield >= 0)
1602 {
1603 if(ajRegExec(dbiflatGRegGbVer, dbiflatGTmpLine))
1604 {
1605 ajRegSubI(dbiflatGRegGbVer, 1, &dbiflatGTmpFd);
1606 ajStrFmtUpper(&dbiflatGTmpFd);
1607 /*ajDebug("++ver '%S'\n", dbiflatGTmpFd);*/
1608 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1609
1610 if(systemsort)
1611 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1612 *myid, dbiflatGTmpFd);
1613 else
1614 {
1615 fd = ajCharNewS(dbiflatGTmpFd);
1616 ajListPushAppend(myfdl[svnfield], fd);
1617 }
1618 svndone = ajTrue;
1619
1620 ajRegSubI(dbiflatGRegGbVer, 3, &dbiflatGTmpFd);
1621 if(!ajStrGetLen(dbiflatGTmpFd))
1622 continue;
1623 ajStrFmtUpper(&dbiflatGTmpFd);
1624 /*ajDebug("++ver gi: '%S'\n", dbiflatGTmpFd);*/
1625
1626 countfield[svnfield]++;
1627 if(systemsort)
1628 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1629 *myid, dbiflatGTmpFd);
1630 else
1631 {
1632 fd = ajCharNewS(dbiflatGTmpFd);
1633 ajListPushAppend(myfdl[svnfield], fd);
1634 }
1635 }
1636 continue;
1637 }
1638
1639 ipos = ajFileResetPos(libr);
1640 }
1641
1642 if(!done)
1643 return ajFalse;
1644
1645 if(svnfield >= 0 && !svndone && tmpacnum)
1646 {
1647 ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1648 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1649
1650 countfield[svnfield]++;
1651 if(systemsort)
1652 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1653 else
1654 {
1655 fd = ajCharNewS(dbiflatGTmpFd);
1656 ajListPushAppend(myfdl[svnfield], fd);
1657 }
1658 }
1659
1660 ajStrDel(&tmpacnum);
1661
1662 return ajTrue;
1663 }
1664
1665
1666
1667
1668 /* @funcstatic dbiflat_ParseRefseq ********************************************
1669 **
1670 ** Parse the ID, accession from an NCBI REFSEQ entry
1671 **
1672 ** @param [u] libr [AjPFile] Input database file
1673 ** @param [u] alistfile [AjPFile*] field data files array
1674 ** @param [r] systemsort [AjBool] If ajTrue use system sort, else internal sort
1675 ** @param [w] fields [AjPStr*] Fields required
1676 ** @param [w] maxFieldLen [ajint*] Maximum token length for each field
1677 ** @param [w] countfield [ajuint*] Number of tokens for each field
1678 ** @param [w] dpos [ajint*] Byte offset
1679 ** @param [w] myid [AjPStr*] ID
1680 ** @param [w] myfdl [AjPList*] Lists of field values
1681 ** @return [AjBool] ajTrue on success.
1682 ** @@
1683 ******************************************************************************/
1684
dbiflat_ParseRefseq(AjPFile libr,AjPFile * alistfile,AjBool systemsort,AjPStr * fields,ajint * maxFieldLen,ajuint * countfield,ajint * dpos,AjPStr * myid,AjPList * myfdl)1685 static AjBool dbiflat_ParseRefseq(AjPFile libr, AjPFile* alistfile,
1686 AjBool systemsort, AjPStr* fields,
1687 ajint* maxFieldLen, ajuint* countfield,
1688 ajint* dpos, AjPStr* myid,
1689 AjPList* myfdl)
1690 {
1691 ajint lineType = FLATTYPE_OTHER;
1692 AjPStr tmpacnum = NULL;
1693 char* fd;
1694 ajlong ipos = 0;
1695 static ajint numFields;
1696 static ajint accfield = -1;
1697 static ajint desfield = -1;
1698 static ajint keyfield = -1;
1699 static ajint taxfield = -1;
1700 static ajint svnfield = -1;
1701 static AjBool reset = AJTRUE;
1702 AjBool done = ajFalse;
1703 AjBool svndone = ajFalse;
1704
1705 if(!fields)
1706 {
1707 reset = ajTrue;
1708 accfield = svnfield = desfield = keyfield = taxfield = -1;
1709 return ajFalse;
1710 }
1711
1712 if(reset)
1713 {
1714 numFields = 0;
1715 while(fields[numFields])
1716 {
1717 countfield[numFields]=0;
1718 if(ajStrMatchCaseC(fields[numFields], "acc"))
1719 accfield=numFields;
1720 else if(ajStrMatchCaseC(fields[numFields], "sv"))
1721 svnfield=numFields;
1722 else if(ajStrMatchCaseC(fields[numFields], "des"))
1723 desfield=numFields;
1724 else if(ajStrMatchCaseC(fields[numFields], "key"))
1725 keyfield=numFields;
1726 else if(ajStrMatchCaseC(fields[numFields], "org"))
1727 taxfield=numFields;
1728 else
1729 ajWarn("GenBank parsing unknown field '%S' ignored",
1730 fields[numFields]);
1731
1732 numFields++;
1733 }
1734 reset = ajFalse;
1735 }
1736
1737 /*
1738 ** These are almost the same as GenBank, but with some exceptions noted
1739 */
1740
1741 if(!dbiflatGRegRefseqTyp)
1742 dbiflatGRegRefseqTyp = ajRegCompC("^( )?([A-Z]+)");
1743
1744 if(!dbiflatGRegRefseqMore)
1745 dbiflatGRegRefseqMore = ajRegCompC("^ ");
1746
1747 if(!dbiflatGRegRefseqWrd)
1748 dbiflatGRegRefseqWrd = ajRegCompC("([A-Za-z0-9_]+)");
1749
1750 if(!dbiflatGRegRefseqId) /* funny characters in IDs */
1751 dbiflatGRegRefseqId = ajRegCompC("([^ \t\r\n]+)");
1752
1753 if(!dbiflatGRegRefseqPhr)
1754 dbiflatGRegRefseqPhr = ajRegCompC(" *([^;.\n\r]+)");
1755
1756 if(!dbiflatGRegRefseqTax)
1757 dbiflatGRegRefseqTax = ajRegCompC(" *([^;.\n\r()]+)");
1758
1759 if(!dbiflatGRegRefseqVer) /* allow '_' in accession/version */
1760 dbiflatGRegRefseqVer = ajRegCompC("([A-Za-z0-9_.]+)( +GI:([0-9]+))?");
1761
1762 if(!dbiflatGRegRefseqEnd)
1763 dbiflatGRegRefseqEnd = ajRegCompC("^//");
1764
1765 ipos = ajFileResetPos(libr);
1766
1767 while(ajReadline(libr, &dbiflatGRline))
1768 {
1769 if(ajRegExec(dbiflatGRegRefseqEnd, dbiflatGRline))
1770 {
1771 done = ajTrue;
1772 break;
1773 }
1774
1775 if(ajRegExec(dbiflatGRegRefseqTyp, dbiflatGRline))
1776 {
1777 ajRegSubI(dbiflatGRegRefseqTyp, 2, &dbiflatGTypStr);
1778 if(ajStrMatchC(dbiflatGTypStr, "LOCUS"))
1779 lineType = FLATTYPE_ID;
1780 else if(ajStrMatchC(dbiflatGTypStr, "VERSION"))
1781 lineType = FLATTYPE_VER;
1782 else if(ajStrMatchC(dbiflatGTypStr, "ACCESSION"))
1783 lineType = FLATTYPE_ACC;
1784 else if(ajStrMatchC(dbiflatGTypStr, "DEFINITION"))
1785 lineType = FLATTYPE_DES;
1786 else if(ajStrMatchC(dbiflatGTypStr, "KEYWORDS"))
1787 lineType = FLATTYPE_KEY;
1788 else if(ajStrMatchC(dbiflatGTypStr, "ORGANISM"))
1789 lineType = FLATTYPE_TAX;
1790 else
1791 lineType=FLATTYPE_OTHER;
1792
1793 if(lineType != FLATTYPE_OTHER)
1794 ajRegPost(dbiflatGRegRefseqTyp, &dbiflatGTmpLine);
1795 /*ajDebug("++type line %d\n", lineType);*/
1796 }
1797 else if(lineType != FLATTYPE_OTHER &&
1798 ajRegExec(dbiflatGRegRefseqMore, dbiflatGRline))
1799 {
1800 ajRegPost(dbiflatGRegRefseqMore, &dbiflatGTmpLine);
1801 /*ajDebug("++more line %d\n", lineType);*/
1802 }
1803 else
1804 lineType = FLATTYPE_OTHER;
1805
1806 if(lineType == FLATTYPE_ID) /* use REFSEQ-specific idexp */
1807 {
1808 ajRegExec(dbiflatGRegRefseqId, dbiflatGTmpLine);
1809 ajRegSubI(dbiflatGRegRefseqId, 1, myid);
1810 ajStrFmtUpper(myid);
1811 *dpos = (ajint) ipos; /* Lossy cast */
1812 }
1813
1814 else if(lineType == FLATTYPE_ACC && accfield >= 0)
1815 {
1816 while(ajRegExec(dbiflatGRegRefseqWrd, dbiflatGTmpLine))
1817 /* should be OK */
1818 {
1819 ajRegSubI(dbiflatGRegRefseqWrd, 1, &dbiflatGTmpFd);
1820 ajStrFmtUpper(&dbiflatGTmpFd);
1821 /*ajDebug("++acc '%S'\n", dbiflatGTmpFd);*/
1822 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[accfield]);
1823
1824 countfield[accfield]++;
1825 if(systemsort)
1826 ajFmtPrintF(alistfile[accfield], "%S %S\n",
1827 *myid, dbiflatGTmpFd);
1828 else
1829 {
1830 fd = ajCharNewS(dbiflatGTmpFd);
1831 ajListPushAppend(myfdl[accfield], fd);
1832 }
1833 ajRegPost(dbiflatGRegRefseqWrd, &dbiflatGTmpStr);
1834 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1835 }
1836 continue;
1837 }
1838 else if(lineType == FLATTYPE_DES && desfield >= 0)
1839 {
1840 while(ajRegExec(dbiflatGRegRefseqWrd, dbiflatGTmpLine))
1841 {
1842 ajRegSubI(dbiflatGRegRefseqWrd, 1, &dbiflatGTmpFd);
1843 ajStrFmtUpper(&dbiflatGTmpFd);
1844 /*ajDebug("++des '%S'\n", dbiflatGTmpFd);*/
1845 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[desfield]);
1846
1847 countfield[desfield]++;
1848 if(systemsort)
1849 ajFmtPrintF(alistfile[desfield],
1850 "%S %S\n", *myid, dbiflatGTmpFd);
1851 else
1852 {
1853 fd = ajCharNewS(dbiflatGTmpFd);
1854 ajListPushAppend(myfdl[desfield], fd);
1855 }
1856 ajRegPost(dbiflatGRegRefseqWrd, &dbiflatGTmpStr);
1857 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1858 }
1859 continue;
1860 }
1861
1862 else if(lineType == FLATTYPE_KEY && keyfield >= 0)
1863 {
1864 while(ajRegExec(dbiflatGRegRefseqPhr, dbiflatGTmpLine))
1865 {
1866 ajRegSubI(dbiflatGRegRefseqPhr, 1, &dbiflatGTmpFd);
1867 ajRegPost(dbiflatGRegRefseqPhr, &dbiflatGTmpStr);
1868 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1869 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1870 if(!ajStrGetLen(dbiflatGTmpFd))
1871 continue;
1872 ajStrFmtUpper(&dbiflatGTmpFd);
1873 /*ajDebug("++key '%S'\n", dbiflatGTmpFd);*/
1874 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[keyfield]);
1875
1876 countfield[keyfield]++;
1877 if(systemsort)
1878 ajFmtPrintF(alistfile[keyfield],
1879 "%S %S\n", *myid, dbiflatGTmpFd);
1880 else
1881 {
1882 fd = ajCharNewS(dbiflatGTmpFd);
1883 ajListPushAppend(myfdl[keyfield], fd);
1884 }
1885 }
1886 continue;
1887 }
1888 else if(lineType == FLATTYPE_TAX && taxfield >= 0)
1889 {
1890 while(ajRegExec(dbiflatGRegRefseqTax, dbiflatGTmpLine))
1891 {
1892 ajRegSubI(dbiflatGRegRefseqTax, 1, &dbiflatGTmpFd);
1893 ajRegPost(dbiflatGRegRefseqTax, &dbiflatGTmpStr);
1894 ajStrAssignS(&dbiflatGTmpLine, dbiflatGTmpStr);
1895 ajStrTrimWhiteEnd(&dbiflatGTmpFd);
1896 if(!ajStrGetLen(dbiflatGTmpFd))
1897 continue;
1898 ajStrFmtUpper(&dbiflatGTmpFd);
1899 /*ajDebug("++tax '%S'\n", dbiflatGTmpFd);*/
1900 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[taxfield]);
1901
1902 countfield[taxfield]++;
1903 if(systemsort)
1904 ajFmtPrintF(alistfile[taxfield],
1905 "%S %S\n", *myid, dbiflatGTmpFd);
1906 else
1907 {
1908 fd = ajCharNewS(dbiflatGTmpFd);
1909 ajListPushAppend(myfdl[taxfield], fd);
1910 }
1911 }
1912 continue;
1913 }
1914 else if(lineType == FLATTYPE_VER && svnfield >= 0)
1915 { /* special verexp for REFSEQ */
1916 if(ajRegExec(dbiflatGRegRefseqVer, dbiflatGTmpLine))
1917 {
1918 ajRegSubI(dbiflatGRegRefseqVer, 1, &dbiflatGTmpFd);
1919 ajStrFmtUpper(&dbiflatGTmpFd);
1920 /*ajDebug("++ver '%S'\n", dbiflatGTmpFd);*/
1921 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1922
1923 countfield[svnfield]++;
1924 if(systemsort)
1925 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1926 *myid, dbiflatGTmpFd);
1927 else
1928 {
1929 fd = ajCharNewS(dbiflatGTmpFd);
1930 ajListPushAppend(myfdl[svnfield], fd);
1931 }
1932 svndone = ajTrue;
1933
1934 ajRegSubI(dbiflatGRegRefseqVer, 3, &dbiflatGTmpFd);
1935 if(!ajStrGetLen(dbiflatGTmpFd)) continue;
1936 ajStrFmtUpper(&dbiflatGTmpFd);
1937 /*ajDebug("++ver gi: '%S'\n", dbiflatGTmpFd);*/
1938
1939 if(systemsort)
1940 ajFmtPrintF(alistfile[svnfield], "%S %S\n",
1941 *myid, dbiflatGTmpFd);
1942 else
1943 {
1944 fd = ajCharNewS(dbiflatGTmpFd);
1945 ajListPushAppend(myfdl[svnfield], fd);
1946 }
1947 }
1948 continue;
1949 }
1950
1951 ipos = ajFileResetPos(libr);
1952 }
1953
1954 if(!done)
1955 return ajFalse;
1956
1957 if(svnfield >= 0 && !svndone && tmpacnum)
1958 {
1959 ajFmtPrintS(&dbiflatGTmpFd, "%S.0", tmpacnum);
1960 embDbiMaxlen(&dbiflatGTmpFd, &maxFieldLen[svnfield]);
1961
1962 countfield[svnfield]++;
1963 if(systemsort)
1964 ajFmtPrintF(alistfile[svnfield], "%S %S\n", *myid, dbiflatGTmpFd);
1965 else
1966 {
1967 fd = ajCharNewS(dbiflatGTmpFd);
1968 ajListPushAppend(myfdl[svnfield], fd);
1969 }
1970 }
1971
1972 ajStrDel(&tmpacnum);
1973
1974 return ajTrue;
1975 }
1976