1 /* @source dbxgcg application
2 **
3 ** Index GCG and PIR/NBRF format databases
4 **
5 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25
26
27
/*
** Line-type codes assigned by the reference-file parsers to each line
** they read; the code selects which index field the line feeds.
*/
#define GCGTYPE_OTHER 0 /* line is not indexed */
#define GCGTYPE_ID 1    /* EMBL "ID" / GenBank "LOCUS" */
#define GCGTYPE_ACC 2   /* EMBL "AC" / GenBank "ACCESSION" */
#define GCGTYPE_DES 3   /* EMBL "DE" / GenBank "DEFINITION" */
#define GCGTYPE_KEY 4   /* EMBL "KW" / GenBank "KEYWORDS" */
#define GCGTYPE_TAX 5   /* EMBL "OS" and "OC" / GenBank "ORGANISM" */
#define GCGTYPE_VER 6   /* EMBL "SV" / GenBank "VERSION" */
35
/*
** Scratch strings shared by the entry readers and the parsers.
** All of them are released at the end of main().
*/
static AjPStr dbxgcgRefline = NULL;   /* last line read from the .ref file */
static AjPStr dbxgcgSeqline = NULL;   /* last line read from the .seq file */
static AjPStr dbxgcgTmpfd = NULL;     /* only deleted in main() in this file */
static AjPStr dbxgcgTmpstr = NULL;    /* general purpose temporary */
static AjPStr dbxgcgTmpline = NULL;   /* remainder of a line after its type */
static AjPStr dbxgcgGcgtype = NULL;   /* 3rd field of the GCG seq header */
static AjPStr dbxgcgGcgdate = NULL;   /* 2nd field of the GCG seq header */
static AjPStr dbxgcgTypstr = NULL;    /* line-type token of a ref line */
static AjPStr dbxgcgReflibstr = NULL; /* ID taken from the ref header line */

/* Regular expressions compiled on first use by dbxgcg_ParseEmbl */
static AjPRegexp dbxgcg_embl_typexp = NULL;
static AjPRegexp dbxgcg_embl_idexp = NULL;
static AjPRegexp dbxgcg_embl_verexp = NULL;
static AjPRegexp dbxgcg_embl_wrdexp = NULL;
static AjPRegexp dbxgcg_embl_phrexp = NULL;
static AjPRegexp dbxgcg_embl_taxexp = NULL;

/* Header-line expressions for GCG ref/seq files and the split-ID suffix */
static AjPRegexp dbxgcg_gcg_rexp = NULL;
static AjPRegexp dbxgcg_gcg_sexp = NULL;
static AjPRegexp dbxgcg_splitexp = NULL;

/*
** Regular expressions for PIR files. dbxgcg_pir_pirexp is compiled in
** dbxgcg_pirgetent, the others in dbxgcg_ParsePir (where
** dbxgcg_pir_idexp is compiled but not executed).
*/
static AjPRegexp dbxgcg_pir_idexp = NULL;
static AjPRegexp dbxgcg_pir_acexp = NULL;
static AjPRegexp dbxgcg_pir_ac2exp = NULL;
static AjPRegexp dbxgcg_pir_keyexp = NULL;
static AjPRegexp dbxgcg_pir_taxexp = NULL;
static AjPRegexp dbxgcg_pir_tax2exp = NULL;
static AjPRegexp dbxgcg_pir_wrdexp = NULL;
static AjPRegexp dbxgcg_pir_phrexp = NULL;
static AjPRegexp dbxgcg_pir_pirexp = NULL;

/* Regular expressions compiled on first use by dbxgcg_ParseGenbank */
static AjPRegexp dbxgcg_genbank_typexp = NULL;
static AjPRegexp dbxgcg_genbank_morexp = NULL;
static AjPRegexp dbxgcg_genbank_wrdexp = NULL;
static AjPRegexp dbxgcg_genbank_phrexp = NULL;
static AjPRegexp dbxgcg_genbank_taxexp = NULL;
static AjPRegexp dbxgcg_genbank_verexp = NULL;
73
/* Reference-file parsers, one per supported ID format (see parser[]) */
static AjBool dbxgcg_ParseEmbl(AjPFile infr, AjPStr *reflibstr);
static AjBool dbxgcg_ParseGenbank( AjPFile infr, AjPStr *reflibstr);
static AjBool dbxgcg_ParsePir(AjPFile infr, AjPStr *reflibstr);

/* Reads the next entry using whichever of the two readers below applies */
static AjBool dbxgcg_NextEntry(EmbPBtreeEntry entry, AjPFile infs,
                               AjPFile infr, const AjPStr dbtype);

/* Entry readers for GCG-type and PIR-type parsers respectively */
static ajlong dbxgcg_gcggetent(EmbPBtreeEntry entry, AjPFile infs,
                               AjPFile infr, const AjPStr dbtype);
static ajlong dbxgcg_pirgetent(EmbPBtreeEntry entry, AjPFile infs,
                               AjPFile infr, const AjPStr dbtype);

/* Skips the remaining parts of a split GCG entry */
static ajlong dbxgcg_gcgappent(AjPFile infr, AjPFile infs,
                               AjPRegexp rexp, AjPRegexp sexp,
                               AjPStr* libstr);
89
90
91
/*
** Index field handles. Each stays NULL unless main() finds the matching
** name ("acc", "sv", "org", "des", "key") in the requested field list;
** the parsers test them to decide which lines to index.
*/
EmbPBtreeField accfield = NULL;
EmbPBtreeField svfield = NULL;
EmbPBtreeField orgfield = NULL;
EmbPBtreeField desfield = NULL;
EmbPBtreeField keyfield = NULL;
97
98
99
100
101 /* @datastatic DbxgcgPParser *************************************************
102 **
103 ** Parser definition structure
104 **
105 ** @alias DbxgcgSParser
106 ** @alias DbxgcgOParser
107 **
108 ** @attr Name [const char*] Parser name
109 ** @attr GcgType [AjBool] Gcg type parser if true, PIR type if false
110 ** @attr Padding [char[4]] Padding to alignment boundary
111 ** @attr Parser [AjBool function] Parser function
112 ** @@
113 ******************************************************************************/
114
typedef struct DbxgcgSParser
{
    const char* Name;   /* format name matched against the idformat value */
    AjBool GcgType;     /* true: handled by dbxgcg_gcggetent,
                           false: handled by dbxgcg_pirgetent */
    char Padding[4];
    /* called once per entry with the reference file and the ref-file ID */
    AjBool (*Parser) (AjPFile infr, AjPStr *reflibstr);
} DbxgcgOParser;
#define DbxgcgPParser DbxgcgOParser*
123
124
125
126
/*
** Table of supported ID formats. The entry readers scan it by Name and
** stop at the terminating entry whose Name is NULL.
*/
static DbxgcgOParser parser[] =
{
    {"EMBL", AJTRUE, "", dbxgcg_ParseEmbl},
    {"SWISS", AJTRUE, "", dbxgcg_ParseEmbl},    /* shares the EMBL parser */
    {"GENBANK", AJTRUE, "", dbxgcg_ParseGenbank},
    {"PIR", AJFALSE, "", dbxgcg_ParsePir},
    {NULL, 0, "", NULL}                         /* end-of-table sentinel */
};
135
136
137
138
139
140 /* @prog dbxgcg **************************************************************
141 **
142 ** Index a flat file database
143 **
144 ******************************************************************************/
145
main(int argc,char ** argv)146 int main(int argc, char **argv)
147 {
148 EmbPBtreeEntry entry = NULL;
149
150 AjPStr dbname = NULL;
151 AjPStr dbrs = NULL;
152 AjPStr release = NULL;
153 AjPStr datestr = NULL;
154 AjBool statistics;
155 AjBool compressed;
156
157 AjPStr directory;
158 AjPStr indexdir;
159 AjPStr filename;
160 AjPStr exclude;
161 AjPStr dbtype = NULL;
162 AjPFile outf = NULL;
163
164 AjPStr *fieldarray = NULL;
165
166 ajint nfields;
167 ajint nfiles;
168
169 AjPStr refname = NULL;
170 AjPStr seqname = NULL;
171 AjPStr thysfile = NULL;
172
173 ajint i;
174 AjPFile infs = NULL;
175 AjPFile infr = NULL;
176
177 ajulong nentries = 0L;
178 ajulong ientries = 0L;
179 AjPTime starttime = NULL;
180 AjPTime begintime = NULL;
181 AjPTime nowtime = NULL;
182
183 ajulong idpricache=0L, idpriread = 0L, idpriwrite = 0L, idprisize= 0L;
184 ajulong idseccache=0L, idsecread = 0L, idsecwrite = 0L, idsecsize= 0L;
185 ajulong acpricache=0L, acpriread = 0L, acpriwrite = 0L, acprisize= 0L;
186 ajulong acseccache=0L, acsecread = 0L, acsecwrite = 0L, acsecsize= 0L;
187 ajulong svpricache=0L, svpriread = 0L, svpriwrite = 0L, svprisize= 0L;
188 ajulong svseccache=0L, svsecread = 0L, svsecwrite = 0L, svsecsize= 0L;
189 ajulong kwpricache=0L, kwpriread = 0L, kwpriwrite = 0L, kwprisize= 0L;
190 ajulong kwseccache=0L, kwsecread = 0L, kwsecwrite = 0L, kwsecsize= 0L;
191 ajulong depricache=0L, depriread = 0L, depriwrite = 0L, deprisize= 0L;
192 ajulong deseccache=0L, desecread = 0L, desecwrite = 0L, desecsize= 0L;
193 ajulong txpricache=0L, txpriread = 0L, txpriwrite = 0L, txprisize= 0L;
194 ajulong txseccache=0L, txsecread = 0L, txsecwrite = 0L, txsecsize= 0L;
195
196 embInit("dbxgcg", argc, argv);
197
198 dbtype = ajAcdGetListSingle("idformat");
199 fieldarray = ajAcdGetList("fields");
200 directory = ajAcdGetDirectoryName("directory");
201 outf = ajAcdGetOutfile("outfile");
202 indexdir = ajAcdGetOutdirName("indexoutdir");
203 filename = ajAcdGetString("filenames");
204 exclude = ajAcdGetString("exclude");
205 dbname = ajAcdGetString("dbname");
206 dbrs = ajAcdGetString("dbresource");
207 release = ajAcdGetString("release");
208 datestr = ajAcdGetString("date");
209 statistics = ajAcdGetBoolean("statistics");
210 compressed = ajAcdGetBoolean("compressed");
211
212 entry = embBtreeEntryNew(1);
213 if(compressed)
214 embBtreeEntrySetCompressed(entry);
215
216 nfields = embBtreeSetFields(entry,fieldarray);
217 embBtreeSetDbInfo(entry,dbname,dbrs,datestr,release,dbtype,directory,
218 indexdir);
219
220 for(i=0; i< nfields; i++)
221 {
222 if(ajStrMatchC(fieldarray[i], "acc"))
223 {
224 accfield = embBtreeGetFieldS(entry, fieldarray[i]);
225 if(compressed)
226 embBtreeFieldSetCompressed(accfield);
227 }
228 else if(ajStrMatchC(fieldarray[i], "sv"))
229 {
230 svfield = embBtreeGetFieldS(entry, fieldarray[i]);
231 if(compressed)
232 embBtreeFieldSetCompressed(svfield);
233 }
234 else if(ajStrMatchC(fieldarray[i], "des"))
235 {
236 desfield = embBtreeGetFieldS(entry, fieldarray[i]);
237 if(compressed)
238 embBtreeFieldSetCompressed(desfield);
239 }
240 else if(ajStrMatchC(fieldarray[i], "key"))
241 {
242 keyfield = embBtreeGetFieldS(entry, fieldarray[i]);
243 if(compressed)
244 embBtreeFieldSetCompressed(keyfield);
245 }
246 else if(ajStrMatchC(fieldarray[i], "org"))
247 {
248 orgfield = embBtreeGetFieldS(entry, fieldarray[i]);
249 if(compressed)
250 embBtreeFieldSetCompressed(orgfield);
251 }
252 else if(!ajStrMatchC(fieldarray[i], "id"))
253 ajErr("Unknown field '%S' specified for indexing", fieldarray[i]);
254 }
255
256 embBtreeGetRsInfo(entry);
257
258 nfiles = embBtreeGetFiles(entry,directory,filename,exclude);
259 if(!nfiles)
260 ajDie("No input files in '%S' matched filename '%S'",
261 directory, filename);
262
263
264 for(i=0; i<nfiles; ++i)
265 {
266 ajListPop(entry->files,(void **) &seqname);
267 refname = ajStrNew();
268 ajStrAssignS(&refname,seqname);
269 ajFilenameReplaceExtC(&seqname,"seq");
270 ajFilenameReplaceExtC(&refname,"ref");
271 ajListstrPushAppend(entry->files, seqname);
272 ajListstrPushAppend(entry->reffiles[0], refname);
273 }
274
275
276 embBtreeWriteEntryFile(entry);
277
278 embBtreeOpenCaches(entry);
279
280 starttime = ajTimeNewToday();
281
282 ajFmtPrintF(outf, "Processing directory: %S\n", directory);
283
284 for(i=0;i<nfiles;++i)
285 {
286 begintime = ajTimeNewToday();
287
288 ajListPop(entry->reffiles[0],(void **)&thysfile);
289 ajListstrPushAppend(entry->files, thysfile);
290 ajFmtPrintS(&dbxgcgTmpstr,"%S%S",entry->directory,thysfile);
291 if(!(infr=ajFileNewInNameS(dbxgcgTmpstr)))
292 ajFatal("Cannot open input file %S\n",dbxgcgTmpstr);
293
294 ajListPop(entry->files,(void **)&thysfile);
295 ajListstrPushAppend(entry->files, thysfile);
296 ajFmtPrintS(&dbxgcgTmpstr,"%S%S",entry->directory,thysfile);
297 if(!(infs=ajFileNewInNameS(dbxgcgTmpstr)))
298 ajFatal("Cannot open input file %S\n",dbxgcgTmpstr);
299
300 ajFilenameTrimPath(&dbxgcgTmpstr);
301 ajFmtPrintF(outf,"Processing file: %S\n",dbxgcgTmpstr);
302
303 ientries = 0L;
304
305 while(dbxgcg_NextEntry(entry,infs,infr,dbtype))
306 {
307 ++ientries;
308
309 if(entry->do_id)
310 embBtreeIndexEntry(entry, i);
311
312 if(accfield)
313 embBtreeIndexPrimary(accfield, entry, i);
314
315 if(svfield)
316 embBtreeIndexPrimary(svfield, entry, i);
317
318 if(keyfield)
319 embBtreeIndexSecondary(keyfield, entry);
320
321 if(desfield)
322 embBtreeIndexSecondary(desfield, entry);
323
324 if(orgfield)
325 embBtreeIndexSecondary(orgfield, entry);
326 }
327
328 ajFileClose(&infs);
329 ajFileClose(&infr);
330 nentries += ientries;
331 nowtime = ajTimeNewToday();
332 ajFmtPrintF(outf, "entries: %Lu (%Lu) time: %.1fs (%.1fs)\n",
333 nentries, ientries,
334 ajTimeDiff(starttime, nowtime),
335 ajTimeDiff(begintime, nowtime));
336
337 if(statistics)
338 {
339 if(entry->do_id)
340 ajBtreeCacheStatsOut(outf, entry->idcache,
341 &idpricache, &idseccache,
342 &idpriread, &idsecread,
343 &idpriwrite, &idsecwrite,
344 &idprisize, &idsecsize);
345 if(accfield)
346 ajBtreeCacheStatsOut(outf, accfield->cache,
347 &acpricache, &acseccache,
348 &acpriread, &acsecread,
349 &acpriwrite, &acsecwrite,
350 &acprisize, &acsecsize);
351 if(svfield)
352 ajBtreeCacheStatsOut(outf, svfield->cache,
353 &svpricache, &svseccache,
354 &svpriread, &svsecread,
355 &svpriwrite, &svsecwrite,
356 &svprisize, &svsecsize);
357 if(keyfield)
358 ajBtreeCacheStatsOut(outf, keyfield->cache,
359 &kwpricache, &kwseccache,
360 &kwpriread, &kwsecread,
361 &kwpriwrite, &kwsecwrite,
362 &kwprisize, &kwsecsize);
363 if(desfield)
364 ajBtreeCacheStatsOut(outf, desfield->cache,
365 &depricache, &deseccache,
366 &depriread, &desecread,
367 &depriwrite, &desecwrite,
368 &deprisize, &desecsize);
369 if(orgfield)
370 ajBtreeCacheStatsOut(outf, orgfield->cache,
371 &txpricache, &txseccache,
372 &txpriread, &txsecread,
373 &txpriwrite, &txsecwrite,
374 &txprisize, &txsecsize);
375 }
376
377 ajTimeDel(&begintime);
378 ajTimeDel(&nowtime);
379 }
380
381
382 nowtime = ajTimeNewToday();
383 ajFmtPrintF(outf, "Total time: %.1fs\n", ajTimeDiff(starttime, nowtime));
384 ajTimeDel(&nowtime);
385 ajTimeDel(&starttime);
386
387 embBtreeReportEntry(outf, entry);
388
389 if(accfield)
390 embBtreeReportField(outf, accfield);
391 if(svfield)
392 embBtreeReportField(outf, svfield);
393 if(orgfield)
394 embBtreeReportField(outf, orgfield);
395 if(desfield)
396 embBtreeReportField(outf, desfield);
397 if(keyfield)
398 embBtreeReportField(outf, keyfield);
399
400 embBtreeDumpParameters(entry);
401 embBtreeCloseCaches(entry);
402
403 ajFileClose(&outf);
404 embBtreeEntryDel(&entry);
405
406 ajStrDel(&filename);
407 ajStrDel(&exclude);
408 ajStrDel(&dbname);
409 ajStrDel(&dbrs);
410 ajStrDel(&release);
411 ajStrDel(&datestr);
412 ajStrDel(&directory);
413 ajStrDel(&indexdir);
414 ajStrDel(&dbtype);
415
416 ajStrDel(&dbxgcgRefline);
417 ajStrDel(&dbxgcgSeqline);
418 ajStrDel(&dbxgcgTmpfd);
419 ajStrDel(&dbxgcgTmpstr);
420 ajStrDel(&dbxgcgTmpline);
421 ajStrDel(&dbxgcgGcgtype);
422 ajStrDel(&dbxgcgGcgdate);
423 ajStrDel(&dbxgcgTypstr);
424 ajStrDel(&dbxgcgReflibstr);
425
426 nfields = 0;
427 while(fieldarray[nfields])
428 ajStrDel(&fieldarray[nfields++]);
429 AJFREE(fieldarray);
430
431 ajRegFree(&dbxgcg_embl_typexp);
432 ajRegFree(&dbxgcg_embl_idexp);
433 ajRegFree(&dbxgcg_embl_verexp);
434 ajRegFree(&dbxgcg_embl_wrdexp);
435 ajRegFree(&dbxgcg_embl_phrexp);
436 ajRegFree(&dbxgcg_embl_taxexp);
437
438 ajRegFree(&dbxgcg_gcg_rexp);
439 ajRegFree(&dbxgcg_gcg_sexp);
440
441 ajRegFree(&dbxgcg_splitexp);
442
443 ajRegFree(&dbxgcg_pir_idexp);
444 ajRegFree(&dbxgcg_pir_acexp);
445 ajRegFree(&dbxgcg_pir_ac2exp);
446 ajRegFree(&dbxgcg_pir_keyexp);
447 ajRegFree(&dbxgcg_pir_taxexp);
448 ajRegFree(&dbxgcg_pir_tax2exp);
449 ajRegFree(&dbxgcg_pir_wrdexp);
450 ajRegFree(&dbxgcg_pir_phrexp);
451 ajRegFree(&dbxgcg_pir_pirexp);
452
453 ajRegFree(&dbxgcg_genbank_typexp);
454 ajRegFree(&dbxgcg_genbank_morexp);
455 ajRegFree(&dbxgcg_genbank_wrdexp);
456 ajRegFree(&dbxgcg_genbank_phrexp);
457 ajRegFree(&dbxgcg_genbank_taxexp);
458 ajRegFree(&dbxgcg_genbank_verexp);
459
460 embExit();
461
462 return 0;
463 }
464
465
466
467
468 /* @funcstatic dbxgcg_NextEntry ***********************************************
469 **
470 ** Returns next database entry as an EmbPEntry object
471 **
472 ** @param [u] entry [EmbPBtreeEntry] b+tree entry pointer
473 ** @param [u] infs [AjPFile] sequence file
474 ** @param [u] infr [AjPFile] reference file
475 ** @param [r] dbtype [const AjPStr] Id format in GCG file
476 ** @return [AjBool] ajTrue if successful read
477 ** @@
478 ******************************************************************************/
479
dbxgcg_NextEntry(EmbPBtreeEntry entry,AjPFile infs,AjPFile infr,const AjPStr dbtype)480 static AjBool dbxgcg_NextEntry(EmbPBtreeEntry entry, AjPFile infs,
481 AjPFile infr, const AjPStr dbtype)
482 {
483 char *p;
484
485 if(!dbxgcg_splitexp)
486 dbxgcg_splitexp = ajRegCompC("_0+$");
487
488 entry->reffpos[0] = ajFileResetPos(infr);
489 entry->fpos = ajFileResetPos(infs);
490
491 if(!dbxgcg_gcggetent(entry, infs, infr, dbtype) &&
492 !dbxgcg_pirgetent(entry, infs, infr, dbtype))
493 return ajFalse;
494
495 ajDebug("id '%S' seqfpos:%d reffpos:%d\n",
496 entry->id, entry->fpos, entry->reffpos);
497
498 ajStrAssignC(&dbxgcgTmpstr,ajStrGetPtr(entry->id));
499
500 if(ajRegExec(dbxgcg_splitexp, entry->id))
501 {
502 p = strrchr(ajStrGetPtr(dbxgcgTmpstr),'_');
503 *p = '\0';
504 ajStrAssignC(&entry->id,ajStrGetPtr(dbxgcgTmpstr));
505 }
506
507 return ajTrue;
508 }
509
510
511
512
513 /* @funcstatic dbxgcg_gcggetent ***********************************************
514 **
515 ** get a single entry from the GCG database files
516 **
517 ** @param [u] entry [EmbPBtreeEntry] b+tree entry pointer
518 ** @param [u] infs [AjPFile] sequence file
519 ** @param [u] infr [AjPFile] reference file
520 ** @param [r] dbtype [const AjPStr] Id format in GCG file
521 ** @return [ajlong] Sequence length
522 ** @@
523 ******************************************************************************/
524
dbxgcg_gcggetent(EmbPBtreeEntry entry,AjPFile infs,AjPFile infr,const AjPStr dbtype)525 static ajlong dbxgcg_gcggetent(EmbPBtreeEntry entry, AjPFile infs,
526 AjPFile infr, const AjPStr dbtype)
527 {
528 static ajint called = 0;
529 static ajint iparser = -1;
530 ajlong gcglen = 0;
531 ajlong rblock;
532 ajint i;
533
534 ajStrAssignC(&dbxgcgSeqline, "");
535 ajStrAssignC(&dbxgcgRefline, "");
536
537 if(!called)
538 {
539 for(i=0; parser[i].Name; i++)
540 if(ajStrMatchC(dbtype, parser[i].Name))
541 {
542 iparser = i;
543 break;
544 }
545
546 if(iparser < 0)
547 ajFatal("dbtype '%S' unknown", dbtype);
548
549 ajDebug("dbtype '%S' Parser %d\n", dbtype, iparser);
550 called = 1;
551 }
552
553 if(!parser[iparser].GcgType)
554 {
555 return 0;
556 }
557
558 if(!dbxgcg_gcg_rexp)
559 dbxgcg_gcg_rexp = ajRegCompC("^>>>>([^ \t\n]+)");
560
561 if(!dbxgcg_gcg_sexp)
562 dbxgcg_gcg_sexp = ajRegCompC("^>>>>([^ \t]+)[ \t]+"
563 "(Dummy Header|[^ \t]+)[ \t]+([^ \t]+)"
564 "[ \t]+([^ \t]+)[ \t]+([0-9]+)");
565
566 /* check for seqid first line */
567 while(ajStrGetCharFirst(dbxgcgSeqline)!='>')
568 {
569 if(!ajReadline(infs, &dbxgcgSeqline))
570 {
571 return 0; /* end of file */
572 }
573 ajDebug("... read until next seq %Ld '%S'\n",
574 ajFileResetPos(infs), dbxgcgSeqline);
575 }
576
577 ajDebug("dbxgcg_gcggetent .seq (%S) %Ld '%S'\n",
578 dbtype, ajFileResetPos(infs), dbxgcgSeqline);
579
580 /* get the encoding/sequence length info */
581 if(!ajRegExec(dbxgcg_gcg_sexp, dbxgcgSeqline))
582 {
583 ajDebug("dbxgcg_gcggetent sequence expression FAILED\n");
584 return 0;
585 }
586
587 ajRegSubI(dbxgcg_gcg_sexp, 1, &entry->id); /* Entry ID returned */
588
589 ajRegSubI(dbxgcg_gcg_sexp, 2, &dbxgcgGcgdate);
590 ajRegSubI(dbxgcg_gcg_sexp, 3, &dbxgcgGcgtype);
591 ajRegSubI(dbxgcg_gcg_sexp, 5, &dbxgcgTmpstr);
592 ajStrToLong(dbxgcgTmpstr, &gcglen);
593
594 ajDebug("new entry '%S' date:'%S' type:'%S' len:'%S'=%Ld\n",
595 entry->id, dbxgcgGcgdate, dbxgcgGcgtype, dbxgcgTmpstr, gcglen);
596
597 ajDebug("dbxgcg_gcggetent .ref (%S) %Ld '%S'\n",
598 dbtype, ajFileResetPos(infr), dbxgcgRefline);
599
600 /* check for refid first line */
601 while(ajStrGetCharFirst(dbxgcgRefline)!='>')
602 {
603 if(!ajReadline(infr, &dbxgcgRefline))
604 {
605 ajErr("ref ended before seq");
606 break; /* end of file */
607 }
608 ajDebug("... read until next ref %Ld '%S'\n", ajFileResetPos(infr), dbxgcgRefline);
609 }
610
611 /* get the encoding/sequence length info */
612
613 ajRegExec(dbxgcg_gcg_rexp, dbxgcgRefline);
614 ajRegSubI(dbxgcg_gcg_rexp, 1, &dbxgcgReflibstr);
615
616 (*parser[iparser].Parser)(infr,
617 &dbxgcgReflibstr); /* writes alistfile data */
618
619 /* get the description line */
620 ajReadline(infs, &dbxgcgSeqline);
621
622 /* seek to the end of the sequence; +1 to jump over newline */
623 if(ajStrGetCharFirst(dbxgcgGcgtype)=='2')
624 {
625 rblock = (gcglen+3)/4;
626 ajFileSeek(infs,rblock+1,SEEK_CUR);
627 }
628 else
629 ajFileSeek(infs,gcglen+1,SEEK_CUR);
630
631 /*
632 ** for big entries, need to append until we have all the parts.
633 ** They are named with _0 on the first part, _1 on the second and so on.
634 ** or _00 on the first part, _01 on the second and so on.
635 ** We can look for the "id_" prefix.
636 */
637
638 if(!ajStrSuffixC(entry->id, "_0") &&
639 !ajStrSuffixC(entry->id,"_00") &&
640 !ajStrSuffixC(entry->id,"_000") &&
641 !ajStrSuffixC(entry->id,"_0000"))
642 return gcglen;
643
644 gcglen += dbxgcg_gcgappent(infr, infs, dbxgcg_gcg_rexp, dbxgcg_gcg_sexp,
645 &entry->id);
646
647 return gcglen;
648 }
649
650
651
652
653 /* @funcstatic dbxgcg_pirgetent ***********************************************
654 **
655 ** Get a single entry from the PIR database files
656 **
657 ** @param [u] entry [EmbPBtreeEntry] b+tree entry pointer
658 ** @param [u] infs [AjPFile] sequence file
659 ** @param [u] infr [AjPFile] reference file
660 ** @param [r] dbtype [const AjPStr] Id format in GCG file
661 ** @return [ajlong] Sequence length
662 ** @@
663 ******************************************************************************/
664
dbxgcg_pirgetent(EmbPBtreeEntry entry,AjPFile infs,AjPFile infr,const AjPStr dbtype)665 static ajlong dbxgcg_pirgetent(EmbPBtreeEntry entry, AjPFile infs,
666 AjPFile infr, const AjPStr dbtype)
667 {
668 ajint i;
669 static ajint called = 0;
670 static ajint iparser = -1;
671 ajlong gcglen;
672 ajlong spos = 0;
673
674 ajStrAssignC(&dbxgcgSeqline, "");
675 ajStrAssignC(&dbxgcgRefline, "");
676
677 if(!called)
678 {
679 for(i=0; parser[i].Name; i++)
680 if(ajStrMatchC(dbtype, parser[i].Name))
681 {
682 iparser = i;
683 break;
684 }
685
686 if(iparser < 0)
687 ajFatal("dbtype '%S' unknown", dbtype);
688 ajDebug("dbtype '%S' Parser %d\n", dbtype, iparser);
689 called = 1;
690 }
691
692 if(parser[iparser].GcgType)
693 return 0;
694
695 if(!dbxgcg_pir_pirexp)
696 dbxgcg_pir_pirexp = ajRegCompC("^>..;([^ \t\n]+)");
697
698 /* skip to seqid first line */
699 while(ajStrGetCharFirst(dbxgcgSeqline)!='>')
700 if(!ajReadline(infs, &dbxgcgSeqline))
701 {
702 return 0; /* end of file */
703 }
704
705 ajDebug("dbxgcg_pirgetent .seq (%S) %Ld '%S' \n",
706 dbtype, ajFileResetPos(infs), dbxgcgSeqline);
707
708 ajRegExec(dbxgcg_pir_pirexp, dbxgcgSeqline);
709
710 /* skip to refid first line */
711 while(ajStrGetCharFirst(dbxgcgRefline)!='>')
712 if(!ajReadline(infr, &dbxgcgRefline))
713 {
714 ajErr("ref ended before seq"); /* end of file */
715 break;
716 }
717
718 /* get the encoding/sequence length info */
719
720 ajRegExec(dbxgcg_pir_pirexp, dbxgcgRefline);
721 ajRegSubI(dbxgcg_pir_pirexp, 1, &dbxgcgReflibstr);
722 ajRegSubI(dbxgcg_pir_pirexp, 1, &entry->id);
723
724 ajDebug("dbigcg_pirgetent seqid '%S' spos: %Ld\n",
725 entry->id, ajFileResetPos(infs));
726 ajDebug("dbxgcg_pirgetent refid '%S' spos: %Ld\n",
727 entry->id, ajFileResetPos(infr));
728
729 (*parser[iparser].Parser)(infr,
730 &dbxgcgReflibstr);/* writes alistfile data */
731
732 /* get the description line */
733 ajReadline(infs, &dbxgcgSeqline);
734 gcglen = 0;
735
736 /* seek to the end of the sequence; +1 to jump over newline */
737 while(ajStrGetCharFirst(dbxgcgSeqline)!='>')
738 {
739 spos = ajFileResetPos(infs);
740 if(!ajReadline(infs, &dbxgcgSeqline))
741 {
742 spos = 0;
743 break;
744 }
745 gcglen += ajStrGetLen(dbxgcgSeqline);
746 }
747
748 if(spos)
749 ajFileSeek(infs, spos, 0);
750
751 ajDebug("dbxgcg_pirgetent end spos %Ld line '%S'\n", spos, dbxgcgSeqline);
752
753 return gcglen;
754 }
755
756
757
758
759 /* @funcstatic dbxgcg_gcgappent ***********************************************
760 **
761 ** Go to end of a split GCG entry
762 **
763 ** @param [u] infr [AjPFile] Reference file
764 ** @param [u] infs [AjPFile] Sequence file
765 ** @param [u] rexp [AjPRegexp] Regular expression to find ID in ref file
766 ** @param [u] sexp [AjPRegexp] Regular expression to find ID in seq file
767 ** @param [w] libstr [AjPStr*] ID
768 ** @return [ajlong] Sequence length for this section
769 ** @@
770 ******************************************************************************/
771
dbxgcg_gcgappent(AjPFile infr,AjPFile infs,AjPRegexp rexp,AjPRegexp sexp,AjPStr * libstr)772 static ajlong dbxgcg_gcgappent(AjPFile infr, AjPFile infs,
773 AjPRegexp rexp, AjPRegexp sexp,
774 AjPStr* libstr)
775 {
776 AjPStr reflibstr = NULL;
777 AjPStr seqlibstr = NULL;
778 AjPStr testlibstr = NULL;
779 ajint ilen;
780
781 AjBool isend;
782 const char *p;
783 char *q;
784 ajlong rpos;
785 ajlong spos;
786
787 /*
788 ** keep reading until the end of entry is reached
789 ** and return the extra number of bases
790 */
791
792 if(!testlibstr)
793 testlibstr = ajStrNew();
794
795 ajStrAssignS(&dbxgcgTmpstr,*libstr);
796
797 ajDebug("dbi_gcgappent '%S'\n", dbxgcgTmpstr);
798
799 p = ajStrGetPtr(dbxgcgTmpstr);
800 q = strrchr(p,'_');
801 *q = '\0';
802
803
804 ajFmtPrintS(&testlibstr, "%s_",p);
805 ilen = ajStrGetLen(testlibstr);
806
807 isend = ajFalse;
808
809 while(!isend)
810 {
811 spos = ajFileResetPos(infs);
812 ajReadline(infs,&dbxgcgSeqline);
813 while(strncmp(ajStrGetPtr(dbxgcgSeqline),">>>>",4))
814 {
815 spos = ajFileResetPos(infs);
816 if(!ajReadline(infs, &dbxgcgSeqline))
817 {
818 ajStrDel(&reflibstr);
819 ajStrDel(&seqlibstr);
820 ajStrDel(&testlibstr);
821 ajDebug("end of file on seq\n");
822 return 1L;
823 }
824 }
825
826 ajRegExec(sexp, dbxgcgSeqline);
827 ajRegSubI(sexp, 1, &seqlibstr);
828
829 rpos = ajFileResetPos(infr);
830 ajReadline(infr, &dbxgcgRefline);
831
832 while(ajStrGetCharFirst(dbxgcgRefline)!='>')
833 {
834 rpos = ajFileResetPos(infr);
835 if(!ajReadline(infr, &dbxgcgRefline))
836 {
837 ajDebug("end of file on seq\n");
838 ajDebug("ref ended before seq\n");
839 ajErr("ref ended before seq\n");
840 break;
841 }
842 }
843
844 ajRegExec(rexp, dbxgcgRefline);
845 ajRegSubI(rexp, 1, &reflibstr);
846
847 if(ajStrCmpLenS(reflibstr, testlibstr, ilen) ||
848 ajStrCmpLenS(seqlibstr, testlibstr, ilen))
849 isend = ajTrue;
850
851 ajDebug("gcgappent %B test: '%S' seq: '%S' ref: '%S'\n",
852 isend, testlibstr, seqlibstr, reflibstr);
853 }
854
855 ajDebug("gcgappent done at seq: '%S' ref: '%S'\n", seqlibstr, reflibstr);
856
857 ajStrAssignC(libstr,p);
858
859 ajFileSeek(infr, rpos, 0);
860 ajFileSeek(infs, spos, 0);
861
862 ajStrDel(&reflibstr);
863 ajStrDel(&seqlibstr);
864 ajStrDel(&testlibstr);
865
866 return 1L;
867 }
868
869
870
871
872 /* @funcstatic dbxgcg_ParseEmbl ***********************************************
873 **
874 ** Parse the ID, accession from an EMBL or SWISSPROT entry
875 **
876 ** @param [u] infr [AjPFile] reference file
877 ** @param [w] id [AjPStr*] ID
878 ** @return [AjBool] ajTrue on success.
879 ** @@
880 ******************************************************************************/
881
dbxgcg_ParseEmbl(AjPFile infr,AjPStr * id)882 static AjBool dbxgcg_ParseEmbl(AjPFile infr,
883 AjPStr *id)
884 {
885 ajint lineType;
886 ajlong rpos;
887
888 if(!dbxgcg_embl_typexp)
889 dbxgcg_embl_typexp = ajRegCompC("^([A-Z][A-Z]) +");
890
891 if(!dbxgcg_embl_wrdexp)
892 dbxgcg_embl_wrdexp = ajRegCompC("([A-Za-z0-9_]+)");
893
894 if(!dbxgcg_embl_verexp)
895 dbxgcg_embl_verexp = ajRegCompC("([A-Za-z0-9]+[.][0-9]+)");
896
897 if(!dbxgcg_embl_phrexp)
898 dbxgcg_embl_phrexp = ajRegCompC(" *([^;.\n\r]+)");
899
900 if(!dbxgcg_embl_taxexp)
901 dbxgcg_embl_taxexp = ajRegCompC(" *([^;.\n\r()]+)");
902
903 if(!dbxgcg_embl_idexp)
904 dbxgcg_embl_idexp = ajRegCompC("^ID ([^ \t;]+)");
905
906 rpos = ajFileResetPos(infr);
907 while(ajReadline(infr, &dbxgcgRefline))
908 {
909 if(ajStrGetCharFirst(dbxgcgRefline) == '>')
910 break;
911
912 rpos = ajFileResetPos(infr);
913
914 if(ajRegExec(dbxgcg_embl_typexp, dbxgcgRefline))
915 {
916 ajRegSubI(dbxgcg_embl_typexp, 1, &dbxgcgTypstr);
917 if(ajStrMatchC(dbxgcgTypstr, "ID"))
918 lineType = GCGTYPE_ID;
919 else if(ajStrMatchC(dbxgcgTypstr, "SV"))
920 lineType = GCGTYPE_VER;
921 else if(ajStrMatchC(dbxgcgTypstr, "AC"))
922 lineType = GCGTYPE_ACC;
923 else if(ajStrMatchC(dbxgcgTypstr, "DE"))
924 lineType = GCGTYPE_DES;
925 else if(ajStrMatchC(dbxgcgTypstr, "KW"))
926 lineType = GCGTYPE_KEY;
927 else if(ajStrMatchC(dbxgcgTypstr, "OS"))
928 lineType = GCGTYPE_TAX;
929 else if(ajStrMatchC(dbxgcgTypstr, "OC"))
930 lineType = GCGTYPE_TAX;
931 else
932 lineType=GCGTYPE_OTHER;
933
934 if(lineType != GCGTYPE_OTHER)
935 ajRegPost(dbxgcg_embl_typexp, &dbxgcgTmpline);
936 }
937 else
938 lineType = GCGTYPE_OTHER;
939
940 if(lineType == GCGTYPE_ID)
941 {
942 ajRegExec(dbxgcg_embl_idexp, dbxgcgRefline);
943 ajRegSubI(dbxgcg_embl_idexp, 1, id);
944 ajDebug("++id '%S'\n", *id);
945 continue;
946 }
947
948 if(lineType == GCGTYPE_ACC && accfield)
949 {
950 embBtreeParseField(dbxgcgTmpline, dbxgcg_embl_wrdexp, accfield);
951 continue;
952 }
953 else if(lineType == GCGTYPE_DES && desfield)
954 {
955 embBtreeParseField(dbxgcgTmpline, dbxgcg_embl_wrdexp, desfield);
956 continue;
957 }
958 else if(lineType == GCGTYPE_VER && svfield)
959 {
960 embBtreeParseField(dbxgcgTmpline, dbxgcg_embl_verexp, svfield);
961 continue;
962 }
963 else if(lineType == GCGTYPE_KEY && keyfield)
964 {
965 embBtreeParseFieldTrim(dbxgcgTmpline, dbxgcg_embl_phrexp, keyfield);
966 continue;
967 }
968 else if(lineType == GCGTYPE_TAX && orgfield)
969 {
970 embBtreeParseFieldTrim(dbxgcgTmpline, dbxgcg_embl_taxexp, orgfield);
971 continue;
972 }
973 }
974
975 if(rpos)
976 ajFileSeek(infr, rpos, 0);
977
978 return ajFalse;
979 }
980
981
982
983
984 /* @funcstatic dbxgcg_ParseGenbank ********************************************
985 **
986 ** Parse the ID, accession from a Genbank entry
987 **
988 ** @param [u] infr [AjPFile] reference file
989 ** @param [w] id [AjPStr*] ID
990 ** @return [AjBool] ajTrue on success.
991 ** @@
992 ******************************************************************************/
993
dbxgcg_ParseGenbank(AjPFile infr,AjPStr * id)994 static AjBool dbxgcg_ParseGenbank(AjPFile infr,
995 AjPStr *id)
996 {
997 ajlong rpos = 0;
998 ajint lineType=GCGTYPE_OTHER;
999
1000 if(!dbxgcg_genbank_typexp)
1001 dbxgcg_genbank_typexp = ajRegCompC("^( )?([A-Z]+)");
1002
1003 if(!dbxgcg_genbank_morexp)
1004 dbxgcg_genbank_morexp = ajRegCompC("^ ");
1005
1006 if(!dbxgcg_genbank_wrdexp)
1007 dbxgcg_genbank_wrdexp = ajRegCompC("([A-Za-z0-9_]+)");
1008
1009 if(!dbxgcg_genbank_phrexp)
1010 dbxgcg_genbank_phrexp = ajRegCompC(" *([^;.\n\r]+)");
1011
1012 if(!dbxgcg_genbank_taxexp)
1013 dbxgcg_genbank_taxexp = ajRegCompC(" *([^;.\n\r()]+)");
1014
1015 if(!dbxgcg_genbank_verexp)
1016 dbxgcg_genbank_verexp = ajRegCompC("([A-Za-z0-9]+)( +GI:([0-9]+))?");
1017
1018 while(ajReadline(infr, &dbxgcgRefline))
1019 {
1020 if(ajStrGetCharFirst(dbxgcgRefline) == '>')
1021 break;
1022
1023 rpos = ajFileResetPos(infr);
1024 ajStrAssignS(&dbxgcgTmpstr,dbxgcgRefline);
1025
1026 if(ajRegExec(dbxgcg_genbank_typexp, dbxgcgTmpstr))
1027 {
1028 ajRegSubI(dbxgcg_genbank_typexp, 2, &dbxgcgTypstr);
1029 if(ajStrMatchC(dbxgcgTypstr, "LOCUS"))
1030 lineType = GCGTYPE_ID;
1031 else if(ajStrMatchC(dbxgcgTypstr, "VERSION"))
1032 lineType = GCGTYPE_VER;
1033 else if(ajStrMatchC(dbxgcgTypstr, "ACCESSION"))
1034 lineType = GCGTYPE_ACC;
1035 else if(ajStrMatchC(dbxgcgTypstr, "DEFINITION"))
1036 lineType = GCGTYPE_DES;
1037 else if(ajStrMatchC(dbxgcgTypstr, "KEYWORDS"))
1038 lineType = GCGTYPE_KEY;
1039 else if(ajStrMatchC(dbxgcgTypstr, "ORGANISM"))
1040 lineType = GCGTYPE_TAX;
1041 else
1042 lineType=GCGTYPE_OTHER;
1043
1044 if(lineType != GCGTYPE_OTHER)
1045 ajRegPost(dbxgcg_genbank_typexp, &dbxgcgTmpline);
1046 ajDebug("++type line %d\n", lineType);
1047 }
1048 else if(lineType != GCGTYPE_OTHER &&
1049 ajRegExec(dbxgcg_genbank_morexp, dbxgcgRefline))
1050 {
1051 ajRegPost(dbxgcg_genbank_morexp, &dbxgcgTmpline);
1052 ajDebug("++more line %d\n", lineType);
1053 }
1054 else
1055 lineType = GCGTYPE_OTHER;
1056
1057 if(lineType == GCGTYPE_ID)
1058 {
1059 ajRegExec(dbxgcg_genbank_wrdexp, dbxgcgTmpline);
1060 ajRegSubI(dbxgcg_genbank_wrdexp, 1, id);
1061 }
1062 else if(lineType == GCGTYPE_ACC && accfield)
1063 {
1064 embBtreeParseField(dbxgcgTmpline, dbxgcg_genbank_wrdexp, accfield);
1065 continue;
1066 }
1067 else if(lineType == GCGTYPE_DES && desfield)
1068 {
1069 embBtreeParseField(dbxgcgTmpline, dbxgcg_genbank_wrdexp, desfield);
1070 continue;
1071 }
1072 else if(lineType == GCGTYPE_KEY && keyfield)
1073 {
1074 embBtreeParseField(dbxgcgTmpline, dbxgcg_genbank_phrexp, keyfield);
1075 continue;
1076 }
1077 else if(lineType == GCGTYPE_TAX && orgfield)
1078 {
1079 embBtreeParseField(dbxgcgTmpline, dbxgcg_genbank_taxexp, orgfield);
1080 continue;
1081 }
1082 else if(lineType == GCGTYPE_VER && svfield)
1083 {
1084 embBtreeParseFieldThird(dbxgcgTmpline, dbxgcg_genbank_verexp,
1085 svfield);
1086 continue;
1087 }
1088
1089 }
1090
1091 if(rpos)
1092 ajFileSeek(infr, rpos, 0);
1093
1094 return ajFalse;
1095 }
1096
1097
1098
1099
1100 /* @funcstatic dbxgcg_ParsePir ************************************************
1101 **
1102 ** Parse the ID, accession from a PIR entry
1103 **
1104 ** @param [u] infr [AjPFile] reference file
1105 ** @param [w] id [AjPStr*] ID
1106 ** @return [AjBool] ajTrue on success.
1107 ** @@
1108 ******************************************************************************/
1109
1110
dbxgcg_ParsePir(AjPFile infr,AjPStr * id)1111 static AjBool dbxgcg_ParsePir(AjPFile infr,
1112 AjPStr *id)
1113 {
1114 ajlong rpos;
1115
1116 if(!dbxgcg_pir_wrdexp)
1117 dbxgcg_pir_wrdexp = ajRegCompC("([A-Za-z0-9_]+)");
1118
1119 if(!dbxgcg_pir_idexp)
1120 dbxgcg_pir_idexp = ajRegCompC("^>..;([^;.\n\r]+)");
1121
1122 if(!dbxgcg_pir_phrexp) /* allow . for "sp." */
1123 dbxgcg_pir_phrexp = ajRegCompC(" *([^,;\n\r]+)");
1124
1125 if(!dbxgcg_pir_tax2exp) /* allow . for "sp." */
1126 dbxgcg_pir_tax2exp = ajRegCompC(" *([^,;\n\r()]+)");
1127
1128 if(!dbxgcg_pir_acexp)
1129 dbxgcg_pir_acexp = ajRegCompC("^C;Accession:");
1130
1131 if(!dbxgcg_pir_ac2exp)
1132 dbxgcg_pir_ac2exp = ajRegCompC("([A-Za-z0-9]+)");
1133
1134 if(!dbxgcg_pir_taxexp)
1135 dbxgcg_pir_taxexp = ajRegCompC("^C;Species:");
1136
1137 if(!dbxgcg_pir_keyexp)
1138 dbxgcg_pir_keyexp = ajRegCompC("^C;Keywords:");
1139
1140 rpos = ajFileResetPos(infr);
1141
1142 ajDebug("++id '%S'\n", *id);
1143
1144
1145 ajReadline(infr, &dbxgcgRefline);
1146 ajDebug("line-2 '%S'\n", dbxgcgRefline);
1147
1148 if(desfield)
1149 {
1150 embBtreeParseField(dbxgcgRefline, dbxgcg_pir_wrdexp, desfield);
1151 }
1152
1153 while(ajStrGetCharFirst(dbxgcgRefline)!='>')
1154 {
1155 rpos = ajFileResetPos(infr);
1156 ajStrAssignS(&dbxgcgTmpstr,dbxgcgRefline);
1157
1158 if(accfield)
1159 {
1160 if(ajRegExec(dbxgcg_pir_acexp, dbxgcgRefline))
1161 {
1162 ajRegPost(dbxgcg_pir_acexp, &dbxgcgTmpline);
1163 embBtreeParseField(dbxgcgTmpline, dbxgcg_pir_ac2exp, accfield);
1164 }
1165 }
1166
1167 if(keyfield)
1168 {
1169 if(ajRegExec(dbxgcg_pir_keyexp, dbxgcgRefline))
1170 {
1171 ajRegPost(dbxgcg_pir_keyexp, &dbxgcgTmpline);
1172 embBtreeParseFieldTrim(dbxgcgTmpline, dbxgcg_pir_phrexp,
1173 keyfield);
1174 }
1175 }
1176
1177 if(orgfield)
1178 {
1179 if(ajRegExec(dbxgcg_pir_taxexp, dbxgcgRefline))
1180 {
1181 ajRegPost(dbxgcg_pir_taxexp, &dbxgcgTmpline);
1182 embBtreeParseFieldTrim(dbxgcgTmpline, dbxgcg_pir_tax2exp,
1183 orgfield);
1184 }
1185 }
1186
1187 if(!ajReadline(infr, &dbxgcgRefline))
1188 {
1189 rpos = 0;
1190 break;
1191 }
1192 }
1193
1194 if(rpos)
1195 ajFileSeek(infr, rpos, 0);
1196
1197 return ajFalse;
1198 }
1199