1 /* @source embdbi *************************************************************
2 **
3 ** General routines for database indexing.
4 **
5 ** @author Copyright (c) 2000 Peter Rice
6 ** @version $Revision: 1.63 $
7 ** @modified $Date: 2012/07/14 14:52:40 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA 02110-1301, USA.
24 **
25 ******************************************************************************/
26
27 #include "ajlib.h"
28
29 #include "embdbi.h"
30 #include "ajfile.h"
31 #include "ajlist.h"
32 #include "ajutil.h"
33 #include "ajtime.h"
34 #include "ajreg.h"
35 #include "ajsys.h"
36 #include "ajfileio.h"
37
38 #include <math.h>
39 #ifndef WIN32
40 #include <dirent.h>
41 #include <sys/types.h>
42 #include <sys/wait.h>
43 #else
44 #include "win32.h"
45 #include "dirent_w32.h"
46 #endif
47
48 #include <errno.h>
49
50
51
52 static AjPStr dbiCmdStr = NULL;
53 static AjPStr dbiCmdStr2 = NULL;
54 static AjPStr dbiDirFix = NULL;
55 static AjPStr dbiWildFname = NULL;
56 static AjPStr dbiInFname = NULL;
57 static AjPStr dbiOutFname = NULL;
58 static AjPStr dbiOutRecord = NULL;
59 static AjPStr dbiSortExt = NULL;
60 static AjPStr dbiLastId = NULL;
61 static AjPStr dbiFieldId = NULL;
62 static AjPStr dbiIdStr = NULL;
63 static AjPStr dbiTmpStr = NULL;
64 static AjPStr dbiRdLine = NULL;
65 static AjPStr dbiIdLine = NULL;
66 static AjPStr dbiFieldSort = NULL;
67 static AjPStr dbiFieldSort2 = NULL;
68 static AjPStr dbiFieldStr = NULL;
69 static AjPStr dbiFieldName = NULL;
70 static AjPStr dbiFieldId2 = NULL;
71 static AjPStr dbiCurrentId = NULL;
72
73 static AjPRegexp dbiRegFieldIdSort = NULL;
74 static AjPRegexp dbiRegFieldTokSort = NULL;
75 static AjPRegexp dbiRegFieldTokIdSort = NULL;
76 static AjPRegexp dbiRegEntryIdSort = NULL;
77 static AjPRegexp dbiRegDate = NULL;
78
79
80
81
/* @datastatic DbiOField ******************************************************
**
** Database index field names and index filenames
**
** @attr name [const char*] Field name as used in USAs
** @attr index [const char*] Index filename for EMBLCD indices
** @attr desc [const char*] Field description
******************************************************************************/

typedef struct DbiSField
{
    const char* name;
    const char* index;
    const char* desc;
} DbiOField;


/* Known index fields. The table ends with an all-NULL sentinel row. */
static DbiOField fieldDef[] =
{
  /* Name   Index      Description */
    {"acc", "acnum",   "accession number"},
    {"sv",  "seqvn",   "sequence version and GI number"},
    {"des", "des",     "entry description"},
    {"org", "taxon",   "taxonomy and organism"},
    {"key", "keyword", "keywords"},
    {NULL,  NULL,      NULL}
};
109
110 static const char* dbiFieldFile(const AjPStr fieldname);
111
112
113
114
115 /* @func embDbiFieldNew *******************************************************
116 **
117 ** Constructor for field token structures.
118 **
119 ** @return [EmbPField] Field token structure.
120 **
121 ** @release 2.4.0
122 ******************************************************************************/
123
embDbiFieldNew(void)124 EmbPField embDbiFieldNew(void)
125 {
126 EmbPField ret;
127 AJNEW0(ret);
128
129 return ret;
130 }
131
132
133
134
135 /* @func embDbiFieldDel *******************************************************
136 **
137 ** Destructor for field token structures.
138 **
139 ** @param [d] pthys [EmbPField*] Field token structure.
140 ** @return [void]
141 **
142 ** @release 4.1.0
143 ******************************************************************************/
144
embDbiFieldDel(EmbPField * pthys)145 void embDbiFieldDel(EmbPField* pthys)
146 {
147 EmbPField thys;
148
149 if(!pthys || !*pthys)
150 return;
151
152 thys = *pthys;
153
154 AJFREE(thys->field);
155 AJFREE(thys->entry);
156 AJFREE(*pthys);
157
158 return;
159 }
160
161
162
163
/* @func embDbiFieldDelMap ****************************************************
**
** Destructor for field token structures to be mapped to lists or tables.
**
** Only the structure itself is released. A NULL argument, or a pointer
** to NULL, is ignored.
**
** NOTE(review): the field and entry members are not freed here (those
** frees were disabled in the original code); presumably they are owned
** elsewhere - confirm before re-enabling.
**
** @param [d] pthys [void**] Field token structure.
** @param [u] cl [void*] Unused
** @return [void]
**
** @release 4.1.0
******************************************************************************/

void embDbiFieldDelMap(void** pthys, void* cl)
{
    (void) cl;				/* make it used */

    /* check pthys itself before looking at what it points to */
    if(!pthys || !*pthys)
        return;

    AJFREE(*pthys);

    return;
}
192
193
194
195
196 /* @func embDbiCmpId **********************************************************
197 **
198 ** Comparison function for two entries.
199 **
200 ** @param [r] a [const void*] First id (EmbPEntry*)
201 ** @param [r] b [const void*] Second id (EmbPEntry*)
202 ** @return [ajint] Comparison value, -1, 0 or +1.
203 **
204 ** @release 1.13.0
205 ** @@
206 ******************************************************************************/
207
embDbiCmpId(const void * a,const void * b)208 ajint embDbiCmpId(const void* a, const void* b)
209 {
210 const EmbPEntry aa;
211 const EmbPEntry bb;
212
213 aa = *(EmbPEntry const *) a;
214 bb = *(EmbPEntry const *) b;
215
216 return strcmp(aa->entry, bb->entry);
217 }
218
219
220
221
222 /* @func embDbiCmpFieldId *****************************************************
223 **
224 ** Comparison function for the entrynames in two field structures.
225 **
226 ** @param [r] a [const void*] First id (EmbPField*)
227 ** @param [r] b [const void*] Second id (EmbPField*)
228 ** @return [ajint] Comparison value, -1, 0 or +1.
229 **
230 ** @release 2.4.0
231 ** @@
232 ******************************************************************************/
233
embDbiCmpFieldId(const void * a,const void * b)234 ajint embDbiCmpFieldId(const void* a, const void* b)
235 {
236 const EmbPField aa;
237 const EmbPField bb;
238
239 aa = *(EmbPField const *) a;
240 bb = *(EmbPField const *) b;
241
242 return strcmp(aa->entry, bb->entry);
243 }
244
245
246
247
248 /* @func embDbiCmpFieldField **************************************************
249 **
250 ** Comparison function for two field token values
251 **
252 ** @param [r] a [const void*] First id (EmbPField*)
253 ** @param [r] b [const void*] Second id (EmbPField*)
254 ** @return [ajint] Comparison value, -1, 0 or +1.
255 **
256 ** @release 2.4.0
257 ** @@
258 ******************************************************************************/
259
embDbiCmpFieldField(const void * a,const void * b)260 ajint embDbiCmpFieldField(const void* a, const void* b)
261 {
262 ajint ret;
263
264 const EmbPField aa;
265 const EmbPField bb;
266
267 aa = *(EmbPField const *) a;
268 bb = *(EmbPField const *) b;
269
270 ret = strcmp(aa->field, bb->field);
271
272 if(ret)
273 return ret;
274
275 return strcmp(aa->entry, bb->entry);
276 }
277
278
279
280
281 /* @func embDbiEntryNew *******************************************************
282 **
283 ** Constructor for entry structures.
284 **
285 ** @param [r] nfields [ajuint] Number of data fields to be included
286 ** @return [EmbPEntry] Entry structure.
287 **
288 ** @release 1.13.0
289 ******************************************************************************/
290
embDbiEntryNew(ajuint nfields)291 EmbPEntry embDbiEntryNew(ajuint nfields)
292 {
293 EmbPEntry ret;
294
295 AJNEW0(ret);
296 ret->nfields = nfields;
297 AJCNEW0(ret->nfield, nfields);
298 AJCNEW0(ret->field, nfields);
299
300 return ret;
301 }
302
303
304
305
306 /* @func embDbiEntryDel *******************************************************
307 **
308 ** Destructor for entry structures.
309 **
310 ** @param [d] Pentry [EmbPEntry*] Entry structure
311 ** @return [void]
312 **
313 ** @release 4.0.0
314 ******************************************************************************/
315
embDbiEntryDel(EmbPEntry * Pentry)316 void embDbiEntryDel(EmbPEntry* Pentry)
317 {
318 EmbPEntry entry;
319 ajuint i;
320 ajuint j;
321
322 if(!*Pentry)
323 return;
324
325 entry = *Pentry;
326
327 for(i=0;i<entry->nfields;i++)
328 {
329 for(j=0;j<entry->nfield[i];j++)
330 {
331 AJFREE(entry->field[i][j]);
332 }
333
334 AJFREE(entry->field[i]);
335 }
336
337 AJFREE(entry->nfield);
338 AJFREE(entry->field);
339 AJFREE(entry->entry);
340 AJFREE(*Pentry);
341
342 return;
343 }
344
345
346
347
348 /* @func embDbiEntryDelMap ****************************************************
349 **
350 ** Destructor for entry structures to be mapped to lists or tables.
351 **
352 ** @param [d] pthys [void**] Field token structure.
353 ** @param [u] cl [void*] Unused
354 ** @return [void]
355 **
356 ** @release 4.1.0
357 ******************************************************************************/
358
embDbiEntryDelMap(void ** pthys,void * cl)359 void embDbiEntryDelMap(void** pthys, void* cl)
360 {
361 EmbPEntry entry;
362 ajuint i;
363 ajuint j;
364
365 if(!pthys || !*pthys)
366 return;
367
368 (void) cl; /* make it used */
369
370 entry = (*(EmbPEntry*)pthys);
371
372 for(i=0;i<entry->nfields;i++)
373 {
374 for(j=0;j<entry->nfield[i];j++)
375 {
376 AJFREE(entry->field[i][j]);
377 }
378
379 AJFREE(entry->field[i]);
380 }
381
382 AJFREE(entry->nfield);
383 AJFREE(entry->field);
384 AJFREE(entry->entry);
385 AJFREE(*pthys);
386
387 return;
388 }
389
390
391
392
393 /* @func embDbiFileList *******************************************************
394 **
395 ** Makes a list of all files in a directory matching a wildcard file name.
396 **
397 ** @param [r] dir [const AjPStr] Directory
398 ** @param [r] wildfile [const AjPStr] Wildcard file name
399 ** @param [r] trim [AjBool] Expand to search, trim results
400 ** @return [AjPList] New list of all files with full paths
401 **
402 ** @release 1.13.0
403 ** @@
404 ******************************************************************************/
405
embDbiFileList(const AjPStr dir,const AjPStr wildfile,AjBool trim)406 AjPList embDbiFileList(const AjPStr dir, const AjPStr wildfile, AjBool trim)
407 {
408 AjPList retlist = NULL;
409
410 DIR* dp;
411 struct dirent* de;
412 ajuint dirsize;
413
414 AjPStr name = NULL;
415 AjPStr tmp;
416 AjPStr s;
417 AjPStr s2;
418 AjPStr t;
419
420 char *p;
421 char *q;
422 AjPList l;
423 ajuint ll;
424 ajuint i;
425 AjBool d;
426
427 ajDebug("embDbiFileList dir '%S' wildfile '%S' maxsize %Ld\n",
428 dir, wildfile, (ajlong) INT_MAX);
429
430 ajStrAssignS(&dbiWildFname,wildfile);
431
432 tmp = ajStrNewS(dbiWildFname);
433
434 if(ajStrGetLen(dir))
435 ajStrAssignS(&dbiDirFix, dir);
436 else
437 ajStrAssignC(&dbiDirFix, CURRENT_DIR);
438
439 if(ajStrGetCharLast(dbiDirFix) != SLASH_CHAR)
440 ajStrAppendC(&dbiDirFix, SLASH_STRING);
441
442 if(trim)
443 ajStrAppendC(&dbiWildFname,"*");
444
445 dp = opendir(ajStrGetPtr(dbiDirFix));
446
447 if(!dp)
448 ajFatal("opendir failed on '%S'", dbiDirFix);
449
450 s = ajStrNew();
451 l = ajListNew();
452 dirsize = 0;
453 retlist = ajListstrNew();
454
455 while((de = readdir(dp)))
456 {
457 #ifndef __CYGWIN__
458 if(!de->d_ino)
459 continue; /* skip deleted files with inode zero */
460 #endif
461
462 if(ajCharMatchC(de->d_name, "."))
463 continue;
464
465 if(ajCharMatchC(de->d_name, ".."))
466 continue;
467
468 if(!ajCharMatchWildS(de->d_name, dbiWildFname))
469 continue;
470
471 ajStrAssignC(&s,de->d_name);
472 p = q =ajStrGetuniquePtr(&s);
473
474 if(trim)
475 {
476 p=strrchr(p,(int)'.');
477
478 if(p)
479 *p='\0';
480 }
481
482 s2 = ajStrNewC(q);
483 ll = (ajuint) ajListGetLength(l);
484 d = ajFalse;
485
486 for(i=0;i<ll;++i)
487 {
488 ajListPop(l,(void *)&t);
489
490 if(ajStrMatchS(t,s2))
491 d=ajTrue;
492
493 ajListPushAppend(l,(void *)t);
494 }
495
496 if(!d)
497 ajListPush(l,(void *)s2);
498 else
499 {
500 ajStrDel(&s2);
501 continue;
502 }
503
504 dirsize++;
505 name = NULL;
506 ajFmtPrintS(&name, "%S%S", dbiDirFix, s2);
507
508 if(ajFilenameGetSize(name) > (ajlong) INT_MAX)
509 ajDie("File '%S' too large for DBI indexing", name);
510
511 ajDebug("accept '%S' (%Ld)\n", s2, ajFilenameGetSize(name));
512 ajListstrPushAppend(retlist, name);
513 }
514
515 if(!ajListGetLength(retlist))
516 ajFatal("No match for file specification %S",tmp);
517
518 while(ajListPop(l,(void *)&t))
519 ajStrDel(&t);
520
521 ajListFree(&l);
522
523 ajStrDel(&s);
524 ajStrDel(&tmp);
525
526 closedir(dp);
527 ajDebug("%u files for '%S' '%S'\n", dirsize, dir, dbiWildFname);
528
529 return retlist;
530 }
531
532
533
534
535 /* @func embDbiFileListExc ****************************************************
536 **
537 ** Makes a list of all files in a directory matching a wildcard file name.
538 **
539 ** @param [r] dir [const AjPStr] Directory
540 ** @param [r] wildfile [const AjPStr] Wildcard file list
541 ** @param [r] exclude [const AjPStr] Wildcard file list
542 ** (NULL if none to exclude)
543 ** @return [AjPList] New list of all files with full paths
544 **
545 ** @release 1.13.2
546 ** @@
547 ******************************************************************************/
548
embDbiFileListExc(const AjPStr dir,const AjPStr wildfile,const AjPStr exclude)549 AjPList embDbiFileListExc(const AjPStr dir, const AjPStr wildfile,
550 const AjPStr exclude)
551 {
552 AjPList retlist = NULL;
553
554 DIR* dp;
555 struct dirent* de;
556 ajuint dirsize;
557 AjPStr name = NULL;
558
559 ajDebug("embDbiFileListExc dir '%S' wildfile '%S' exclude '%S' "
560 "maxsize %Ld\n",
561 dir, wildfile, exclude, (ajlong) INT_MAX);
562
563 if(ajStrGetLen(dir))
564 ajStrAssignS(&dbiDirFix, dir);
565 else
566 ajStrAssignC(&dbiDirFix, CURRENT_DIR);
567
568 if(ajStrGetCharLast(dbiDirFix) != SLASH_CHAR)
569 ajStrAppendC(&dbiDirFix, SLASH_STRING);
570
571 ajDebug("dirfix '%S'\n", dbiDirFix);
572
573 dp = opendir(ajStrGetPtr(dbiDirFix));
574
575 if(!dp)
576 ajFatal("opendir failed on '%S'", dbiDirFix);
577
578 dirsize = 0;
579 retlist = ajListstrNew();
580
581 while((de = readdir(dp)))
582 {
583 /* skip deleted files with inode zero */
584 #ifndef __CYGWIN__
585 if(!de->d_ino)
586 continue;
587 #endif
588
589 if(ajCharMatchC(de->d_name, "."))
590 continue;
591
592 if(ajCharMatchC(de->d_name, ".."))
593 continue;
594
595 ajStrAssignC(&dbiInFname, de->d_name);
596
597 if(exclude && !ajFilenameTestExclude(dbiInFname, exclude, wildfile))
598 continue;
599
600 dirsize++;
601 name = NULL;
602 ajFmtPrintS(&name, "%S%S", dbiDirFix, dbiInFname);
603
604 if(ajFilenameGetSize(name) > (ajlong) INT_MAX)
605 ajDie("File '%S' too large for DBI indexing", name);
606
607 ajDebug("accept '%S' (%Ld)\n", dbiInFname, ajFilenameGetSize(name));
608 ajListstrPushAppend(retlist, name);
609 }
610
611 closedir(dp);
612 ajDebug("%u files for '%S' '%S'\n", dirsize, dir, wildfile);
613
614 return retlist;
615 }
616
617
618
619
620 /* @func embDbiFlatOpenlib ****************************************************
621 **
622 ** Open a flat file library
623 **
624 ** @param [r] lname [const AjPStr] Source file basename
625 ** @param [u] libr [AjPFile*] Database file
626 ** @return [AjBool] ajTrue on success
627 **
628 ** @release 2.3.0
629 ** @@
630 ******************************************************************************/
631
embDbiFlatOpenlib(const AjPStr lname,AjPFile * libr)632 AjBool embDbiFlatOpenlib(const AjPStr lname, AjPFile* libr)
633 {
634 ajFileClose(libr);
635
636 *libr = ajFileNewInNameS(lname);
637
638 if(!*libr)
639 ajFatal("Cannot open %S for reading",lname);
640
641 if(!*libr)
642 {
643 ajErr(" cannot open library flat file: %S\n",
644 lname);
645
646 return ajFalse;
647 }
648
649 return ajTrue;
650 }
651
652
653
654
655 /* @func embDbiRmFile *********************************************************
656 **
657 ** Remove a file or a set of numbered files
658 **
659 ** @param [r] dbname [const AjPStr] Database name
660 ** @param [r] ext [const char*] Base file extension
661 ** @param [r] nfiles [ajuint] Number of files, or zero for unnumbered.
662 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
663 ** @return [void]
664 **
665 ** @release 1.13.0
666 ** @@
667 ******************************************************************************/
668
embDbiRmFile(const AjPStr dbname,const char * ext,ajuint nfiles,AjBool cleanup)669 void embDbiRmFile(const AjPStr dbname, const char* ext, ajuint nfiles,
670 AjBool cleanup)
671 {
672 #ifndef WIN32
673 ajuint i;
674
675 if(!cleanup)
676 return;
677
678 if(nfiles)
679 {
680 for(i=1; i<= nfiles; i++)
681 {
682 ajFmtPrintS(&dbiCmdStr, "%S%03d.%s", dbname, i, ext);
683 ajSysCommandRemoveS(dbiCmdStr);
684 }
685 }
686 else
687 {
688 ajFmtPrintS(&dbiCmdStr, "%S.%s", dbname, ext);
689 ajSysCommandRemoveS(dbiCmdStr);
690 }
691
692 return;
693
694 #else /* WIN32 */
695 static AjPStr filestr = NULL;
696 ajuint i;
697
698 if (!cleanup)
699 return;
700
701 if (nfiles)
702 {
703 for (i=1; i<= nfiles; i++)
704 {
705 ajFmtPrintS (&filestr, "%S%03d.%s", dbname, i, ext);
706 DeleteFile(ajStrGetPtr(filestr));
707 ajDebug("Deleting file %S\n", filestr);
708 }
709 }
710 else
711 {
712 ajFmtPrintS (&filestr, "%S.%s", dbname, ext);
713 DeleteFile(ajStrGetPtr(filestr));
714 ajDebug("Deleting file %S\n", filestr);
715 }
716
717 return;
718 #endif /* WIN32 */
719 }
720
721
722
723
724 /* @func embDbiRmFileI ********************************************************
725 **
726 ** Remove a numbered file
727 **
728 ** @param [r] dbname [const AjPStr] Database name
729 ** @param [r] ext [const char*] Base file extension
730 ** @param [r] ifile [ajuint] File number.
731 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
732 ** @return [void]
733 **
734 ** @release 1.13.0
735 ******************************************************************************/
736
embDbiRmFileI(const AjPStr dbname,const char * ext,ajuint ifile,AjBool cleanup)737 void embDbiRmFileI(const AjPStr dbname, const char* ext, ajuint ifile,
738 AjBool cleanup)
739 {
740 #ifndef WIN32
741 if(!cleanup)
742 return;
743
744 ajFmtPrintS(&dbiCmdStr, "%S%03d.%s", dbname, ifile, ext);
745
746 ajSysCommandRemoveS(dbiCmdStr);
747 #else
748 static AjPStr filestr = NULL;
749
750 if(!cleanup)
751 return;
752
753 ajFmtPrintS (&filestr, "%S%03d.%s", dbname, ifile, ext);
754 DeleteFile(ajStrGetPtr(filestr));
755 ajDebug("Deleting file %S\n", filestr);
756 #endif /* WIN32 */
757
758 return;
759 }
760
761
762
763
764 /* @func embDbiRmEntryFile ****************************************************
765 **
766 ** Remove the sorted entryname file (kept until end of processing
767 ** as it is the sorted list of all entries, used to count entries for
768 ** field indexing.
769 **
770 ** @param [r] dbname [const AjPStr] Database name
771 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
772 ** @return [void]
773 **
774 ** @release 2.4.0
775 ** @@
776 ******************************************************************************/
777
embDbiRmEntryFile(const AjPStr dbname,AjBool cleanup)778 void embDbiRmEntryFile(const AjPStr dbname, AjBool cleanup)
779 {
780 embDbiRmFile(dbname, "idsrt", 0, cleanup);
781
782 return;
783 }
784
785
786
787
788 /* @func embDbiSortFile *******************************************************
789 **
790 ** Sort a file, or a set of numbered files, individually
791 **
792 ** @param [r] dbname [const AjPStr] Database name
793 ** @param [r] ext1 [const char*] Input file extension
794 ** @param [r] ext2 [const char*] Output file extension
795 ** @param [r] nfiles [ajuint] Number of files to sort (zero if unnumbered)
796 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
797 ** @param [r] sortopt [const AjPStr] Extra options for the system sort
798 ** @return [void]
799 **
800 ** @release 1.13.0
801 ** @@
802 ******************************************************************************/
803
embDbiSortFile(const AjPStr dbname,const char * ext1,const char * ext2,ajuint nfiles,AjBool cleanup,const AjPStr sortopt)804 void embDbiSortFile(const AjPStr dbname, const char* ext1, const char* ext2,
805 ajuint nfiles, AjBool cleanup, const AjPStr sortopt)
806 {
807 ajuint i;
808 AjPStr dir = NULL;
809 ajuint j;
810 ajuint isplit;
811 ajuint nsplit;
812 double td;
813
814 #ifndef WIN32
815 static const char *prog = "sort";
816
817 dir = ajStrNewC(prog);
818 ajSysFileWhich(&dir);
819
820 #else
821 static const char *prog = "sort.exe";
822
823 char* sortProgDir = getenv("EMBOSS_ROOT");
824
825 if(sortProgDir == NULL)
826 {
827 AjPStr msg = ajStrNewC("EMBOSS_ROOT");
828 ajStrAppendC(&msg, " environment variable not defined");
829 ajFatal(ajStrGetPtr(msg));
830 }
831
832 dir = ajStrNewC(sortProgDir);
833 ajStrAppendC(&dir,SLASH_STRING);
834 ajStrAppendC(&dir,prog);
835
836 if(!ajFilenameExistsExec(dir))
837 {
838 ajFmtPrintS(&dir, "%s\\apps\\release\\%s", sortProgDir, prog);
839 }
840
841 if(!ajFilenameExistsExec(dir))
842 {
843 ajFatal("'%s' not found in EMBOSS_ROOT or apps\\release", prog);
844 }
845
846 #endif
847
848
849 if(nfiles)
850 {
851 for(i=1; i<=nfiles; i++)
852 {
853 ajFmtPrintS(&dbiInFname, "%S%03d.%s", dbname, i, ext1);
854 ajFmtPrintS(&dbiOutFname, "%S%03d.%s.srt", dbname, i, ext1);
855
856 if(sortopt)
857 ajFmtPrintS(&dbiCmdStr, "%S -o %S %S %S",
858 dir,dbiOutFname,sortopt,dbiInFname);
859 else
860 ajFmtPrintS(&dbiCmdStr, "%S -o %S %S",
861 dir,dbiOutFname,dbiInFname);
862
863 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
864 embDbiRmFileI(dbname, ext1, i, cleanup);
865 }
866
867 td = sqrt(nfiles);
868 nsplit = (ajuint) td;
869
870 ajDebug("embDbiSortFile nfiles:%d split:%d\n", nfiles, nsplit);
871
872 /* file merge in groups if more than 24 files ... avoids huge merges */
873
874 if(nsplit < 2) /* up to 3 source files */
875 {
876 ajFmtPrintS(&dbiCmdStr, "%S -m -o %S.%s %S",
877 dir,dbname,ext2,sortopt);
878
879 for(i=1; i<=nfiles; i++)
880 ajFmtPrintAppS(&dbiCmdStr, " %S%03d.%s.srt", dbname, i, ext1);
881
882 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
883 ajFmtPrintS(&dbiSortExt, "%s.srt", ext1);
884
885 for(i=1; i<=nfiles; i++)
886 embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), i, cleanup);
887
888 }
889 else
890 {
891 ajFmtPrintS(&dbiCmdStr2, "%S -m -o %S.%s %S",
892 dir,dbname,ext2,sortopt);
893 isplit = 0;
894
895 for(i=1; i<=nfiles; i+=nsplit)
896 {
897 isplit++;
898 ajFmtPrintAppS(&dbiCmdStr2, " %S%03d.%s.mrg1",
899 dbname, isplit, ext2);
900
901 /* Now we make that .mrg1 file */
902
903 ajFmtPrintS(&dbiCmdStr, "%S -m -o %S%03d.%s.mrg1 %S",
904 dir,dbname,isplit,ext2,sortopt);
905
906 for(j=0; j<nsplit; j++)
907 if((i+j) <= nfiles)
908 ajFmtPrintAppS(&dbiCmdStr, " %S%03d.%s.srt",
909 dbname, i+j, ext1);
910
911 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr),"C");
912 ajFmtPrintS(&dbiSortExt, "%s.srt", ext1);
913
914 for(j=0; j<nsplit; j++)
915 if((i+j) <= nfiles)
916 embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), (i+j),
917 cleanup);
918 }
919
920 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr2), "C");
921 ajFmtPrintS(&dbiSortExt, "%s.mrg1", ext2);
922
923 for(j=1; j<=isplit; j++)
924 embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), j, cleanup);
925 }
926 }
927 else
928 {
929 ajFmtPrintS(&dbiInFname, "%S.%s", dbname, ext1);
930 ajFmtPrintS(&dbiOutFname, "%S.%s", dbname, ext2);
931 ajFmtPrintS(&dbiCmdStr, "%S -o %S %S %S",
932 dir,dbiOutFname,sortopt,dbiInFname);
933
934 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
935 embDbiRmFile(dbname, ext1, 0, cleanup);
936 }
937
938 ajStrDel(&dir);
939
940 return;
941 }
942
943
944
945
946 /* @func embDbiHeaderSize *****************************************************
947 **
948 ** Updates the file header for an index file to include the correct file size.
949 **
950 ** @param [u] file [AjPFile] Output file
951 ** @param [r] filesize [ajuint] File size (if known, can be rewritten)
952 ** @param [r] recordcnt [ajuint] Number of records
953 ** @return [void]
954 **
955 ** @release 2.4.0
956 ******************************************************************************/
957
embDbiHeaderSize(AjPFile file,ajuint filesize,ajuint recordcnt)958 void embDbiHeaderSize(AjPFile file, ajuint filesize, ajuint recordcnt)
959 {
960 ajFileSeek(file, 0, 0);
961
962 ajWritebinInt4(file, (ajint) filesize); /* filesize */
963 ajWritebinInt4(file, (ajint) recordcnt); /* #records */
964
965 return;
966 }
967
968
969
970
971 /* @func embDbiHeader *********************************************************
972 **
973 ** Writes the header for an index file. Resets the file pointer to beginning
974 ** of file, and leaves the file pointer at the start of the first record.
975 **
976 ** @param [u] file [AjPFile] Output file
977 ** @param [r] filesize [ajuint] File size (if known, can be rewritten)
978 ** @param [r] recordcnt [ajuint] Number of records
979 ** @param [r] recordlen [short] Record length (bytes)
980 ** @param [r] dbname [const AjPStr] Database name (up to 20 characters used)
981 ** @param [r] release [const AjPStr] Release as a string (up to 10
982 ** characters used)
983 ** @param [r] date [const char[4]] Date dd,mm,yy,00
984 ** @return [void]
985 **
986 ** @release 2.4.0
987 ******************************************************************************/
988
embDbiHeader(AjPFile file,ajuint filesize,ajuint recordcnt,short recordlen,const AjPStr dbname,const AjPStr release,const char date[4])989 void embDbiHeader(AjPFile file, ajuint filesize, ajuint recordcnt,
990 short recordlen, const AjPStr dbname, const AjPStr release,
991 const char date[4])
992 {
993 ajuint i;
994 static char padding[256];
995 static AjBool firstcall = AJTRUE;
996
997 if(firstcall)
998 {
999 for(i=0;i<256;i++)
1000 padding[i] = ' ';
1001
1002 firstcall = ajFalse;
1003 }
1004
1005 ajFileSeek(file, 0, 0);
1006
1007 ajWritebinInt4(file, (ajint) filesize); /* filesize */
1008
1009 ajWritebinInt4(file, (ajint) recordcnt); /* #records */
1010
1011 ajWritebinInt2(file, (ajint) recordlen); /* recordsize */
1012
1013 /* rest of the header */
1014 ajWritebinStr(file, dbname, 20); /* dbname */
1015 ajWritebinStr(file, release, 10); /* release */
1016 ajWritebinByte(file, date[0]); /* release date */
1017 ajWritebinByte(file, date[1]); /* release date */
1018 ajWritebinByte(file, date[2]); /* release date */
1019 ajWritebinByte(file, date[3]); /* release date */
1020 ajWritebinBinary(file, 1, 256, padding); /* padding 256 bytes */
1021
1022 return;
1023 }
1024
1025
1026
1027
1028 /* @func embDbiFileSingle *****************************************************
1029 **
1030 ** Builds a filename for a single temporary file to save IDs or some other
1031 ** index field, for example EMBL01.list
1032 **
1033 ** @param [r] dbname [const AjPStr] Database name
1034 ** @param [r] extension [const char*] Filename extension.
1035 ** @param [r] num [ajuint] Number for this file (start at 1)
1036 ** @return [AjPFile] Opened output file
1037 **
1038 **
1039 ** @release 2.4.0
1040 ******************************************************************************/
1041
embDbiFileSingle(const AjPStr dbname,const char * extension,ajuint num)1042 AjPFile embDbiFileSingle(const AjPStr dbname, const char* extension, ajuint num)
1043 {
1044 AjPFile ret;
1045
1046 ajFmtPrintS(&dbiOutFname, "%S%03d.%s", dbname, num, extension);
1047 ret = ajFileNewOutNameS(dbiOutFname);
1048
1049 if(!ret)
1050 ajFatal("Cannot open %S for writing", dbiOutFname);
1051
1052 return ret;
1053 }
1054
1055
1056
1057
1058 /* @func embDbiFileIn *********************************************************
1059 **
1060 ** Builds a filename for a summary file to read IDs or some other
1061 ** index field, for example EMBL.acnum_sort
1062 **
1063 ** @param [r] dbname [const AjPStr] Database name
1064 ** @param [r] extension [const char*] Filename extension.
1065 ** @return [AjPFile] Opened output file
1066 **
1067 **
1068 ** @release 2.4.0
1069 ******************************************************************************/
1070
embDbiFileIn(const AjPStr dbname,const char * extension)1071 AjPFile embDbiFileIn(const AjPStr dbname, const char* extension)
1072 {
1073 AjPFile ret;
1074
1075 ajFmtPrintS(&dbiInFname, "%S.%s", dbname, extension);
1076 ret = ajFileNewInNameS(dbiInFname);
1077
1078 if(!ret)
1079 ajFatal("Cannot open %S for reading", dbiInFname);
1080
1081 return ret;
1082 }
1083
1084
1085
1086
1087 /* @func embDbiFileOut ********************************************************
1088 **
1089 ** Builds a filename for a summary file to save IDs or some other
1090 ** index field, for example EMBL.acnum_srt2
1091 **
1092 ** @param [r] dbname [const AjPStr] Database name
1093 ** @param [r] extension [const char*] Filename extension.
1094 ** @return [AjPFile] Opened output file
1095 **
1096 **
1097 ** @release 2.4.0
1098 ******************************************************************************/
1099
embDbiFileOut(const AjPStr dbname,const char * extension)1100 AjPFile embDbiFileOut(const AjPStr dbname, const char* extension)
1101 {
1102 AjPFile ret;
1103
1104 ajFmtPrintS(&dbiOutFname, "%S.%s", dbname, extension);
1105 ret = ajFileNewOutNameS(dbiOutFname);
1106
1107 if(!ret)
1108 ajFatal("Cannot open %S for writing", dbiOutFname);
1109
1110 return ret;
1111 }
1112
1113
1114
1115
1116 /* @func embDbiFileIndex ******************************************************
1117 **
1118 ** Builds a filename for a summary file to save IDs or some other
1119 ** index field, for example EMBL.acsrt2
1120 **
1121 ** @param [r] indexdir [const AjPStr] Index directory
1122 ** @param [r] field [const AjPStr] Field name
1123 ** @param [r] extension [const char*] Filename extension.
1124 ** @return [AjPFile] Opened output file
1125 **
1126 **
1127 ** @release 2.4.0
1128 ******************************************************************************/
1129
embDbiFileIndex(const AjPStr indexdir,const AjPStr field,const char * extension)1130 AjPFile embDbiFileIndex(const AjPStr indexdir, const AjPStr field,
1131 const char* extension)
1132 {
1133 AjPFile ret;
1134
1135 ajFmtPrintS(&dbiOutFname, "%S.%s", field, extension);
1136 ret = ajFileNewOutNamePathS(dbiOutFname, indexdir);
1137
1138 if(!ret)
1139 ajFatal("Cannot open %S for writing", dbiOutFname);
1140
1141 return ret;
1142 }
1143
1144
1145
1146
1147 /* @func embDbiWriteDivision **************************************************
1148 **
1149 ** Writes the division index file
1150 **
1151 ** @param [r] indexdir [const AjPStr] Index directory
1152 ** @param [r] dbname [const AjPStr] Database name
1153 ** @param [r] release [const AjPStr] Release number as a string
1154 ** @param [r] date [const char[4]] Date
1155 ** @param [r] maxfilelen [ajuint] Max file name length
1156 ** @param [r] nfiles [ajuint] Number of files indexes
1157 ** @param [r] divfiles [AjPStr const *] Division filenames
1158 ** @param [r] seqfiles [AjPStr const *] Sequence filenames (or NULL if none)
1159 ** @return [void]
1160 **
1161 ** @release 2.4.0
1162 ******************************************************************************/
1163
embDbiWriteDivision(const AjPStr indexdir,const AjPStr dbname,const AjPStr release,const char date[4],ajuint maxfilelen,ajuint nfiles,AjPStr const * divfiles,AjPStr const * seqfiles)1164 void embDbiWriteDivision(const AjPStr indexdir,
1165 const AjPStr dbname, const AjPStr release,
1166 const char date[4], ajuint maxfilelen, ajuint nfiles,
1167 AjPStr const * divfiles, AjPStr const * seqfiles)
1168 {
1169 AjPFile divFile;
1170 AjPStr tmpfname = NULL;
1171 ajuint i;
1172 ajuint filesize;
1173
1174 short recsize;
1175
1176 ajStrAssignC(&tmpfname, "division.lkp");
1177 divFile = ajFileNewOutNamePathS(tmpfname, indexdir);
1178
1179 filesize = 256 + 44 + (nfiles * (maxfilelen+2));
1180 recsize = maxfilelen + 2;
1181
1182 embDbiHeader(divFile, filesize, nfiles, recsize, dbname, release, date);
1183
1184 for(i=0; i<nfiles; i++)
1185 {
1186 if(seqfiles)
1187 embDbiWriteDivisionRecord(divFile, maxfilelen, (short)(i+1),
1188 divfiles[i], seqfiles[i]);
1189 else
1190 embDbiWriteDivisionRecord(divFile, maxfilelen, (short)(i+1),
1191 divfiles[i], NULL);
1192 }
1193
1194 ajFileClose(&divFile);
1195 ajStrDel(&tmpfname);
1196
1197 return;
1198 }
1199
1200
1201
1202
1203 /* @func embDbiWriteDivisionRecord ********************************************
1204 **
1205 ** Writes a record to the division lookup file
1206 **
1207 ** @param [u] file [AjPFile] Index file
1208 ** @param [r] maxnamlen [ajuint] Maximum file name length
1209 ** @param [r] recnum [short] Record number
1210 ** @param [r] datfile [const AjPStr] Data file name
1211 ** @param [r] seqfile [const AjPStr] Sequence file name (or NULL if none)
1212 ** @return [void]
1213 **
1214 ** @release 2.4.0
1215 ******************************************************************************/
1216
embDbiWriteDivisionRecord(AjPFile file,ajuint maxnamlen,short recnum,const AjPStr datfile,const AjPStr seqfile)1217 void embDbiWriteDivisionRecord(AjPFile file, ajuint maxnamlen, short recnum,
1218 const AjPStr datfile, const AjPStr seqfile)
1219 {
1220 ajWritebinInt2(file, recnum);
1221
1222 if(ajStrGetLen(seqfile))
1223 {
1224 ajFmtPrintS(&dbiOutRecord, "%S %S", datfile, seqfile);
1225 ajWritebinStr(file, dbiOutRecord, maxnamlen);
1226 }
1227 else
1228 ajWritebinStr(file, datfile, maxnamlen);
1229
1230 return;
1231 }
1232
1233
1234
1235
1236 /* @func embDbiWriteEntryRecord ***********************************************
1237 **
1238 ** Writes a record to the entryname index file
1239 **
1240 ** @param [u] file [AjPFile] hit file
1241 ** @param [r] maxidlen [ajuint] Maximum length for an id string
1242 ** @param [r] id [const AjPStr] The id string for this entry
1243 ** @param [r] rpos [ajuint] Data file offset
1244 ** @param [r] spos [ajuint] sequence file offset
1245 ** @param [r] filenum [ajushort] file number in division file
1246 ** @return [void]
1247 **
1248 ** @release 2.4.0
1249 ******************************************************************************/
1250
embDbiWriteEntryRecord(AjPFile file,ajuint maxidlen,const AjPStr id,ajuint rpos,ajuint spos,ajushort filenum)1251 void embDbiWriteEntryRecord(AjPFile file, ajuint maxidlen, const AjPStr id,
1252 ajuint rpos, ajuint spos, ajushort filenum)
1253 {
1254
1255 ajWritebinStr(file, id, maxidlen);
1256 ajWritebinInt4(file, rpos);
1257 ajWritebinInt4(file, spos);
1258 ajWritebinInt2(file, filenum);
1259
1260 return;
1261 }
1262
1263
1264
1265
1266 /* @func embDbiWriteHit *******************************************************
1267 **
1268 ** Writes a record to the field hit (.hit) index file
1269 **
1270 ** @param [u] file [AjPFile] hit file
1271 ** @param [r] idnum [ajuint] Entry number (1 for the first) in the
1272 ** entryname file
1273 ** @return [void]
1274 **
1275 ** @release 2.4.0
1276 ******************************************************************************/
1277
embDbiWriteHit(AjPFile file,ajuint idnum)1278 void embDbiWriteHit(AjPFile file, ajuint idnum)
1279 {
1280 ajWritebinInt4(file, (ajint) idnum);
1281
1282 return;
1283 }
1284
1285
1286
1287
1288 /* @func embDbiWriteTrg *******************************************************
1289 **
1290 ** Writes a record to the field target (.trg) index file
1291 **
1292 ** @param [u] file [AjPFile] hit file
1293 ** @param [r] maxfieldlen [ajuint] Maximum field token length
1294 ** @param [r] idnum [ajuint] First record number (1 for the first) in the
1295 ** field hit index file
1296 ** @param [r] idcnt [ajuint] Number of entries for this field value
1297 ** in the field hit index file
1298 ** @param [r] hitstr [const AjPStr] Field token string
1299 ** @return [void]
1300 **
1301 ** @release 2.4.0
1302 ******************************************************************************/
1303
embDbiWriteTrg(AjPFile file,ajuint maxfieldlen,ajuint idnum,ajuint idcnt,const AjPStr hitstr)1304 void embDbiWriteTrg(AjPFile file, ajuint maxfieldlen, ajuint idnum,
1305 ajuint idcnt, const AjPStr hitstr)
1306 {
1307 ajWritebinInt4(file, (ajint) idnum);
1308 ajWritebinInt4(file, (ajint) idcnt);
1309 ajWritebinStr(file, hitstr, maxfieldlen);
1310
1311 return;
1312 }
1313
1314
1315
1316
1317 /* @func embDbiSortOpen *******************************************************
1318 **
1319 ** Open sort files for entries and all fields
1320 **
1321 ** @param [w] alistfile [AjPFile*] Sort files for each field.
1322 ** @param [r] ifile [ajuint] Input file number (used for temporary file names)
1323 ** @param [r] dbname [const AjPStr] Database name
1324 ** (used for temporary file names)
1325 ** @param [r] fields [AjPStr const *] Field names (used for temporary
1326 ** file names)
1327 ** @param [r] nfields [ajuint] Number of fields
1328 ** @return [AjPFile] Sort file for entries
1329 **
1330 ** @release 2.4.0
1331 ******************************************************************************/
1332
embDbiSortOpen(AjPFile * alistfile,ajuint ifile,const AjPStr dbname,AjPStr const * fields,ajuint nfields)1333 AjPFile embDbiSortOpen(AjPFile* alistfile,
1334 ajuint ifile, const AjPStr dbname,
1335 AjPStr const * fields, ajuint nfields)
1336 {
1337 AjPFile elistfile;
1338 ajuint ifield;
1339
1340 elistfile = embDbiFileSingle(dbname, "list", ifile+1);
1341
1342 for(ifield=0;ifield < nfields; ifield++)
1343 alistfile[ifield] = embDbiFileSingle(dbname,
1344 dbiFieldFile(fields[ifield]),
1345 ifile+1);
1346
1347 return elistfile;
1348 }
1349
1350
1351
1352
1353 /* @funcstatic dbiFieldFile ***************************************************
1354 **
1355 ** Returns the index filename that relates to a USA field name
1356 **
1357 ** @param [r] fieldname [const AjPStr] Field name
1358 ** @return [const char*] Index filename for this field
1359 **
1360 ** @release 4.0.0
1361 ******************************************************************************/
1362
dbiFieldFile(const AjPStr fieldname)1363 static const char* dbiFieldFile(const AjPStr fieldname)
1364 {
1365 ajuint i = 0;
1366
1367 for(i=0;fieldDef[i].name;i++)
1368 if(ajStrMatchCaseC(fieldname, fieldDef[i].name))
1369 return fieldDef[i].index;
1370
1371 ajErr("Unknown query field '%S' in index filename lookup", fieldname);
1372 return NULL;
1373 }
1374
1375
1376
1377
1378 /* @func embDbiSortClose ******************************************************
1379 **
1380 ** Close the sort files for entries and all fields
1381 **
1382 ** @param [u] elistfile [AjPFile*] Sort file for entries
1383 ** @param [u] alistfile [AjPFile*] Sort files for each field.
1384 ** @param [r] nfields [ajuint] Number of fields
1385 ** @return [void]
1386 **
1387 ** @release 2.4.0
1388 ******************************************************************************/
1389
embDbiSortClose(AjPFile * elistfile,AjPFile * alistfile,ajuint nfields)1390 void embDbiSortClose(AjPFile* elistfile, AjPFile* alistfile, ajuint nfields)
1391 {
1392 ajuint ifield;
1393
1394 ajFileClose(elistfile);
1395
1396 for(ifield=0; ifield < nfields; ifield++)
1397 ajFileClose(&alistfile[ifield]);
1398
1399 return;
1400 }
1401
1402
1403
1404
1405 /* @func embDbiMemEntry *******************************************************
1406 **
1407 ** Stores data for current entry in memory by appending to lists
1408 **
1409 ** @param [u] idlist [AjPList] List of entry IDs
1410 ** @param [u] fieldList [AjPList*] List of field tokens for each field
1411 ** @param [r] nfields [ajuint] Number of fields
1412 ** @param [u] entry [EmbPEntry] Current entry
1413 ** @param [r] ifile [ajuint] Current input file number
1414 ** @return [void]
1415 **
1416 ** @release 2.4.0
1417 ******************************************************************************/
1418
embDbiMemEntry(AjPList idlist,AjPList * fieldList,ajuint nfields,EmbPEntry entry,ajuint ifile)1419 void embDbiMemEntry(AjPList idlist, AjPList* fieldList, ajuint nfields,
1420 EmbPEntry entry, ajuint ifile)
1421 {
1422 ajuint ifield;
1423 ajuint i;
1424 EmbPField fieldData = NULL;
1425
1426 entry->filenum = ifile+1;
1427 ajListPushAppend(idlist, entry);
1428
1429 for(ifield=0; ifield < nfields; ifield++)
1430 for(i=0;i<entry->nfield[ifield]; i++)
1431 {
1432 fieldData = embDbiFieldNew();
1433 fieldData->entry = entry->entry;
1434 fieldData->field = entry->field[ifield][i];
1435 ajListPushAppend(fieldList[ifield], fieldData);
1436 }
1437
1438 return;
1439 }
1440
1441
1442
1443
1444 /* @func embDbiSortWriteEntry *************************************************
1445 **
1446 ** Write the entryname index file using data from the entry sort file.
1447 **
1448 ** @param [u] entFile [AjPFile] Entry file
1449 ** @param [r] maxidlen [ajuint] Maximum id length
1450 ** @param [r] dbname [const AjPStr] Database name (used in temp file names)
1451 ** @param [r] nfiles [ajuint] Number of files
1452 ** @param [r] cleanup [AjBool] Cleanup temp files if true
1453 ** @param [r] sortopt [const AjPStr] Sort commandline options
1454 ** @return [ajuint] Number of entries
1455 **
1456 ** @release 2.4.0
1457 ******************************************************************************/
1458
embDbiSortWriteEntry(AjPFile entFile,ajuint maxidlen,const AjPStr dbname,ajuint nfiles,AjBool cleanup,const AjPStr sortopt)1459 ajuint embDbiSortWriteEntry(AjPFile entFile, ajuint maxidlen,
1460 const AjPStr dbname, ajuint nfiles,
1461 AjBool cleanup, const AjPStr sortopt)
1462 {
1463 AjPFile esortfile;
1464 ajint rpos;
1465 ajint spos;
1466 ajint filenum;
1467 ajuint idcnt = 0;
1468
1469 if(!dbiRegEntryIdSort)
1470 dbiRegEntryIdSort =
1471 ajRegCompC("^([^ ]+) +([0-9]+) +([0-9]+) +([0-9]+)");
1472
1473 embDbiSortFile(dbname, "list", "idsrt", nfiles, cleanup, sortopt);
1474 ajStrAssignC(&dbiLastId, " ");
1475 esortfile = embDbiFileIn(dbname, "idsrt");
1476
1477 while(ajReadline(esortfile, &dbiRdLine))
1478 {
1479 ajRegExec(dbiRegEntryIdSort, dbiRdLine);
1480 ajRegSubI(dbiRegEntryIdSort, 1, &dbiIdStr);
1481 ajRegSubI(dbiRegEntryIdSort, 2, &dbiTmpStr);
1482 ajStrToInt(dbiTmpStr, &rpos);
1483 ajRegSubI(dbiRegEntryIdSort, 3, &dbiTmpStr);
1484 ajStrToInt(dbiTmpStr, &spos);
1485 ajRegSubI(dbiRegEntryIdSort, 4, &dbiTmpStr);
1486 ajStrToInt(dbiTmpStr, &filenum);
1487
1488 if(ajStrMatchCaseS(dbiIdStr, dbiLastId))
1489 {
1490 ajDebug("Duplicate ID '%S' filenum: %d",
1491 dbiIdStr, filenum);
1492 ajWarn("Duplicate ID skipped: '%S' "
1493 "All hits will point to first ID found",
1494 dbiIdStr);
1495 continue;
1496 }
1497
1498 embDbiWriteEntryRecord(entFile, maxidlen, dbiIdStr,
1499 rpos, spos, filenum);
1500 ajStrAssignS(&dbiLastId, dbiIdStr);
1501 idcnt++;
1502 }
1503 ajFileClose(&esortfile);
1504
1505 return idcnt;
1506 }
1507
1508
1509
1510
1511 /* @func embDbiMemWriteEntry **************************************************
1512 **
1513 ** Write entryname index for in-memory processing
1514 **
1515 ** @param [u] entFile [AjPFile] entryname index file
1516 ** @param [r] maxidlen [ajuint] Maximum entry id length
1517 ** @param [r] idlist [const AjPList] List of entry IDs to be written
1518 ** @param [w] ids [void***] AjPStr* array of IDs from list
1519 ** @return [ajuint] Number of entries written (excluding duplicates)
1520 **
1521 ** @release 2.4.0
1522 ******************************************************************************/
1523
embDbiMemWriteEntry(AjPFile entFile,ajuint maxidlen,const AjPList idlist,void *** ids)1524 ajuint embDbiMemWriteEntry(AjPFile entFile, ajuint maxidlen,
1525 const AjPList idlist,
1526 void ***ids)
1527 {
1528 ajuint idCount;
1529 ajuint i;
1530 EmbPEntry entry;
1531 ajuint idcnt = 0;
1532
1533 idCount = (ajuint) ajListToarray(idlist, ids);
1534 qsort(*ids, idCount, sizeof(void*), embDbiCmpId);
1535 ajDebug("ids sorted\n");
1536
1537 for(i = 0; i < idCount; i++)
1538 {
1539 entry = (EmbPEntry)(*ids)[i];
1540
1541 if(ajStrMatchCaseC(dbiIdStr, entry->entry))
1542 {
1543 ajErr("Duplicate ID found: '%S'", dbiIdStr);
1544 continue;
1545 }
1546
1547 ajStrAssignC(&dbiIdStr, entry->entry);
1548 embDbiWriteEntryRecord(entFile, maxidlen, dbiIdStr,
1549 entry->rpos, entry->spos, entry->filenum);
1550 idcnt++;
1551 }
1552
1553 return idcnt;
1554 }
1555
1556
1557
1558
1559 /* @func embDbiSortWriteFields ************************************************
1560 **
1561 ** Write the indices for a field.
1562 **
1563 ** @param [r] dbname [const AjPStr] Database name (used for temp file names)
1564 ** @param [r] release [const AjPStr] Release number as a string
1565 ** @param [r] date [const char[4]] Date
1566 ** @param [r] indexdir [const AjPStr] Index directory
1567 ** @param [r] fieldname [const AjPStr] Field name (used for temp file names)
1568 ** @param [r] maxFieldLen [ajuint] Maximum field token length
1569 ** @param [r] nfiles [ajuint] Number of data files
1570 ** @param [r] nentries [ajuint] Number of entries
1571 ** @param [r] cleanup [AjBool] Cleanup temp files if true
1572 ** @param [r] sortopt [const AjPStr] Sort command line options
1573 ** @return [ajuint] Number of unique field targets written
1574 **
1575 ** @release 2.4.0
1576 ******************************************************************************/
1577
embDbiSortWriteFields(const AjPStr dbname,const AjPStr release,const char date[4],const AjPStr indexdir,const AjPStr fieldname,ajuint maxFieldLen,ajuint nfiles,ajuint nentries,AjBool cleanup,const AjPStr sortopt)1578 ajuint embDbiSortWriteFields(const AjPStr dbname, const AjPStr release,
1579 const char date[4], const AjPStr indexdir,
1580 const AjPStr fieldname, ajuint maxFieldLen,
1581 ajuint nfiles, ajuint nentries,
1582 AjBool cleanup, const AjPStr sortopt)
1583 {
1584 AjPFile asortfile;
1585 AjPFile asrt2file;
1586 AjPFile blistfile;
1587 AjPFile elistfile;
1588 ajuint ient;
1589
1590 ajuint fieldCount=0;
1591 ajuint idwidth;
1592
1593 AjPFile trgFile;
1594 AjPFile hitFile;
1595 short alen;
1596 ajuint asize;
1597 ajuint ahsize;
1598 ajuint itoken = 0;
1599 ajuint i;
1600 ajuint j;
1601 ajuint k;
1602 ajint idnum;
1603 ajint lastidnum;
1604
1605 ajStrAssignC(&dbiFieldName, dbiFieldFile(fieldname));
1606 ajFmtPrintS(&dbiTmpStr, "%d", nentries);
1607 idwidth = ajStrGetLen(dbiTmpStr);
1608
1609 if(!dbiRegFieldIdSort)
1610 dbiRegFieldIdSort = ajRegCompC("^([^ ]+) +");
1611
1612 if(!dbiRegFieldTokSort)
1613 dbiRegFieldTokSort = ajRegCompC("^([^ ]+) +([^\n\r]+)");
1614
1615 if(!dbiRegFieldTokIdSort)
1616 dbiRegFieldTokIdSort = ajRegCompC("^(.*[^ ]) +([0-9]+)[\r\n]+$");
1617
1618 ajFmtPrintS(&dbiFieldId2, "%S_id2", dbiFieldName);
1619 ajFmtPrintS(&dbiFieldSort, "%S_sort", dbiFieldName);
1620 ajFmtPrintS(&dbiFieldSort2, "%S_sort2", dbiFieldName);
1621
1622 trgFile = embDbiFileIndex(indexdir, dbiFieldName, "trg");
1623 hitFile = embDbiFileIndex(indexdir, dbiFieldName, "hit");
1624
1625 embDbiSortFile(dbname, ajStrGetPtr(dbiFieldName),
1626 ajStrGetPtr(dbiFieldSort),
1627 nfiles, cleanup, sortopt);
1628
1629 /* put in the entry numbers and remove the names */
1630 /* read dbname.<field>srt, for each entry, increment the count */
1631
1632 elistfile = embDbiFileIn(dbname, "idsrt");
1633 asortfile = embDbiFileIn(dbname, ajStrGetPtr(dbiFieldSort));
1634 blistfile = embDbiFileOut(dbname, ajStrGetPtr(dbiFieldId2));
1635
1636 fieldCount = 0;
1637
1638 ient=0;
1639 ajStrAssignC(&dbiCurrentId, "");
1640
1641 while(ajReadline(asortfile, &dbiRdLine))
1642 {
1643 ajRegExec(dbiRegFieldTokSort, dbiRdLine);
1644 ajRegSubI(dbiRegFieldTokSort, 1, &dbiIdStr);
1645 ajRegSubI(dbiRegFieldTokSort, 2, &dbiFieldStr);
1646
1647 ajDebug("asortfile curr '%S' id '%S' field '%S'\n",
1648 dbiCurrentId, dbiIdStr, dbiFieldStr);
1649
1650 while(!ajStrMatchS(dbiIdStr, dbiCurrentId))
1651 {
1652 ajStrAssignS(&dbiFieldId, dbiCurrentId);
1653
1654 if(!ajReadline(elistfile, &dbiIdLine))
1655 ajFatal("Error in embDbiSortWriteFields, "
1656 "expected entry %S not found, last was '%S'",
1657 dbiIdStr, dbiCurrentId);
1658 ajRegExec(dbiRegFieldIdSort, dbiIdLine);
1659 ajRegSubI(dbiRegFieldIdSort, 1, &dbiCurrentId);
1660
1661 ajDebug("curr '%S' line '%S'\n", dbiCurrentId, dbiIdLine);
1662
1663 if(!ajStrMatchS(dbiFieldId, dbiCurrentId))
1664 ient++;
1665 ajDebug("asortfile curr '%S' id '%S' ient: %u\n",
1666 dbiCurrentId, dbiIdStr, ient);
1667 }
1668
1669 ajFmtPrintF(blistfile, "%S %0*d\n", dbiFieldStr, idwidth, ient);
1670 fieldCount++;
1671 }
1672
1673 ajFileClose(&asortfile);
1674 ajFileClose(&blistfile);
1675 ajFileClose(&elistfile);
1676
1677 /* sort again */
1678
1679 embDbiRmFile(dbname, ajStrGetPtr(dbiFieldSort), 0, cleanup);
1680 embDbiSortFile(dbname, ajStrGetPtr(dbiFieldId2),
1681 ajStrGetPtr(dbiFieldSort2),
1682 0, cleanup, sortopt);
1683
1684 alen = maxFieldLen+8;
1685 asize = 300 + (fieldCount*(ajuint)alen); /* to be fixed later */
1686 embDbiHeader(trgFile, asize, fieldCount,
1687 alen, dbname, release, date);
1688
1689 ahsize = 300 + (fieldCount*4);
1690 embDbiHeader(hitFile, ahsize, fieldCount, 4,
1691 dbname, release, date);
1692
1693 itoken = 0;
1694 j = 0;
1695 k = 1;
1696
1697 i = 0;
1698 lastidnum = 999999999;
1699 ajStrAssignC(&dbiFieldId, "");
1700 asrt2file = embDbiFileIn(dbname, ajStrGetPtr(dbiFieldSort2));
1701
1702 while(ajReadline(asrt2file, &dbiRdLine))
1703 {
1704 ajRegExec(dbiRegFieldTokIdSort, dbiRdLine);
1705 ajRegSubI(dbiRegFieldTokIdSort, 1, &dbiIdStr);
1706 ajRegSubI(dbiRegFieldTokIdSort, 2, &dbiTmpStr);
1707 ajStrToInt(dbiTmpStr, &idnum);
1708
1709 if(!i)
1710 ajStrAssignS(&dbiFieldId, dbiIdStr);
1711
1712 if(!ajStrMatchS(dbiFieldId, dbiIdStr))
1713 {
1714 embDbiWriteHit(hitFile, idnum);
1715 embDbiWriteTrg(trgFile, maxFieldLen,
1716 j, k, dbiFieldId);
1717 j = 1; /* number of hits */
1718 k = i+1; /* first hit */
1719 ajStrAssignS(&dbiFieldId, dbiIdStr);
1720 i++;
1721 itoken++;
1722 lastidnum=idnum;
1723 }
1724 else if(idnum != lastidnum) /* dbiIdStr is the same */
1725 {
1726 embDbiWriteHit(hitFile, idnum);
1727 lastidnum = idnum;
1728 j++;
1729 i++;
1730 }
1731 }
1732
1733 ajFileClose(&asrt2file);
1734 embDbiRmFile(dbname, ajStrGetPtr(dbiFieldSort2), 0, cleanup);
1735
1736 ajDebug("targets i:%d itoken: %d\n", i, itoken);
1737
1738 if(i)
1739 {
1740 /* possibly there were no target tokens */
1741 embDbiWriteTrg(trgFile, maxFieldLen,
1742 j, k, dbiFieldId);
1743 itoken++;
1744 }
1745
1746 ajDebug("wrote %F %d\n", trgFile, itoken);
1747
1748 embDbiHeaderSize(trgFile, 300+itoken*(ajuint)alen, itoken);
1749
1750 ajDebug("finished...\n%7d files\n%7d %F\n%7d %F\n",
1751 nfiles, itoken, trgFile,
1752 fieldCount, hitFile);
1753
1754 ajFileClose(&trgFile);
1755 ajFileClose(&hitFile);
1756
1757 return itoken;
1758 }
1759
1760
1761
1762
1763 /* @func embDbiMemWriteFields *************************************************
1764 **
1765 ** Write the fields indices
1766 **
1767 ** @param [r] dbname [const AjPStr] Database name (used for temp file names)
1768 ** @param [r] release [const AjPStr] Release number as a string
1769 ** @param [r] date [const char[4]] Date
1770 ** @param [r] indexdir [const AjPStr] Index directory
1771 ** @param [r] fieldname [const AjPStr] Field name (used for file names)
1772 ** @param [r] maxFieldLen [ajuint] Maximum field token length
1773 ** @param [r] fieldList [const AjPList] List of field tokens to be written
1774 ** @param [r] ids [void**] AjPStr* array offield token s from list
1775 ** @return [ajuint] Number of unique field targets written
1776 **
1777 ** @release 2.4.0
1778 ******************************************************************************/
1779
embDbiMemWriteFields(const AjPStr dbname,const AjPStr release,const char date[4],const AjPStr indexdir,const AjPStr fieldname,ajuint maxFieldLen,const AjPList fieldList,void ** ids)1780 ajuint embDbiMemWriteFields(const AjPStr dbname,const AjPStr release,
1781 const char date[4], const AjPStr indexdir,
1782 const AjPStr fieldname, ajuint maxFieldLen,
1783 const AjPList fieldList, void** ids)
1784 {
1785 AjPStr field = NULL;
1786
1787 ajuint fieldCount = 0;
1788 ajuint ient;
1789 ajuint fieldent;
1790 ajuint i;
1791 ajuint j;
1792 ajint k;
1793 void **fieldItems = NULL;
1794 AjPFile trgFile;
1795 AjPFile hitFile;
1796 short alen;
1797 ajuint asize;
1798 ajuint ahsize;
1799 ajuint itoken = 0;
1800 ajuint idup = 0;
1801 EmbPField fieldData = NULL;
1802 static const char* lastfd = "";
1803 ajuint lastidnum = 0;
1804
1805 ajStrAssignC(&field, dbiFieldFile(fieldname));
1806 trgFile = embDbiFileIndex(indexdir, field, "trg");
1807 hitFile = embDbiFileIndex(indexdir, field, "hit");
1808
1809 fieldCount = (ajuint) ajListToarray(fieldList, &fieldItems);
1810
1811 ajDebug("fieldItems: %d %x\n",
1812 fieldCount, fieldItems);
1813
1814 if(fieldCount)
1815 {
1816 qsort(fieldItems, fieldCount, sizeof(void*),
1817 embDbiCmpFieldId);
1818 ajDebug("%S sorted by id\n", field);
1819 ient = 0;
1820 fieldent = 0;
1821
1822 while(ids[ient] && fieldItems[fieldent])
1823 {
1824 k = strcmp(((EmbPEntry)ids[ient])->entry,
1825 ((EmbPField)fieldItems[fieldent])->entry);
1826 if(k < 0)
1827 ient++;
1828 else if(k > 0)
1829 fieldent++;
1830 else
1831 ((EmbPField)fieldItems[fieldent++])->nid = ient+1;
1832 }
1833 ajDebug("checked ids: %d fieldItems: %d %d\n",
1834 ient, fieldent, fieldCount);
1835
1836 qsort(fieldItems, fieldCount, sizeof(void*),
1837 embDbiCmpFieldField);
1838 ajDebug("%S sorted by %S\n", field, field);
1839 }
1840
1841 alen = maxFieldLen+8;
1842 asize = 300 + (fieldCount*(ajuint)alen); /* to be fixed later */
1843 embDbiHeader(trgFile, asize, fieldCount,
1844 alen, dbname, release, date);
1845
1846 ahsize = 300 + (fieldCount*4);
1847 embDbiHeader(hitFile, ahsize, fieldCount, 4,
1848 dbname, release, date);
1849
1850 itoken = 0;
1851 j = 0;
1852 k = 1;
1853 idup = 0;
1854
1855 for(i = 0; i < fieldCount; i++)
1856 {
1857 fieldData = (EmbPField)fieldItems[i];
1858
1859 if(!i)
1860 {
1861 lastfd = fieldData->field;
1862 lastidnum = 999999999;
1863 }
1864
1865 if(strcmp(lastfd, fieldData->field))
1866 {
1867 embDbiWriteHit(hitFile, fieldData->nid);
1868 ajStrAssignC(&dbiFieldStr, lastfd);
1869 embDbiWriteTrg(trgFile, maxFieldLen,
1870 j, k,dbiFieldStr);
1871 j = 1;
1872 k = i+1-idup;
1873 itoken++;
1874 lastfd = fieldData->field;
1875 lastidnum=fieldData->nid;
1876 }
1877 else if(fieldData->nid != lastidnum) /* lastfd is the same */
1878 {
1879 embDbiWriteHit(hitFile, fieldData->nid);
1880 lastidnum = fieldData->nid;
1881 j++;
1882 }
1883 else
1884 idup++;
1885 }
1886
1887 ajStrAssignC(&dbiFieldStr, lastfd);
1888
1889 if(fieldCount)
1890 {
1891 embDbiWriteTrg(trgFile, maxFieldLen, j, k, dbiFieldStr);
1892 itoken++;
1893 }
1894
1895 ajDebug("wrote %F %d\n", trgFile, itoken);
1896
1897 embDbiHeaderSize(trgFile, 300+itoken*(ajuint)alen, itoken);
1898
1899 ajDebug("finished...\n%7d %F\n%7d %F\n",
1900 itoken, trgFile,
1901 fieldCount, hitFile);
1902
1903 ajFileClose(&trgFile);
1904 ajFileClose(&hitFile);
1905
1906 ajStrDel(&field);
1907 AJFREE(fieldItems);
1908
1909 return itoken;
1910 }
1911
1912
1913
1914
1915 /* @func embDbiDateSet ********************************************************
1916 **
1917 ** Sets the date as an integer array from a formatted string.
1918 ** The integer array is the internal format in database index headers
1919 **
1920 ** @param [r] datestr [const AjPStr] Date as a string
1921 ** @param [w] date [char[4]] Data char (1 byte int) array
1922 ** @return [void]
1923 **
1924 ** @release 2.4.0
1925 ******************************************************************************/
1926
embDbiDateSet(const AjPStr datestr,char date[4])1927 void embDbiDateSet(const AjPStr datestr, char date[4])
1928 {
1929 ajuint i;
1930 ajint j;
1931
1932 if(!dbiRegDate)
1933 dbiRegDate = ajRegCompC("^([0-9]+).([0-9]+).([0-9]+)");
1934
1935 date[3] = 0;
1936
1937 if(ajRegExec(dbiRegDate, datestr))
1938 for(i=1; i<4; i++)
1939 {
1940 ajRegSubI(dbiRegDate, i, &dbiTmpStr);
1941 ajStrToInt(dbiTmpStr, &j);
1942 date[3-i] = j;
1943 }
1944
1945 return;
1946 }
1947
1948
1949
1950
1951 /* @func embDbiMaxlen *********************************************************
1952 **
1953 ** Compares a string to a maximum string length.
1954 **
1955 ** A negative maximum length limits the string to that absolute length.
1956 **
1957 ** A non-negative length is updated if the string is longer
1958 **
1959 ** @param [u] token [AjPStr*] Token string
1960 ** @param [u] maxlen [ajint*] Maximum string length
1961 ** @return [void]
1962 **
1963 ** @release 2.4.0
1964 ******************************************************************************/
1965
embDbiMaxlen(AjPStr * token,ajint * maxlen)1966 void embDbiMaxlen(AjPStr* token, ajint* maxlen)
1967 {
1968 if(*maxlen < 0)
1969 ajStrKeepRange(token, 1, -(*maxlen));
1970 else
1971 {
1972 if((ajint)ajStrGetLen(*token) > *maxlen)
1973 *maxlen = ajStrGetLen(*token);
1974 }
1975
1976 return;
1977 }
1978
1979
1980
1981
1982 /* @func embDbiLogHeader ******************************************************
1983 **
1984 ** Writes the header to a database indexing logfile
1985 **
1986 ** @param [u] logfile [AjPFile] Log file
1987 ** @param [r] dbname [const AjPStr] Database name
1988 ** @param [r] release [const AjPStr] Release number, name or code
1989 ** @param [r] datestr [const AjPStr] Indexing date as a string dd/mm/yy
1990 ** @param [r] indexdir [const AjPStr] Index directory relative path
1991 ** @param [r] maxindex [ajuint] Maximum index token length (usually zero)
1992 ** @return [void]
1993 **
1994 ** @release 4.0.0
1995 ******************************************************************************/
1996
embDbiLogHeader(AjPFile logfile,const AjPStr dbname,const AjPStr release,const AjPStr datestr,const AjPStr indexdir,ajuint maxindex)1997 void embDbiLogHeader(AjPFile logfile, const AjPStr dbname,
1998 const AjPStr release, const AjPStr datestr,
1999 const AjPStr indexdir,
2000 ajuint maxindex)
2001 {
2002 AjPStr dirname = NULL;
2003 AjPTime today = NULL;
2004
2005 today = ajTimeNewTodayFmt("report");
2006 ajFmtPrintF(logfile, "########################################\n");
2007 ajFmtPrintF(logfile, "# Program: %S\n", ajUtilGetProgram());
2008 ajFmtPrintF(logfile, "# Rundate: %D\n", today);
2009 ajFmtPrintF(logfile, "# Dbname: %S\n", dbname);
2010 ajFmtPrintF(logfile, "# Release: %S\n", release);
2011 ajFmtPrintF(logfile, "# Date: %S\n", datestr);
2012 ajFmtPrintF(logfile, "# CurrentDirectory: %S\n", ajFileValueCwd());
2013 ajFmtPrintF(logfile, "# IndexDirectory: %S\n", indexdir);
2014 ajStrAssignS(&dirname, indexdir);
2015 ajDirnameFillPath(&dirname);
2016 ajFmtPrintF(logfile, "# IndexDirectoryPath: %S\n", dirname);
2017 ajFmtPrintF(logfile, "# Maxindex: %d\n", maxindex);
2018
2019 ajTimeDel(&today);
2020 ajStrDel(&dirname);
2021
2022 return;
2023 }
2024
2025
2026
2027
2028 /* @func embDbiLogFields ******************************************************
2029 **
2030 ** Writes database indexing logfile report of fields selected for indexing
2031 **
2032 ** @param [u] logfile [AjPFile] Log file
2033 ** @param [r] fields [AjPStr const *] Field names
2034 ** @param [r] nfields [ajuint] Number of fields
2035 ** @return [void]
2036 **
2037 ** @release 4.0.0
2038 ******************************************************************************/
2039
embDbiLogFields(AjPFile logfile,AjPStr const * fields,ajuint nfields)2040 void embDbiLogFields(AjPFile logfile, AjPStr const * fields, ajuint nfields)
2041 {
2042 ajuint i;
2043
2044 ajFmtPrintF(logfile, "# Fields: %d\n", nfields+1);
2045 ajFmtPrintF(logfile, "# Field 1: id\n");
2046
2047 for(i=0;i<nfields;i++)
2048 ajFmtPrintF(logfile, "# Field %d: %S\n", i+2, fields[i]);
2049
2050 return;
2051 }
2052
2053
2054
2055
2056 /* @func embDbiLogSource ******************************************************
2057 **
2058 ** Writes database indexing logfile report of source data selected for indexing
2059 **
2060 ** @param [u] logfile [AjPFile] Log file
2061 ** @param [r] directory [const AjPStr] Data directory relative path
2062 ** @param [r] filename [const AjPStr] Selected filenames wildcard
2063 ** @param [r] exclude [const AjPStr] Excluded filenames wildcard
2064 ** @param [r] inputFiles [AjPStr const *] File names
2065 ** @param [r] nfiles [ajuint] Number of files
2066 ** @return [void]
2067 **
2068 ** @release 4.0.0
2069 ******************************************************************************/
2070
embDbiLogSource(AjPFile logfile,const AjPStr directory,const AjPStr filename,const AjPStr exclude,AjPStr const * inputFiles,ajuint nfiles)2071 void embDbiLogSource(AjPFile logfile, const AjPStr directory,
2072 const AjPStr filename, const AjPStr exclude,
2073 AjPStr const * inputFiles, ajuint nfiles)
2074 {
2075 AjPStr dirname = NULL;
2076 ajuint i;
2077
2078 ajFmtPrintF(logfile, "# Directory: %S\n", directory);
2079 ajStrAssignS(&dirname, directory);
2080 ajDirnameFillPath(&dirname);
2081 ajFmtPrintF(logfile, "# DirectoryPath: %S\n", dirname);
2082 ajFmtPrintF(logfile, "# Filenames: %S\n", filename);
2083 ajFmtPrintF(logfile, "# Exclude: %S\n", exclude);
2084 ajFmtPrintF(logfile, "# Files: %d\n", nfiles);
2085
2086 for(i=0;i<nfiles;i++)
2087 ajFmtPrintF(logfile, "# File %d: %S\n", i+1, inputFiles[i]);
2088
2089 ajStrDel(&dirname);
2090
2091 return;
2092 }
2093
2094
2095
2096
2097 /* @func embDbiLogCmdline *****************************************************
2098 **
2099 ** Writes database indexing logfile report of commandline used
2100 **
2101 ** @param [u] logfile [AjPFile] Log file
2102 ** @return [void]
2103 **
2104 ** @release 4.0.0
2105 ******************************************************************************/
2106
embDbiLogCmdline(AjPFile logfile)2107 void embDbiLogCmdline(AjPFile logfile)
2108 {
2109 AjPStr cmdline = NULL;
2110
2111 ajFmtPrintF(logfile, "########################################\n");
2112 ajFmtPrintF(logfile, "# Commandline: %S\n", ajUtilGetProgram());
2113 ajStrAssignS(&cmdline, ajUtilGetCmdline());
2114
2115 if(ajStrGetLen(cmdline))
2116 {
2117 ajStrExchangeCC(&cmdline, "\n", "\1# ");
2118 ajStrExchangeCC(&cmdline, "\1", "\n");
2119 ajFmtPrintF(logfile, "# %S\n", cmdline);
2120 }
2121
2122 ajStrAssignS(&cmdline, ajUtilGetInputs());
2123
2124 if(ajStrGetLen(cmdline))
2125 {
2126 ajStrExchangeCC(&cmdline, "\n", "\1# ");
2127 ajStrExchangeCC(&cmdline, "\1", "\n");
2128 ajFmtPrintF(logfile, "# %S\n", cmdline);
2129 }
2130
2131 ajFmtPrintF(logfile, "########################################\n\n");
2132 ajStrDel(&cmdline);
2133
2134 return;
2135 }
2136
2137
2138
2139
2140 /* @func embDbiLogFile ********************************************************
2141 **
2142 ** Writes database indexing logfile report of a single source file
2143 **
2144 ** @param [u] logfile [AjPFile] Log file
2145 ** @param [r] curfilename [const AjPStr] Source filename
2146 ** @param [r] idCountFile [ajuint] Number of IDs in file
2147 ** @param [r] fields [AjPStr const *] Field names
2148 ** @param [r] countField [const ajuint*] Number of field tokens in this file
2149 ** @param [r] nfields [ajuint] Number of fields
2150 ** @return [void]
2151 **
2152 ** @release 4.0.0
2153 ******************************************************************************/
2154
embDbiLogFile(AjPFile logfile,const AjPStr curfilename,ajuint idCountFile,AjPStr const * fields,const ajuint * countField,ajuint nfields)2155 void embDbiLogFile(AjPFile logfile, const AjPStr curfilename,
2156 ajuint idCountFile, AjPStr const * fields,
2157 const ajuint* countField,
2158 ajuint nfields)
2159 {
2160 ajuint i;
2161
2162 ajFmtPrintF(logfile, "filename: '%S'\n", curfilename);
2163 ajFmtPrintF(logfile, " id: %d\n", idCountFile);
2164
2165 for(i=0;i<nfields;i++)
2166 ajFmtPrintF(logfile, " %3S: %d\n", fields[i], countField[i]);
2167
2168 return;
2169 }
2170
2171
2172
2173
2174 /* @func embDbiLogFinal *******************************************************
2175 **
2176 ** Writes database indexing logfile report of final totals
2177 **
2178 ** @param [u] logfile [AjPFile] Log file
2179 ** @param [r] maxindex [ajuint] User defined maximum index token length
2180 ** (usually zero)
2181 ** @param [r] maxFieldLen [const ajint*] Maximum index token length
2182 ** for each field. Negative values
2183 ** were upper limits. Positive values
2184 ** are the maximum in the data
2185 ** @param [r] fields [AjPStr const *] Field names
2186 ** @param [r] fieldTot [const ajuint*] Number of unique field tokens
2187 ** @param [r] nfields [ajuint] Number of fields
2188 ** @param [r] nfiles [ajuint] Number of input files
2189 ** @param [r] idDone [ajuint] Number of unique IDs indexed
2190 ** @param [r] idCount [ajuint] Total number of IDs indexed
2191 ** @return [void]
2192 **
2193 ** @release 4.0.0
2194 ******************************************************************************/
2195
embDbiLogFinal(AjPFile logfile,ajuint maxindex,const ajint * maxFieldLen,AjPStr const * fields,const ajuint * fieldTot,ajuint nfields,ajuint nfiles,ajuint idDone,ajuint idCount)2196 void embDbiLogFinal(AjPFile logfile, ajuint maxindex,
2197 const ajint* maxFieldLen,
2198 AjPStr const * fields, const ajuint* fieldTot,
2199 ajuint nfields, ajuint nfiles, ajuint idDone,
2200 ajuint idCount)
2201 {
2202 ajuint i;
2203 ajuint maxlen;
2204
2205 ajFmtPrintF(logfile, "\n");
2206
2207 for(i=0;i<nfields;i++)
2208 {
2209 if(maxindex)
2210 maxlen = maxindex;
2211 else
2212 maxlen = maxFieldLen[i];
2213
2214 ajFmtPrintF(logfile, "Index %S: maxlen %d items %d\n",
2215 fields[i], maxlen, fieldTot[i]);
2216 }
2217
2218 ajFmtPrintF(logfile, "\nTotal %d files %d entries (%d duplicates)\n",
2219 nfiles, idCount, (idCount-idDone));
2220 return;
2221 }
2222
2223
2224
2225
2226 /* @func embDbiExit ***********************************************************
2227 **
2228 ** Cleanup database indexing internals on exit
2229 **
2230 ** @return [void]
2231 **
2232 ** @release 4.0.0
2233 ******************************************************************************/
2234
embDbiExit(void)2235 void embDbiExit(void)
2236 {
2237 ajStrDel(&dbiCmdStr);
2238 ajStrDel(&dbiCmdStr2);
2239 ajStrDel(&dbiDirFix);
2240 ajStrDel(&dbiWildFname);
2241 ajStrDel(&dbiInFname);
2242 ajStrDel(&dbiOutFname);
2243 ajStrDel(&dbiOutRecord);
2244 ajStrDel(&dbiSortExt);
2245 ajStrDel(&dbiLastId);
2246 ajStrDel(&dbiFieldId);
2247 ajStrDel(&dbiIdStr);
2248 ajStrDel(&dbiTmpStr);
2249 ajStrDel(&dbiRdLine);
2250 ajStrDel(&dbiIdLine);
2251 ajStrDel(&dbiFieldSort);
2252 ajStrDel(&dbiFieldSort2);
2253 ajStrDel(&dbiFieldStr);
2254 ajStrDel(&dbiFieldName);
2255 ajStrDel(&dbiFieldId2);
2256 ajStrDel(&dbiCurrentId);
2257
2258 ajRegFree(&dbiRegFieldIdSort);
2259 ajRegFree(&dbiRegFieldTokSort);
2260 ajRegFree(&dbiRegFieldTokIdSort);
2261 ajRegFree(&dbiRegEntryIdSort);
2262 ajRegFree(&dbiRegDate);
2263
2264 return;
2265 }
2266