1 /* @source embdbi *************************************************************
2 **
** General routines for database indexing (dbi index files).
4 **
5 ** @author Copyright (c) 2000 Peter Rice
6 ** @version $Revision: 1.63 $
7 ** @modified $Date: 2012/07/14 14:52:40 $ by $Author: rice $
8 ** @@
9 **
10 ** This library is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU Lesser General Public
12 ** License as published by the Free Software Foundation; either
13 ** version 2.1 of the License, or (at your option) any later version.
14 **
15 ** This library is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 ** Lesser General Public License for more details.
19 **
20 ** You should have received a copy of the GNU Lesser General Public
21 ** License along with this library; if not, write to the Free Software
22 ** Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23 ** MA  02110-1301,  USA.
24 **
25 ******************************************************************************/
26 
27 #include "ajlib.h"
28 
29 #include "embdbi.h"
30 #include "ajfile.h"
31 #include "ajlist.h"
32 #include "ajutil.h"
33 #include "ajtime.h"
34 #include "ajreg.h"
35 #include "ajsys.h"
36 #include "ajfileio.h"
37 
38 #include <math.h>
39 #ifndef WIN32
40 #include <dirent.h>
41 #include <sys/types.h>
42 #include <sys/wait.h>
43 #else
44 #include "win32.h"
45 #include "dirent_w32.h"
46 #endif
47 
48 #include <errno.h>
49 
50 
51 
52 static AjPStr dbiCmdStr    = NULL;
53 static AjPStr dbiCmdStr2    = NULL;
54 static AjPStr dbiDirFix    = NULL;
55 static AjPStr dbiWildFname = NULL;
56 static AjPStr dbiInFname   = NULL;
57 static AjPStr dbiOutFname  = NULL;
58 static AjPStr dbiOutRecord = NULL;
59 static AjPStr dbiSortExt   = NULL;
60 static AjPStr dbiLastId    = NULL;
61 static AjPStr dbiFieldId   = NULL;
62 static AjPStr dbiIdStr     = NULL;
63 static AjPStr dbiTmpStr     = NULL;
64 static AjPStr dbiRdLine     = NULL;
65 static AjPStr dbiIdLine     = NULL;
66 static AjPStr dbiFieldSort     = NULL;
67 static AjPStr dbiFieldSort2     = NULL;
68 static AjPStr dbiFieldStr     = NULL;
69 static AjPStr dbiFieldName     = NULL;
70 static AjPStr dbiFieldId2     = NULL;
71 static AjPStr dbiCurrentId = NULL;
72 
73 static AjPRegexp dbiRegFieldIdSort    = NULL;
74 static AjPRegexp dbiRegFieldTokSort   = NULL;
75 static AjPRegexp dbiRegFieldTokIdSort = NULL;
76 static AjPRegexp dbiRegEntryIdSort    = NULL;
77 static AjPRegexp dbiRegDate           = NULL;
78 
79 
80 
81 
/* @datastatic DbiOField ******************************************************
**
** One row of the table of indexable database fields: the field name,
** the name of its index file and a short description.
**
** @attr name [const char*] Field name as used in USAs
** @attr index [const char*] Index filename for EMBLCD indices
** @attr desc [const char*] Field description
******************************************************************************/

struct DbiSField
{
    const char* name;
    const char* index;
    const char* desc;
};

typedef struct DbiSField DbiOField;
97 
98 
99 static DbiOField fieldDef[] =
100 {
101    /* Name  Index      Description */
102     {"acc", "acnum",   "accession number"},
103     {"sv",  "seqvn",   "seqeunce version and GI number"},
104     {"des", "des",     "entry description"},
105     {"org", "taxon",   "taxonomy and organism"},
106     {"key", "keyword", "keywords"},
107     {NULL, NULL, NULL}
108 };
109 
110 static const char* dbiFieldFile(const AjPStr fieldname);
111 
112 
113 
114 
115 /* @func embDbiFieldNew *******************************************************
116 **
117 ** Constructor for field token structures.
118 **
119 ** @return [EmbPField] Field token structure.
120 **
121 ** @release 2.4.0
122 ******************************************************************************/
123 
embDbiFieldNew(void)124 EmbPField embDbiFieldNew(void)
125 {
126     EmbPField ret;
127     AJNEW0(ret);
128 
129     return ret;
130 }
131 
132 
133 
134 
135 /* @func embDbiFieldDel *******************************************************
136 **
137 ** Destructor for field token structures.
138 **
139 ** @param [d] pthys [EmbPField*] Field token structure.
140 ** @return [void]
141 **
142 ** @release 4.1.0
143 ******************************************************************************/
144 
embDbiFieldDel(EmbPField * pthys)145 void embDbiFieldDel(EmbPField* pthys)
146 {
147     EmbPField thys;
148 
149     if(!pthys || !*pthys)
150 	return;
151 
152     thys = *pthys;
153 
154     AJFREE(thys->field);
155     AJFREE(thys->entry);
156     AJFREE(*pthys);
157 
158     return;
159 }
160 
161 
162 
163 
/* @func embDbiFieldDelMap ****************************************************
**
** Destructor for field token structures to be mapped to lists or tables.
**
** Only the structure itself is freed; its field and entry members are
** left untouched. Does nothing when given a NULL pointer or a pointer
** to NULL.
**
** @param [d] pthys [void**] Field token structure.
** @param [u] cl [void*] Unused
** @return [void]
**
** @release 4.1.0
******************************************************************************/

void embDbiFieldDelMap(void** pthys, void* cl)
{
    (void) cl;                          /* make it used */

    /* guard both levels, as embDbiEntryDelMap does */
    if(!pthys || !*pthys)
        return;

    /*
    ** The field and entry strings are deliberately not freed here.
    ** NOTE(review): presumably they are owned elsewhere - confirm
    ** against the callers before changing this.
    */
    AJFREE(*pthys);

    return;
}
192 
193 
194 
195 
196 /* @func embDbiCmpId **********************************************************
197 **
198 ** Comparison function for two entries.
199 **
200 ** @param [r] a [const void*] First id (EmbPEntry*)
201 ** @param [r] b [const void*] Second id (EmbPEntry*)
202 ** @return [ajint] Comparison value, -1, 0 or +1.
203 **
204 ** @release 1.13.0
205 ** @@
206 ******************************************************************************/
207 
embDbiCmpId(const void * a,const void * b)208 ajint embDbiCmpId(const void* a, const void* b)
209 {
210     const EmbPEntry aa;
211     const EmbPEntry bb;
212 
213     aa = *(EmbPEntry const *) a;
214     bb = *(EmbPEntry const *) b;
215 
216     return strcmp(aa->entry, bb->entry);
217 }
218 
219 
220 
221 
222 /* @func embDbiCmpFieldId *****************************************************
223 **
224 ** Comparison function for the entrynames in two field structures.
225 **
226 ** @param [r] a [const void*] First id (EmbPField*)
227 ** @param [r] b [const void*] Second id (EmbPField*)
228 ** @return [ajint] Comparison value, -1, 0 or +1.
229 **
230 ** @release 2.4.0
231 ** @@
232 ******************************************************************************/
233 
embDbiCmpFieldId(const void * a,const void * b)234 ajint embDbiCmpFieldId(const void* a, const void* b)
235 {
236     const EmbPField aa;
237     const EmbPField bb;
238 
239     aa = *(EmbPField const *) a;
240     bb = *(EmbPField const *) b;
241 
242     return strcmp(aa->entry, bb->entry);
243 }
244 
245 
246 
247 
248 /* @func embDbiCmpFieldField **************************************************
249 **
250 ** Comparison function for two field token values
251 **
252 ** @param [r] a [const void*] First id (EmbPField*)
253 ** @param [r] b [const void*] Second id (EmbPField*)
254 ** @return [ajint] Comparison value, -1, 0 or +1.
255 **
256 ** @release 2.4.0
257 ** @@
258 ******************************************************************************/
259 
embDbiCmpFieldField(const void * a,const void * b)260 ajint embDbiCmpFieldField(const void* a, const void* b)
261 {
262     ajint ret;
263 
264     const EmbPField aa;
265     const EmbPField bb;
266 
267     aa = *(EmbPField const *) a;
268     bb = *(EmbPField const *) b;
269 
270     ret = strcmp(aa->field, bb->field);
271 
272     if(ret)
273 	return ret;
274 
275     return strcmp(aa->entry, bb->entry);
276 }
277 
278 
279 
280 
281 /* @func embDbiEntryNew *******************************************************
282 **
283 ** Constructor for entry structures.
284 **
285 ** @param [r] nfields [ajuint] Number of data fields to be included
286 ** @return [EmbPEntry] Entry structure.
287 **
288 ** @release 1.13.0
289 ******************************************************************************/
290 
embDbiEntryNew(ajuint nfields)291 EmbPEntry embDbiEntryNew(ajuint nfields)
292 {
293     EmbPEntry ret;
294 
295     AJNEW0(ret);
296     ret->nfields = nfields;
297     AJCNEW0(ret->nfield, nfields);
298     AJCNEW0(ret->field, nfields);
299 
300     return ret;
301 }
302 
303 
304 
305 
306 /* @func embDbiEntryDel *******************************************************
307 **
308 ** Destructor for entry structures.
309 **
310 ** @param [d] Pentry [EmbPEntry*] Entry structure
311 ** @return [void]
312 **
313 ** @release 4.0.0
314 ******************************************************************************/
315 
embDbiEntryDel(EmbPEntry * Pentry)316 void embDbiEntryDel(EmbPEntry* Pentry)
317 {
318     EmbPEntry entry;
319     ajuint i;
320     ajuint j;
321 
322     if(!*Pentry)
323         return;
324 
325     entry = *Pentry;
326 
327     for(i=0;i<entry->nfields;i++)
328     {
329 	for(j=0;j<entry->nfield[i];j++)
330 	{
331 	    AJFREE(entry->field[i][j]);
332 	}
333 
334 	AJFREE(entry->field[i]);
335     }
336 
337     AJFREE(entry->nfield);
338     AJFREE(entry->field);
339     AJFREE(entry->entry);
340     AJFREE(*Pentry);
341 
342     return;
343 }
344 
345 
346 
347 
348 /* @func embDbiEntryDelMap ****************************************************
349 **
350 ** Destructor for entry structures to be mapped to lists or tables.
351 **
352 ** @param [d] pthys [void**] Field token structure.
353 ** @param [u] cl [void*] Unused
354 ** @return [void]
355 **
356 ** @release 4.1.0
357 ******************************************************************************/
358 
embDbiEntryDelMap(void ** pthys,void * cl)359 void embDbiEntryDelMap(void** pthys, void* cl)
360 {
361     EmbPEntry entry;
362     ajuint i;
363     ajuint j;
364 
365     if(!pthys || !*pthys)
366 	return;
367 
368     (void) cl;				/* make it used */
369 
370     entry = (*(EmbPEntry*)pthys);
371 
372     for(i=0;i<entry->nfields;i++)
373     {
374 	for(j=0;j<entry->nfield[i];j++)
375 	{
376 	    AJFREE(entry->field[i][j]);
377 	}
378 
379 	AJFREE(entry->field[i]);
380     }
381 
382     AJFREE(entry->nfield);
383     AJFREE(entry->field);
384     AJFREE(entry->entry);
385     AJFREE(*pthys);
386 
387     return;
388 }
389 
390 
391 
392 
393 /* @func embDbiFileList *******************************************************
394 **
395 ** Makes a list of all files in a directory matching a wildcard file name.
396 **
397 ** @param [r] dir [const AjPStr] Directory
398 ** @param [r] wildfile [const AjPStr] Wildcard file name
399 ** @param [r] trim [AjBool] Expand to search, trim results
400 ** @return [AjPList] New list of all files with full paths
401 **
402 ** @release 1.13.0
403 ** @@
404 ******************************************************************************/
405 
embDbiFileList(const AjPStr dir,const AjPStr wildfile,AjBool trim)406 AjPList embDbiFileList(const AjPStr dir, const AjPStr wildfile, AjBool trim)
407 {
408     AjPList retlist = NULL;
409 
410     DIR* dp;
411     struct dirent* de;
412     ajuint dirsize;
413 
414     AjPStr name = NULL;
415     AjPStr tmp;
416     AjPStr s;
417     AjPStr s2;
418     AjPStr t;
419 
420     char *p;
421     char *q;
422     AjPList l;
423     ajuint ll;
424     ajuint i;
425     AjBool d;
426 
427     ajDebug("embDbiFileList dir '%S' wildfile '%S' maxsize %Ld\n",
428 	    dir, wildfile, (ajlong) INT_MAX);
429 
430     ajStrAssignS(&dbiWildFname,wildfile);
431 
432     tmp = ajStrNewS(dbiWildFname);
433 
434     if(ajStrGetLen(dir))
435 	ajStrAssignS(&dbiDirFix, dir);
436     else
437 	ajStrAssignC(&dbiDirFix, CURRENT_DIR);
438 
439     if(ajStrGetCharLast(dbiDirFix) != SLASH_CHAR)
440 	ajStrAppendC(&dbiDirFix, SLASH_STRING);
441 
442     if(trim)
443 	ajStrAppendC(&dbiWildFname,"*");
444 
445     dp = opendir(ajStrGetPtr(dbiDirFix));
446 
447     if(!dp)
448 	ajFatal("opendir failed on '%S'", dbiDirFix);
449 
450     s = ajStrNew();
451     l = ajListNew();
452     dirsize = 0;
453     retlist = ajListstrNew();
454 
455     while((de = readdir(dp)))
456     {
457 #ifndef __CYGWIN__
458 	if(!de->d_ino)
459 	    continue; 		/* skip deleted files with inode zero */
460 #endif
461 
462 	if(ajCharMatchC(de->d_name, "."))
463 	    continue;
464 
465 	if(ajCharMatchC(de->d_name, ".."))
466 	    continue;
467 
468 	if(!ajCharMatchWildS(de->d_name, dbiWildFname))
469 	    continue;
470 
471 	ajStrAssignC(&s,de->d_name);
472 	p = q =ajStrGetuniquePtr(&s);
473 
474 	if(trim)
475 	{
476 	    p=strrchr(p,(int)'.');
477 
478 	    if(p)
479 		*p='\0';
480 	}
481 
482 	s2 = ajStrNewC(q);
483 	ll = (ajuint) ajListGetLength(l);
484 	d = ajFalse;
485 
486 	for(i=0;i<ll;++i)
487 	{
488 	    ajListPop(l,(void *)&t);
489 
490 	    if(ajStrMatchS(t,s2))
491 		d=ajTrue;
492 
493 	    ajListPushAppend(l,(void *)t);
494 	}
495 
496 	if(!d)
497 	    ajListPush(l,(void *)s2);
498 	else
499 	{
500 	    ajStrDel(&s2);
501 	    continue;
502 	}
503 
504 	dirsize++;
505 	name = NULL;
506 	ajFmtPrintS(&name, "%S%S", dbiDirFix, s2);
507 
508 	if(ajFilenameGetSize(name) > (ajlong) INT_MAX)
509 	  ajDie("File '%S' too large for DBI indexing", name);
510 
511 	ajDebug("accept '%S' (%Ld)\n", s2, ajFilenameGetSize(name));
512 	ajListstrPushAppend(retlist, name);
513     }
514 
515     if(!ajListGetLength(retlist))
516 	ajFatal("No match for file specification %S",tmp);
517 
518     while(ajListPop(l,(void *)&t))
519 	ajStrDel(&t);
520 
521     ajListFree(&l);
522 
523     ajStrDel(&s);
524     ajStrDel(&tmp);
525 
526     closedir(dp);
527     ajDebug("%u files for '%S' '%S'\n", dirsize, dir, dbiWildFname);
528 
529     return retlist;
530 }
531 
532 
533 
534 
535 /* @func embDbiFileListExc ****************************************************
536 **
537 ** Makes a list of all files in a directory matching a wildcard file name.
538 **
539 ** @param [r] dir [const AjPStr] Directory
540 ** @param [r] wildfile [const AjPStr] Wildcard file list
541 ** @param [r] exclude [const AjPStr] Wildcard file list
542 **                                   (NULL if none to exclude)
543 ** @return [AjPList] New list of all files with full paths
544 **
545 ** @release 1.13.2
546 ** @@
547 ******************************************************************************/
548 
embDbiFileListExc(const AjPStr dir,const AjPStr wildfile,const AjPStr exclude)549 AjPList embDbiFileListExc(const AjPStr dir, const AjPStr wildfile,
550 			  const AjPStr exclude)
551 {
552     AjPList retlist = NULL;
553 
554     DIR* dp;
555     struct dirent* de;
556     ajuint dirsize;
557     AjPStr name = NULL;
558 
559     ajDebug("embDbiFileListExc dir '%S' wildfile '%S' exclude '%S' "
560             "maxsize %Ld\n",
561 	    dir, wildfile, exclude, (ajlong) INT_MAX);
562 
563     if(ajStrGetLen(dir))
564 	ajStrAssignS(&dbiDirFix, dir);
565     else
566 	ajStrAssignC(&dbiDirFix, CURRENT_DIR);
567 
568     if(ajStrGetCharLast(dbiDirFix) != SLASH_CHAR)
569 	ajStrAppendC(&dbiDirFix, SLASH_STRING);
570 
571     ajDebug("dirfix '%S'\n", dbiDirFix);
572 
573     dp = opendir(ajStrGetPtr(dbiDirFix));
574 
575     if(!dp)
576 	ajFatal("opendir failed on '%S'", dbiDirFix);
577 
578     dirsize = 0;
579     retlist = ajListstrNew();
580 
581     while((de = readdir(dp)))
582     {
583 	/* skip deleted files with inode zero */
584 #ifndef __CYGWIN__
585 	if(!de->d_ino)
586 	    continue;
587 #endif
588 
589 	if(ajCharMatchC(de->d_name, "."))
590 	    continue;
591 
592 	if(ajCharMatchC(de->d_name, ".."))
593 	    continue;
594 
595 	ajStrAssignC(&dbiInFname, de->d_name);
596 
597 	if(exclude && !ajFilenameTestExclude(dbiInFname, exclude, wildfile))
598 	    continue;
599 
600 	dirsize++;
601 	name = NULL;
602 	ajFmtPrintS(&name, "%S%S", dbiDirFix, dbiInFname);
603 
604 	if(ajFilenameGetSize(name) > (ajlong) INT_MAX)
605 	  ajDie("File '%S' too large for DBI indexing", name);
606 
607 	ajDebug("accept '%S' (%Ld)\n", dbiInFname, ajFilenameGetSize(name));
608 	ajListstrPushAppend(retlist, name);
609     }
610 
611     closedir(dp);
612     ajDebug("%u files for '%S' '%S'\n", dirsize, dir, wildfile);
613 
614     return retlist;
615 }
616 
617 
618 
619 
620 /* @func embDbiFlatOpenlib ****************************************************
621 **
622 ** Open a flat file library
623 **
624 ** @param [r] lname [const AjPStr] Source file basename
625 ** @param [u] libr [AjPFile*] Database file
626 ** @return [AjBool] ajTrue on success
627 **
628 ** @release 2.3.0
629 ** @@
630 ******************************************************************************/
631 
embDbiFlatOpenlib(const AjPStr lname,AjPFile * libr)632 AjBool embDbiFlatOpenlib(const AjPStr lname, AjPFile* libr)
633 {
634     ajFileClose(libr);
635 
636     *libr = ajFileNewInNameS(lname);
637 
638     if(!*libr)
639 	ajFatal("Cannot open %S for reading",lname);
640 
641     if(!*libr)
642     {
643 	ajErr(" cannot open library flat file: %S\n",
644 	      lname);
645 
646 	return ajFalse;
647     }
648 
649     return ajTrue;
650 }
651 
652 
653 
654 
655 /* @func embDbiRmFile *********************************************************
656 **
657 ** Remove a file or a set of numbered files
658 **
659 ** @param [r] dbname [const AjPStr] Database name
660 ** @param [r] ext [const char*] Base file extension
661 ** @param [r] nfiles [ajuint] Number of files, or zero for unnumbered.
662 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
663 ** @return [void]
664 **
665 ** @release 1.13.0
666 ** @@
667 ******************************************************************************/
668 
embDbiRmFile(const AjPStr dbname,const char * ext,ajuint nfiles,AjBool cleanup)669 void embDbiRmFile(const AjPStr dbname, const char* ext, ajuint nfiles,
670 		  AjBool cleanup)
671 {
672 #ifndef WIN32
673     ajuint i;
674 
675     if(!cleanup)
676 	return;
677 
678     if(nfiles)
679     {
680 	for(i=1; i<= nfiles; i++)
681         {
682 	    ajFmtPrintS(&dbiCmdStr, "%S%03d.%s", dbname, i, ext);
683             ajSysCommandRemoveS(dbiCmdStr);
684         }
685     }
686     else
687     {
688 	ajFmtPrintS(&dbiCmdStr, "%S.%s", dbname, ext);
689         ajSysCommandRemoveS(dbiCmdStr);
690     }
691 
692     return;
693 
694 #else	/* WIN32 */
695     static AjPStr filestr = NULL;
696     ajuint i;
697 
698     if (!cleanup)
699 	return;
700 
701     if (nfiles)
702     {
703 	for (i=1; i<= nfiles; i++)
704 	{
705 	    ajFmtPrintS (&filestr, "%S%03d.%s", dbname, i, ext);
706 	    DeleteFile(ajStrGetPtr(filestr));
707 	    ajDebug("Deleting file %S\n", filestr);
708 	}
709     }
710     else
711     {
712 	ajFmtPrintS (&filestr, "%S.%s", dbname, ext);
713 	DeleteFile(ajStrGetPtr(filestr));
714 	ajDebug("Deleting file %S\n", filestr);
715     }
716 
717     return;
718 #endif	/* WIN32 */
719 }
720 
721 
722 
723 
724 /* @func embDbiRmFileI ********************************************************
725 **
726 ** Remove a numbered file
727 **
728 ** @param [r] dbname [const AjPStr] Database name
729 ** @param [r] ext [const char*] Base file extension
730 ** @param [r] ifile [ajuint] File number.
731 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
732 ** @return [void]
733 **
734 ** @release 1.13.0
735 ******************************************************************************/
736 
embDbiRmFileI(const AjPStr dbname,const char * ext,ajuint ifile,AjBool cleanup)737 void embDbiRmFileI(const AjPStr dbname, const char* ext, ajuint ifile,
738 		   AjBool cleanup)
739 {
740 #ifndef WIN32
741     if(!cleanup)
742 	return;
743 
744     ajFmtPrintS(&dbiCmdStr, "%S%03d.%s", dbname, ifile, ext);
745 
746     ajSysCommandRemoveS(dbiCmdStr);
747 #else
748     static AjPStr filestr = NULL;
749 
750     if(!cleanup)
751 	return;
752 
753     ajFmtPrintS (&filestr, "%S%03d.%s", dbname, ifile, ext);
754     DeleteFile(ajStrGetPtr(filestr));
755     ajDebug("Deleting file %S\n", filestr);
756 #endif	/* WIN32 */
757 
758     return;
759 }
760 
761 
762 
763 
764 /* @func embDbiRmEntryFile ****************************************************
765 **
766 ** Remove the sorted entryname file (kept until end of processing
767 ** as it is the sorted list of all entries, used to count entries for
768 ** field indexing.
769 **
770 ** @param [r] dbname [const AjPStr] Database name
771 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
772 ** @return [void]
773 **
774 ** @release 2.4.0
775 ** @@
776 ******************************************************************************/
777 
embDbiRmEntryFile(const AjPStr dbname,AjBool cleanup)778 void embDbiRmEntryFile(const AjPStr dbname,  AjBool cleanup)
779 {
780     embDbiRmFile(dbname, "idsrt", 0, cleanup);
781 
782     return;
783 }
784 
785 
786 
787 
788 /* @func embDbiSortFile *******************************************************
789 **
790 ** Sort a file, or a set of numbered files, individually
791 **
792 ** @param [r] dbname [const AjPStr] Database name
793 ** @param [r] ext1 [const char*] Input file extension
794 ** @param [r] ext2 [const char*] Output file extension
795 ** @param [r] nfiles [ajuint] Number of files to sort (zero if unnumbered)
796 ** @param [r] cleanup [AjBool] If ajTrue, clean up temporary files after
797 ** @param [r] sortopt [const AjPStr] Extra options for the system sort
798 ** @return [void]
799 **
800 ** @release 1.13.0
801 ** @@
802 ******************************************************************************/
803 
embDbiSortFile(const AjPStr dbname,const char * ext1,const char * ext2,ajuint nfiles,AjBool cleanup,const AjPStr sortopt)804 void embDbiSortFile(const AjPStr dbname, const char* ext1, const char* ext2,
805 		    ajuint nfiles, AjBool cleanup, const AjPStr sortopt)
806 {
807     ajuint i;
808     AjPStr dir = NULL;
809     ajuint j;
810     ajuint isplit;
811     ajuint nsplit;
812     double td;
813 
814 #ifndef WIN32
815     static const char *prog = "sort";
816 
817     dir = ajStrNewC(prog);
818     ajSysFileWhich(&dir);
819 
820 #else
821     static const char *prog = "sort.exe";
822 
823     char* sortProgDir = getenv("EMBOSS_ROOT");
824 
825     if(sortProgDir == NULL)
826     {
827 	AjPStr msg = ajStrNewC("EMBOSS_ROOT");
828 	ajStrAppendC(&msg, " environment variable not defined");
829 	ajFatal(ajStrGetPtr(msg));
830     }
831 
832     dir = ajStrNewC(sortProgDir);
833     ajStrAppendC(&dir,SLASH_STRING);
834     ajStrAppendC(&dir,prog);
835 
836     if(!ajFilenameExistsExec(dir))
837     {
838         ajFmtPrintS(&dir, "%s\\apps\\release\\%s", sortProgDir, prog);
839     }
840 
841     if(!ajFilenameExistsExec(dir))
842     {
843         ajFatal("'%s' not found in EMBOSS_ROOT or apps\\release", prog);
844     }
845 
846 #endif
847 
848 
849     if(nfiles)
850     {
851 	for(i=1; i<=nfiles; i++)
852 	{
853 	    ajFmtPrintS(&dbiInFname, "%S%03d.%s", dbname, i, ext1);
854 	    ajFmtPrintS(&dbiOutFname, "%S%03d.%s.srt", dbname, i, ext1);
855 
856 	    if(sortopt)
857 		ajFmtPrintS(&dbiCmdStr, "%S -o %S %S %S",
858 			    dir,dbiOutFname,sortopt,dbiInFname);
859 	    else
860 		ajFmtPrintS(&dbiCmdStr, "%S -o %S %S",
861 			    dir,dbiOutFname,dbiInFname);
862 
863 	    ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
864 	    embDbiRmFileI(dbname, ext1, i, cleanup);
865 	}
866 
867 	td = sqrt(nfiles);
868 	nsplit = (ajuint) td;
869 
870 	ajDebug("embDbiSortFile nfiles:%d split:%d\n", nfiles, nsplit);
871 
872 	/* file merge in groups if more than 24 files ... avoids huge merges */
873 
874 	if(nsplit < 2)		/* up to 3 source files */
875 	{
876             ajFmtPrintS(&dbiCmdStr, "%S -m -o %S.%s %S",
877                         dir,dbname,ext2,sortopt);
878 
879             for(i=1; i<=nfiles; i++)
880                 ajFmtPrintAppS(&dbiCmdStr, " %S%03d.%s.srt", dbname, i, ext1);
881 
882             ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
883             ajFmtPrintS(&dbiSortExt, "%s.srt", ext1);
884 
885             for(i=1; i<=nfiles; i++)
886                 embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), i, cleanup);
887 
888 	}
889 	else
890 	{
891             ajFmtPrintS(&dbiCmdStr2, "%S -m -o %S.%s %S",
892                         dir,dbname,ext2,sortopt);
893             isplit = 0;
894 
895             for(i=1; i<=nfiles; i+=nsplit)
896             {
897                 isplit++;
898                 ajFmtPrintAppS(&dbiCmdStr2, " %S%03d.%s.mrg1",
899                                dbname, isplit, ext2);
900 
901                 /* Now we make that .mrg1 file */
902 
903                 ajFmtPrintS(&dbiCmdStr, "%S -m -o %S%03d.%s.mrg1 %S",
904                             dir,dbname,isplit,ext2,sortopt);
905 
906                 for(j=0; j<nsplit; j++)
907                     if((i+j) <= nfiles)
908                         ajFmtPrintAppS(&dbiCmdStr, " %S%03d.%s.srt",
909                                        dbname, i+j, ext1);
910 
911                 ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr),"C");
912                 ajFmtPrintS(&dbiSortExt, "%s.srt", ext1);
913 
914                 for(j=0; j<nsplit; j++)
915                     if((i+j) <= nfiles)
916                         embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), (i+j),
917                                       cleanup);
918             }
919 
920             ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr2), "C");
921             ajFmtPrintS(&dbiSortExt, "%s.mrg1", ext2);
922 
923             for(j=1; j<=isplit; j++)
924                 embDbiRmFileI(dbname, ajStrGetPtr(dbiSortExt), j, cleanup);
925 	}
926     }
927     else
928     {
929 	ajFmtPrintS(&dbiInFname, "%S.%s", dbname, ext1);
930 	ajFmtPrintS(&dbiOutFname, "%S.%s", dbname, ext2);
931 	ajFmtPrintS(&dbiCmdStr, "%S -o %S %S %S",
932 		    dir,dbiOutFname,sortopt,dbiInFname);
933 
934 	ajSysExecLocaleC(ajStrGetPtr(dbiCmdStr), "C");
935 	embDbiRmFile(dbname, ext1, 0, cleanup);
936     }
937 
938     ajStrDel(&dir);
939 
940     return;
941 }
942 
943 
944 
945 
946 /* @func embDbiHeaderSize *****************************************************
947 **
948 ** Updates the file header for an index file to include the correct file size.
949 **
950 ** @param [u] file [AjPFile] Output file
951 ** @param [r] filesize [ajuint] File size (if known, can be rewritten)
952 ** @param [r] recordcnt [ajuint] Number of records
953 ** @return [void]
954 **
955 ** @release 2.4.0
956 ******************************************************************************/
957 
embDbiHeaderSize(AjPFile file,ajuint filesize,ajuint recordcnt)958 void embDbiHeaderSize(AjPFile file, ajuint filesize, ajuint recordcnt)
959 {
960     ajFileSeek(file, 0, 0);
961 
962     ajWritebinInt4(file, (ajint) filesize);	/* filesize */
963     ajWritebinInt4(file, (ajint) recordcnt);	/* #records */
964 
965     return;
966 }
967 
968 
969 
970 
971 /* @func embDbiHeader *********************************************************
972 **
973 ** Writes the header for an index file. Resets the file pointer to beginning
974 ** of file, and leaves the file pointer at the start of the first record.
975 **
976 ** @param [u] file [AjPFile] Output file
977 ** @param [r] filesize [ajuint] File size (if known, can be rewritten)
978 ** @param [r] recordcnt [ajuint] Number of records
979 ** @param [r] recordlen [short] Record length (bytes)
980 ** @param [r] dbname [const AjPStr] Database name (up to 20 characters used)
981 ** @param [r] release [const AjPStr] Release as a string (up to 10
982 **                                   characters used)
983 ** @param [r] date [const char[4]] Date dd,mm,yy,00
984 ** @return [void]
985 **
986 ** @release 2.4.0
987 ******************************************************************************/
988 
embDbiHeader(AjPFile file,ajuint filesize,ajuint recordcnt,short recordlen,const AjPStr dbname,const AjPStr release,const char date[4])989 void embDbiHeader(AjPFile file, ajuint filesize, ajuint recordcnt,
990 		  short recordlen, const AjPStr dbname, const AjPStr release,
991 		  const char date[4])
992 {
993     ajuint i;
994     static char padding[256];
995     static AjBool firstcall = AJTRUE;
996 
997     if(firstcall)
998     {
999 	for(i=0;i<256;i++)
1000 	    padding[i] = ' ';
1001 
1002 	firstcall = ajFalse;
1003     }
1004 
1005     ajFileSeek(file, 0, 0);
1006 
1007     ajWritebinInt4(file, (ajint) filesize);	/* filesize */
1008 
1009     ajWritebinInt4(file, (ajint) recordcnt);	/* #records */
1010 
1011     ajWritebinInt2(file, (ajint) recordlen);	/* recordsize */
1012 
1013     /* rest of the header */
1014     ajWritebinStr(file, dbname,  20); /* dbname */
1015     ajWritebinStr(file, release, 10); /* release */
1016     ajWritebinByte(file, date[0]);	/* release date */
1017     ajWritebinByte(file, date[1]);	/* release date */
1018     ajWritebinByte(file, date[2]);	/* release date */
1019     ajWritebinByte(file, date[3]);	/* release date */
1020     ajWritebinBinary(file, 1, 256, padding); /* padding 256 bytes */
1021 
1022     return;
1023 }
1024 
1025 
1026 
1027 
1028 /* @func embDbiFileSingle *****************************************************
1029 **
1030 ** Builds a filename for a single temporary file to save IDs or some other
1031 ** index field, for example EMBL01.list
1032 **
1033 ** @param [r] dbname [const AjPStr] Database name
1034 ** @param [r] extension [const char*] Filename extension.
1035 ** @param [r] num [ajuint] Number for this file (start at 1)
1036 ** @return [AjPFile] Opened output file
1037 **
1038 **
1039 ** @release 2.4.0
1040 ******************************************************************************/
1041 
embDbiFileSingle(const AjPStr dbname,const char * extension,ajuint num)1042 AjPFile embDbiFileSingle(const AjPStr dbname, const char* extension, ajuint num)
1043 {
1044     AjPFile ret;
1045 
1046     ajFmtPrintS(&dbiOutFname, "%S%03d.%s", dbname, num, extension);
1047     ret = ajFileNewOutNameS(dbiOutFname);
1048 
1049     if(!ret)
1050 	ajFatal("Cannot open %S for writing", dbiOutFname);
1051 
1052     return ret;
1053 }
1054 
1055 
1056 
1057 
1058 /* @func embDbiFileIn *********************************************************
1059 **
1060 ** Builds a filename for a summary file to read IDs or some other
1061 ** index field, for example EMBL.acnum_sort
1062 **
1063 ** @param [r] dbname [const AjPStr] Database name
1064 ** @param [r] extension [const char*] Filename extension.
1065 ** @return [AjPFile] Opened output file
1066 **
1067 **
1068 ** @release 2.4.0
1069 ******************************************************************************/
1070 
embDbiFileIn(const AjPStr dbname,const char * extension)1071 AjPFile embDbiFileIn(const AjPStr dbname, const char* extension)
1072 {
1073     AjPFile ret;
1074 
1075     ajFmtPrintS(&dbiInFname, "%S.%s", dbname, extension);
1076     ret = ajFileNewInNameS(dbiInFname);
1077 
1078     if(!ret)
1079 	ajFatal("Cannot open %S for reading", dbiInFname);
1080 
1081     return ret;
1082 }
1083 
1084 
1085 
1086 
1087 /* @func embDbiFileOut ********************************************************
1088 **
1089 ** Builds a filename for a summary file to save IDs or some other
1090 ** index field, for example EMBL.acnum_srt2
1091 **
1092 ** @param [r] dbname [const AjPStr] Database name
1093 ** @param [r] extension [const char*] Filename extension.
1094 ** @return [AjPFile] Opened output file
1095 **
1096 **
1097 ** @release 2.4.0
1098 ******************************************************************************/
1099 
embDbiFileOut(const AjPStr dbname,const char * extension)1100 AjPFile embDbiFileOut(const AjPStr dbname, const char* extension)
1101 {
1102     AjPFile ret;
1103 
1104     ajFmtPrintS(&dbiOutFname, "%S.%s", dbname, extension);
1105     ret = ajFileNewOutNameS(dbiOutFname);
1106 
1107     if(!ret)
1108 	ajFatal("Cannot open %S for writing", dbiOutFname);
1109 
1110     return ret;
1111 }
1112 
1113 
1114 
1115 
1116 /* @func embDbiFileIndex ******************************************************
1117 **
1118 ** Builds a filename for a summary file to save IDs or some other
1119 ** index field, for example EMBL.acsrt2
1120 **
1121 ** @param [r] indexdir [const AjPStr] Index directory
1122 ** @param [r] field [const AjPStr] Field name
1123 ** @param [r] extension [const char*] Filename extension.
1124 ** @return [AjPFile] Opened output file
1125 **
1126 **
1127 ** @release 2.4.0
1128 ******************************************************************************/
1129 
embDbiFileIndex(const AjPStr indexdir,const AjPStr field,const char * extension)1130 AjPFile embDbiFileIndex(const AjPStr indexdir, const AjPStr field,
1131 			const char* extension)
1132 {
1133     AjPFile ret;
1134 
1135     ajFmtPrintS(&dbiOutFname, "%S.%s", field, extension);
1136     ret = ajFileNewOutNamePathS(dbiOutFname, indexdir);
1137 
1138     if(!ret)
1139 	ajFatal("Cannot open %S for writing", dbiOutFname);
1140 
1141     return ret;
1142 }
1143 
1144 
1145 
1146 
1147 /* @func embDbiWriteDivision **************************************************
1148 **
1149 ** Writes the division index file
1150 **
1151 ** @param [r] indexdir [const AjPStr] Index directory
1152 ** @param [r] dbname [const AjPStr] Database name
1153 ** @param [r] release [const AjPStr] Release number as a string
1154 ** @param [r] date [const char[4]] Date
1155 ** @param [r] maxfilelen [ajuint] Max file name length
1156 ** @param [r] nfiles [ajuint] Number of files indexes
1157 ** @param [r] divfiles [AjPStr const *] Division filenames
1158 ** @param [r] seqfiles [AjPStr const *] Sequence filenames (or NULL if none)
1159 ** @return [void]
1160 **
1161 ** @release 2.4.0
1162 ******************************************************************************/
1163 
embDbiWriteDivision(const AjPStr indexdir,const AjPStr dbname,const AjPStr release,const char date[4],ajuint maxfilelen,ajuint nfiles,AjPStr const * divfiles,AjPStr const * seqfiles)1164 void embDbiWriteDivision(const AjPStr indexdir,
1165 			 const AjPStr dbname, const AjPStr release,
1166 			 const char date[4],  ajuint maxfilelen, ajuint nfiles,
1167 			 AjPStr const * divfiles, AjPStr const * seqfiles)
1168 {
1169     AjPFile divFile;
1170     AjPStr tmpfname = NULL;
1171     ajuint i;
1172     ajuint filesize;
1173 
1174     short recsize;
1175 
1176     ajStrAssignC(&tmpfname, "division.lkp");
1177     divFile = ajFileNewOutNamePathS(tmpfname, indexdir);
1178 
1179     filesize = 256 + 44 + (nfiles * (maxfilelen+2));
1180     recsize = maxfilelen + 2;
1181 
1182     embDbiHeader(divFile, filesize, nfiles, recsize, dbname, release, date);
1183 
1184     for(i=0; i<nfiles; i++)
1185     {
1186         if(seqfiles)
1187 	    embDbiWriteDivisionRecord(divFile, maxfilelen, (short)(i+1),
1188 				      divfiles[i], seqfiles[i]);
1189 	else
1190 	    embDbiWriteDivisionRecord(divFile, maxfilelen, (short)(i+1),
1191 				      divfiles[i], NULL);
1192     }
1193 
1194     ajFileClose(&divFile);
1195     ajStrDel(&tmpfname);
1196 
1197     return;
1198 }
1199 
1200 
1201 
1202 
1203 /* @func embDbiWriteDivisionRecord ********************************************
1204 **
1205 ** Writes a record to the division lookup file
1206 **
1207 ** @param [u] file [AjPFile] Index file
1208 ** @param [r] maxnamlen [ajuint] Maximum file name length
1209 ** @param [r] recnum [short] Record number
1210 ** @param [r] datfile [const AjPStr] Data file name
1211 ** @param [r] seqfile [const AjPStr] Sequence file name (or NULL if none)
1212 ** @return [void]
1213 **
1214 ** @release 2.4.0
1215 ******************************************************************************/
1216 
embDbiWriteDivisionRecord(AjPFile file,ajuint maxnamlen,short recnum,const AjPStr datfile,const AjPStr seqfile)1217 void embDbiWriteDivisionRecord(AjPFile file, ajuint maxnamlen, short recnum,
1218 			       const AjPStr datfile, const AjPStr seqfile)
1219 {
1220     ajWritebinInt2(file, recnum);
1221 
1222     if(ajStrGetLen(seqfile))
1223     {
1224 	ajFmtPrintS(&dbiOutRecord, "%S %S", datfile, seqfile);
1225 	ajWritebinStr(file, dbiOutRecord, maxnamlen);
1226     }
1227     else
1228 	ajWritebinStr(file, datfile, maxnamlen);
1229 
1230     return;
1231 }
1232 
1233 
1234 
1235 
1236 /* @func embDbiWriteEntryRecord ***********************************************
1237 **
1238 ** Writes a record to the entryname index file
1239 **
1240 ** @param [u] file [AjPFile] hit file
1241 ** @param [r] maxidlen [ajuint] Maximum length for an id string
1242 ** @param [r] id [const AjPStr] The id string for this entry
1243 ** @param [r] rpos [ajuint] Data file offset
1244 ** @param [r] spos [ajuint] sequence file offset
1245 ** @param [r] filenum [ajushort] file number in division file
1246 ** @return [void]
1247 **
1248 ** @release 2.4.0
1249 ******************************************************************************/
1250 
embDbiWriteEntryRecord(AjPFile file,ajuint maxidlen,const AjPStr id,ajuint rpos,ajuint spos,ajushort filenum)1251 void embDbiWriteEntryRecord(AjPFile file, ajuint maxidlen, const AjPStr id,
1252 			    ajuint rpos, ajuint spos, ajushort filenum)
1253 {
1254 
1255     ajWritebinStr(file, id, maxidlen);
1256     ajWritebinInt4(file, rpos);
1257     ajWritebinInt4(file, spos);
1258     ajWritebinInt2(file, filenum);
1259 
1260     return;
1261 }
1262 
1263 
1264 
1265 
1266 /* @func embDbiWriteHit *******************************************************
1267 **
1268 ** Writes a record to the field hit (.hit) index file
1269 **
1270 ** @param [u] file [AjPFile] hit file
1271 ** @param [r] idnum [ajuint] Entry number (1 for the first) in the
1272 **                          entryname file
1273 ** @return [void]
1274 **
1275 ** @release 2.4.0
1276 ******************************************************************************/
1277 
embDbiWriteHit(AjPFile file,ajuint idnum)1278 void embDbiWriteHit(AjPFile file, ajuint idnum)
1279 {
1280     ajWritebinInt4(file, (ajint) idnum);
1281 
1282     return;
1283 }
1284 
1285 
1286 
1287 
1288 /* @func embDbiWriteTrg *******************************************************
1289 **
1290 ** Writes a record to the field target (.trg) index file
1291 **
1292 ** @param [u] file [AjPFile] hit file
1293 ** @param [r] maxfieldlen [ajuint] Maximum field token length
1294 ** @param [r] idnum [ajuint] First record number (1 for the first) in the
1295 **                          field hit index file
1296 ** @param [r] idcnt [ajuint] Number of entries for this field value
1297 **                          in the field hit index file
1298 ** @param [r] hitstr [const AjPStr] Field token string
1299 ** @return [void]
1300 **
1301 ** @release 2.4.0
1302 ******************************************************************************/
1303 
embDbiWriteTrg(AjPFile file,ajuint maxfieldlen,ajuint idnum,ajuint idcnt,const AjPStr hitstr)1304 void embDbiWriteTrg(AjPFile file, ajuint maxfieldlen, ajuint idnum,
1305 		    ajuint idcnt, const AjPStr hitstr)
1306 {
1307     ajWritebinInt4(file, (ajint) idnum);
1308     ajWritebinInt4(file, (ajint) idcnt);
1309     ajWritebinStr(file, hitstr, maxfieldlen);
1310 
1311     return;
1312 }
1313 
1314 
1315 
1316 
1317 /* @func embDbiSortOpen *******************************************************
1318 **
1319 ** Open sort files for entries and all fields
1320 **
1321 ** @param [w] alistfile [AjPFile*] Sort files for each field.
1322 ** @param [r] ifile [ajuint] Input file number (used for temporary file names)
1323 ** @param [r] dbname [const AjPStr] Database name
1324 **                                  (used for temporary file names)
1325 ** @param [r] fields [AjPStr const *] Field names (used for temporary
1326 **                                   file names)
1327 ** @param [r] nfields [ajuint] Number of fields
1328 ** @return [AjPFile] Sort file for entries
1329 **
1330 ** @release 2.4.0
1331 ******************************************************************************/
1332 
embDbiSortOpen(AjPFile * alistfile,ajuint ifile,const AjPStr dbname,AjPStr const * fields,ajuint nfields)1333 AjPFile embDbiSortOpen(AjPFile* alistfile,
1334 		       ajuint ifile, const AjPStr dbname,
1335 		       AjPStr const * fields, ajuint nfields)
1336 {
1337     AjPFile elistfile;
1338     ajuint ifield;
1339 
1340     elistfile = embDbiFileSingle(dbname, "list", ifile+1);
1341 
1342     for(ifield=0;ifield < nfields; ifield++)
1343 	alistfile[ifield] = embDbiFileSingle(dbname,
1344 					     dbiFieldFile(fields[ifield]),
1345 					     ifile+1);
1346 
1347     return elistfile;
1348 }
1349 
1350 
1351 
1352 
1353 /* @funcstatic dbiFieldFile ***************************************************
1354 **
1355 ** Returns the index filename that relates to a USA field name
1356 **
1357 ** @param [r] fieldname [const AjPStr] Field name
1358 ** @return [const char*] Index filename for this field
1359 **
1360 ** @release 4.0.0
1361 ******************************************************************************/
1362 
dbiFieldFile(const AjPStr fieldname)1363 static const char* dbiFieldFile(const AjPStr fieldname)
1364 {
1365     ajuint i = 0;
1366 
1367     for(i=0;fieldDef[i].name;i++)
1368 	if(ajStrMatchCaseC(fieldname, fieldDef[i].name))
1369 	    return fieldDef[i].index;
1370 
1371     ajErr("Unknown query field '%S' in index filename lookup", fieldname);
1372     return NULL;
1373 }
1374 
1375 
1376 
1377 
1378 /* @func embDbiSortClose ******************************************************
1379 **
1380 ** Close the sort files for entries and all fields
1381 **
1382 ** @param [u] elistfile [AjPFile*] Sort file for entries
1383 ** @param [u] alistfile [AjPFile*] Sort files for each field.
1384 ** @param [r] nfields [ajuint] Number of fields
1385 ** @return [void]
1386 **
1387 ** @release 2.4.0
1388 ******************************************************************************/
1389 
embDbiSortClose(AjPFile * elistfile,AjPFile * alistfile,ajuint nfields)1390 void embDbiSortClose(AjPFile* elistfile, AjPFile* alistfile, ajuint nfields)
1391 {
1392     ajuint ifield;
1393 
1394     ajFileClose(elistfile);
1395 
1396     for(ifield=0; ifield < nfields; ifield++)
1397 	ajFileClose(&alistfile[ifield]);
1398 
1399     return;
1400 }
1401 
1402 
1403 
1404 
1405 /* @func embDbiMemEntry *******************************************************
1406 **
1407 ** Stores data for current entry in memory by appending to lists
1408 **
1409 ** @param [u] idlist [AjPList] List of entry IDs
1410 ** @param [u] fieldList [AjPList*] List of field tokens for each field
1411 ** @param [r] nfields [ajuint] Number of fields
1412 ** @param [u] entry [EmbPEntry] Current entry
1413 ** @param [r] ifile [ajuint] Current input file number
1414 ** @return [void]
1415 **
1416 ** @release 2.4.0
1417 ******************************************************************************/
1418 
embDbiMemEntry(AjPList idlist,AjPList * fieldList,ajuint nfields,EmbPEntry entry,ajuint ifile)1419 void embDbiMemEntry(AjPList idlist, AjPList* fieldList, ajuint nfields,
1420 		    EmbPEntry entry, ajuint ifile)
1421 {
1422     ajuint ifield;
1423     ajuint i;
1424     EmbPField fieldData = NULL;
1425 
1426     entry->filenum = ifile+1;
1427     ajListPushAppend(idlist, entry);
1428 
1429     for(ifield=0; ifield < nfields; ifield++)
1430 	for(i=0;i<entry->nfield[ifield]; i++)
1431 	{
1432 	    fieldData = embDbiFieldNew();
1433 	    fieldData->entry = entry->entry;
1434 	    fieldData->field = entry->field[ifield][i];
1435 	    ajListPushAppend(fieldList[ifield], fieldData);
1436 	}
1437 
1438     return;
1439 }
1440 
1441 
1442 
1443 
1444 /* @func embDbiSortWriteEntry *************************************************
1445 **
1446 ** Write the entryname index file using data from the entry sort file.
1447 **
1448 ** @param [u] entFile [AjPFile] Entry file
1449 ** @param [r] maxidlen [ajuint] Maximum id length
1450 ** @param [r] dbname [const AjPStr] Database name (used in temp file names)
1451 ** @param [r] nfiles [ajuint] Number of files
1452 ** @param [r] cleanup [AjBool] Cleanup temp files if true
1453 ** @param [r] sortopt [const AjPStr] Sort commandline options
1454 ** @return [ajuint] Number of entries
1455 **
1456 ** @release 2.4.0
1457 ******************************************************************************/
1458 
embDbiSortWriteEntry(AjPFile entFile,ajuint maxidlen,const AjPStr dbname,ajuint nfiles,AjBool cleanup,const AjPStr sortopt)1459 ajuint embDbiSortWriteEntry(AjPFile entFile, ajuint maxidlen,
1460                             const AjPStr dbname, ajuint nfiles,
1461                             AjBool cleanup, const AjPStr sortopt)
1462 {
1463     AjPFile esortfile;
1464     ajint rpos;
1465     ajint spos;
1466     ajint filenum;
1467     ajuint idcnt = 0;
1468 
1469     if(!dbiRegEntryIdSort)
1470 	dbiRegEntryIdSort =
1471 	    ajRegCompC("^([^ ]+) +([0-9]+) +([0-9]+) +([0-9]+)");
1472 
1473     embDbiSortFile(dbname, "list", "idsrt", nfiles, cleanup, sortopt);
1474     ajStrAssignC(&dbiLastId, " ");
1475     esortfile = embDbiFileIn(dbname, "idsrt");
1476 
1477     while(ajReadline(esortfile, &dbiRdLine))
1478     {
1479 	ajRegExec(dbiRegEntryIdSort, dbiRdLine);
1480 	ajRegSubI(dbiRegEntryIdSort, 1, &dbiIdStr);
1481 	ajRegSubI(dbiRegEntryIdSort, 2, &dbiTmpStr);
1482 	ajStrToInt(dbiTmpStr, &rpos);
1483 	ajRegSubI(dbiRegEntryIdSort, 3, &dbiTmpStr);
1484 	ajStrToInt(dbiTmpStr, &spos);
1485 	ajRegSubI(dbiRegEntryIdSort, 4, &dbiTmpStr);
1486 	ajStrToInt(dbiTmpStr, &filenum);
1487 
1488 	if(ajStrMatchCaseS(dbiIdStr, dbiLastId))
1489 	{
1490             ajDebug("Duplicate ID '%S' filenum: %d",
1491                     dbiIdStr, filenum);
1492             ajWarn("Duplicate ID skipped: '%S' "
1493                    "All hits will point to first ID found",
1494                    dbiIdStr);
1495             continue;
1496 	}
1497 
1498 	embDbiWriteEntryRecord(entFile, maxidlen, dbiIdStr,
1499 			       rpos, spos, filenum);
1500 	ajStrAssignS(&dbiLastId, dbiIdStr);
1501 	idcnt++;
1502     }
1503     ajFileClose(&esortfile);
1504 
1505     return idcnt;
1506 }
1507 
1508 
1509 
1510 
1511 /* @func embDbiMemWriteEntry **************************************************
1512 **
1513 ** Write entryname index for in-memory processing
1514 **
1515 ** @param [u] entFile [AjPFile] entryname index file
1516 ** @param [r] maxidlen [ajuint] Maximum entry id length
1517 ** @param [r] idlist [const AjPList] List of entry IDs to be written
1518 ** @param [w] ids [void***] AjPStr* array of IDs from list
1519 ** @return [ajuint] Number of entries written (excluding duplicates)
1520 **
1521 ** @release 2.4.0
1522 ******************************************************************************/
1523 
embDbiMemWriteEntry(AjPFile entFile,ajuint maxidlen,const AjPList idlist,void *** ids)1524 ajuint embDbiMemWriteEntry(AjPFile entFile, ajuint maxidlen,
1525 			  const AjPList idlist,
1526 			  void ***ids)
1527 {
1528     ajuint idCount;
1529     ajuint i;
1530     EmbPEntry entry;
1531     ajuint idcnt = 0;
1532 
1533     idCount = (ajuint) ajListToarray(idlist, ids);
1534     qsort(*ids, idCount, sizeof(void*), embDbiCmpId);
1535     ajDebug("ids sorted\n");
1536 
1537     for(i = 0; i < idCount; i++)
1538     {
1539 	entry = (EmbPEntry)(*ids)[i];
1540 
1541 	if(ajStrMatchCaseC(dbiIdStr, entry->entry))
1542 	{
1543 	    ajErr("Duplicate ID found: '%S'", dbiIdStr);
1544 	    continue;
1545 	}
1546 
1547 	ajStrAssignC(&dbiIdStr, entry->entry);
1548 	embDbiWriteEntryRecord(entFile, maxidlen, dbiIdStr,
1549 			       entry->rpos, entry->spos, entry->filenum);
1550 	idcnt++;
1551     }
1552 
1553     return idcnt;
1554 }
1555 
1556 
1557 
1558 
1559 /* @func embDbiSortWriteFields ************************************************
1560 **
1561 ** Write the indices for a field.
1562 **
1563 ** @param [r] dbname [const AjPStr] Database name (used for temp file names)
1564 ** @param [r] release [const AjPStr] Release number as a string
1565 ** @param [r] date [const char[4]] Date
1566 ** @param [r] indexdir [const AjPStr] Index directory
1567 ** @param [r] fieldname [const AjPStr] Field name (used for temp file names)
1568 ** @param [r] maxFieldLen [ajuint] Maximum field token length
1569 ** @param [r] nfiles [ajuint] Number of data files
1570 ** @param [r] nentries [ajuint] Number of entries
1571 ** @param [r] cleanup [AjBool] Cleanup temp files if true
1572 ** @param [r] sortopt [const AjPStr] Sort command line options
1573 ** @return [ajuint] Number of unique field targets written
1574 **
1575 ** @release 2.4.0
1576 ******************************************************************************/
1577 
embDbiSortWriteFields(const AjPStr dbname,const AjPStr release,const char date[4],const AjPStr indexdir,const AjPStr fieldname,ajuint maxFieldLen,ajuint nfiles,ajuint nentries,AjBool cleanup,const AjPStr sortopt)1578 ajuint embDbiSortWriteFields(const AjPStr dbname, const AjPStr release,
1579 			    const char date[4], const AjPStr indexdir,
1580 			    const AjPStr fieldname, ajuint maxFieldLen,
1581 			    ajuint nfiles, ajuint nentries,
1582 			    AjBool cleanup, const AjPStr sortopt)
1583 {
1584     AjPFile asortfile;
1585     AjPFile asrt2file;
1586     AjPFile blistfile;
1587     AjPFile elistfile;
1588     ajuint ient;
1589 
1590     ajuint fieldCount=0;
1591     ajuint idwidth;
1592 
1593     AjPFile trgFile;
1594     AjPFile hitFile;
1595     short alen;
1596     ajuint asize;
1597     ajuint ahsize;
1598     ajuint itoken = 0;
1599     ajuint i;
1600     ajuint j;
1601     ajuint k;
1602     ajint idnum;
1603     ajint lastidnum;
1604 
1605     ajStrAssignC(&dbiFieldName, dbiFieldFile(fieldname));
1606     ajFmtPrintS(&dbiTmpStr, "%d", nentries);
1607     idwidth = ajStrGetLen(dbiTmpStr);
1608 
1609     if(!dbiRegFieldIdSort)
1610 	dbiRegFieldIdSort = ajRegCompC("^([^ ]+) +");
1611 
1612     if(!dbiRegFieldTokSort)
1613 	dbiRegFieldTokSort = ajRegCompC("^([^ ]+) +([^\n\r]+)");
1614 
1615     if(!dbiRegFieldTokIdSort)
1616 	dbiRegFieldTokIdSort = ajRegCompC("^(.*[^ ]) +([0-9]+)[\r\n]+$");
1617 
1618     ajFmtPrintS(&dbiFieldId2, "%S_id2", dbiFieldName);
1619     ajFmtPrintS(&dbiFieldSort, "%S_sort", dbiFieldName);
1620     ajFmtPrintS(&dbiFieldSort2, "%S_sort2", dbiFieldName);
1621 
1622     trgFile = embDbiFileIndex(indexdir, dbiFieldName, "trg");
1623     hitFile = embDbiFileIndex(indexdir, dbiFieldName, "hit");
1624 
1625     embDbiSortFile(dbname, ajStrGetPtr(dbiFieldName),
1626 		   ajStrGetPtr(dbiFieldSort),
1627 		   nfiles, cleanup, sortopt);
1628 
1629     /* put in the entry numbers and remove the names */
1630     /* read dbname.<field>srt, for each entry, increment the count */
1631 
1632     elistfile = embDbiFileIn(dbname, "idsrt");
1633     asortfile = embDbiFileIn(dbname, ajStrGetPtr(dbiFieldSort));
1634     blistfile = embDbiFileOut(dbname, ajStrGetPtr(dbiFieldId2));
1635 
1636     fieldCount = 0;
1637 
1638     ient=0;
1639     ajStrAssignC(&dbiCurrentId, "");
1640 
1641     while(ajReadline(asortfile, &dbiRdLine))
1642     {
1643 	ajRegExec(dbiRegFieldTokSort, dbiRdLine);
1644 	ajRegSubI(dbiRegFieldTokSort, 1, &dbiIdStr);
1645 	ajRegSubI(dbiRegFieldTokSort, 2, &dbiFieldStr);
1646 
1647 	ajDebug("asortfile curr '%S' id '%S' field '%S'\n",
1648 		dbiCurrentId, dbiIdStr, dbiFieldStr);
1649 
1650 	while(!ajStrMatchS(dbiIdStr, dbiCurrentId))
1651 	{
1652 	    ajStrAssignS(&dbiFieldId, dbiCurrentId);
1653 
1654 	    if(!ajReadline(elistfile, &dbiIdLine))
1655 		ajFatal("Error in embDbiSortWriteFields, "
1656 			"expected entry %S not found, last was '%S'",
1657 			dbiIdStr, dbiCurrentId);
1658 	    ajRegExec(dbiRegFieldIdSort, dbiIdLine);
1659 	    ajRegSubI(dbiRegFieldIdSort, 1, &dbiCurrentId);
1660 
1661 	    ajDebug("curr '%S' line '%S'\n", dbiCurrentId, dbiIdLine);
1662 
1663 	    if(!ajStrMatchS(dbiFieldId, dbiCurrentId))
1664 		ient++;
1665 	    ajDebug("asortfile curr '%S' id '%S' ient: %u\n",
1666 		    dbiCurrentId, dbiIdStr, ient);
1667 	}
1668 
1669 	ajFmtPrintF(blistfile, "%S %0*d\n", dbiFieldStr, idwidth, ient);
1670 	fieldCount++;
1671     }
1672 
1673     ajFileClose(&asortfile);
1674     ajFileClose(&blistfile);
1675     ajFileClose(&elistfile);
1676 
1677     /* sort again */
1678 
1679     embDbiRmFile(dbname, ajStrGetPtr(dbiFieldSort), 0, cleanup);
1680     embDbiSortFile(dbname, ajStrGetPtr(dbiFieldId2),
1681 		   ajStrGetPtr(dbiFieldSort2),
1682 		   0, cleanup, sortopt);
1683 
1684     alen = maxFieldLen+8;
1685     asize = 300 + (fieldCount*(ajuint)alen); /* to be fixed later */
1686     embDbiHeader(trgFile, asize, fieldCount,
1687 		 alen, dbname, release, date);
1688 
1689     ahsize = 300 + (fieldCount*4);
1690     embDbiHeader(hitFile, ahsize, fieldCount, 4,
1691 		 dbname, release, date);
1692 
1693     itoken = 0;
1694     j = 0;
1695     k = 1;
1696 
1697     i = 0;
1698     lastidnum = 999999999;
1699     ajStrAssignC(&dbiFieldId, "");
1700     asrt2file = embDbiFileIn(dbname, ajStrGetPtr(dbiFieldSort2));
1701 
1702     while(ajReadline(asrt2file, &dbiRdLine))
1703     {
1704 	ajRegExec(dbiRegFieldTokIdSort, dbiRdLine);
1705 	ajRegSubI(dbiRegFieldTokIdSort, 1, &dbiIdStr);
1706 	ajRegSubI(dbiRegFieldTokIdSort, 2, &dbiTmpStr);
1707 	ajStrToInt(dbiTmpStr, &idnum);
1708 
1709 	if(!i)
1710 	    ajStrAssignS(&dbiFieldId, dbiIdStr);
1711 
1712 	if(!ajStrMatchS(dbiFieldId, dbiIdStr))
1713 	{
1714 	    embDbiWriteHit(hitFile, idnum);
1715 	    embDbiWriteTrg(trgFile, maxFieldLen,
1716 			   j, k, dbiFieldId);
1717 	    j = 1;			/* number of hits */
1718 	    k = i+1;			/* first hit */
1719 	    ajStrAssignS(&dbiFieldId, dbiIdStr);
1720 	    i++;
1721 	    itoken++;
1722 	    lastidnum=idnum;
1723 	}
1724 	else if(idnum != lastidnum)	/* dbiIdStr is the same */
1725 	{
1726 	    embDbiWriteHit(hitFile, idnum);
1727 	    lastidnum = idnum;
1728 	    j++;
1729 	    i++;
1730 	}
1731     }
1732 
1733     ajFileClose(&asrt2file);
1734     embDbiRmFile(dbname, ajStrGetPtr(dbiFieldSort2), 0, cleanup);
1735 
1736     ajDebug("targets i:%d itoken: %d\n", i, itoken);
1737 
1738     if(i)
1739     {
1740 	/* possibly there were no target tokens */
1741 	embDbiWriteTrg(trgFile, maxFieldLen,
1742 		       j, k, dbiFieldId);
1743 	itoken++;
1744     }
1745 
1746     ajDebug("wrote %F %d\n", trgFile, itoken);
1747 
1748     embDbiHeaderSize(trgFile, 300+itoken*(ajuint)alen, itoken);
1749 
1750     ajDebug("finished...\n%7d files\n%7d %F\n%7d %F\n",
1751 	    nfiles, itoken, trgFile,
1752 	    fieldCount, hitFile);
1753 
1754     ajFileClose(&trgFile);
1755     ajFileClose(&hitFile);
1756 
1757     return itoken;
1758 }
1759 
1760 
1761 
1762 
1763 /* @func embDbiMemWriteFields *************************************************
1764 **
1765 ** Write the fields indices
1766 **
1767 ** @param [r] dbname [const AjPStr] Database name (used for temp file names)
1768 ** @param [r] release [const AjPStr] Release number as a string
1769 ** @param [r] date [const char[4]] Date
1770 ** @param [r] indexdir [const AjPStr] Index directory
1771 ** @param [r] fieldname [const AjPStr] Field name (used for file names)
1772 ** @param [r] maxFieldLen [ajuint] Maximum field token length
1773 ** @param [r] fieldList [const AjPList] List of field tokens to be written
1774 ** @param [r] ids [void**] AjPStr* array offield token s from list
1775 ** @return [ajuint] Number of unique field targets written
1776 **
1777 ** @release 2.4.0
1778 ******************************************************************************/
1779 
embDbiMemWriteFields(const AjPStr dbname,const AjPStr release,const char date[4],const AjPStr indexdir,const AjPStr fieldname,ajuint maxFieldLen,const AjPList fieldList,void ** ids)1780 ajuint embDbiMemWriteFields(const AjPStr dbname,const  AjPStr release,
1781 			   const char date[4], const AjPStr indexdir,
1782 			   const AjPStr fieldname, ajuint maxFieldLen,
1783 			   const AjPList fieldList, void** ids)
1784 {
1785     AjPStr field = NULL;
1786 
1787     ajuint fieldCount = 0;
1788     ajuint ient;
1789     ajuint fieldent;
1790     ajuint i;
1791     ajuint j;
1792     ajint k;
1793     void **fieldItems = NULL;
1794     AjPFile trgFile;
1795     AjPFile hitFile;
1796     short alen;
1797     ajuint asize;
1798     ajuint ahsize;
1799     ajuint itoken = 0;
1800     ajuint idup   = 0;
1801     EmbPField fieldData    = NULL;
1802     static const char* lastfd    = "";
1803     ajuint lastidnum = 0;
1804 
1805     ajStrAssignC(&field, dbiFieldFile(fieldname));
1806     trgFile = embDbiFileIndex(indexdir, field, "trg");
1807     hitFile = embDbiFileIndex(indexdir, field, "hit");
1808 
1809     fieldCount = (ajuint) ajListToarray(fieldList, &fieldItems);
1810 
1811     ajDebug("fieldItems: %d %x\n",
1812 	    fieldCount, fieldItems);
1813 
1814     if(fieldCount)
1815     {
1816 	qsort(fieldItems, fieldCount, sizeof(void*),
1817 	      embDbiCmpFieldId);
1818 	ajDebug("%S sorted by id\n", field);
1819 	ient = 0;
1820 	fieldent = 0;
1821 
1822 	while(ids[ient] && fieldItems[fieldent])
1823 	{
1824 	    k = strcmp(((EmbPEntry)ids[ient])->entry,
1825 		       ((EmbPField)fieldItems[fieldent])->entry);
1826 	    if(k < 0)
1827 		ient++;
1828 	    else if(k > 0)
1829 		fieldent++;
1830 	    else
1831 		((EmbPField)fieldItems[fieldent++])->nid = ient+1;
1832 	}
1833 	ajDebug("checked ids: %d fieldItems: %d %d\n",
1834 		ient, fieldent, fieldCount);
1835 
1836 	qsort(fieldItems, fieldCount, sizeof(void*),
1837 	      embDbiCmpFieldField);
1838 	ajDebug("%S sorted by %S\n", field, field);
1839     }
1840 
1841     alen = maxFieldLen+8;
1842     asize = 300 + (fieldCount*(ajuint)alen); /* to be fixed later */
1843     embDbiHeader(trgFile, asize, fieldCount,
1844 		 alen, dbname, release, date);
1845 
1846     ahsize = 300 + (fieldCount*4);
1847     embDbiHeader(hitFile, ahsize, fieldCount, 4,
1848 		 dbname, release, date);
1849 
1850     itoken = 0;
1851     j      = 0;
1852     k      = 1;
1853     idup   = 0;
1854 
1855     for(i = 0; i < fieldCount; i++)
1856     {
1857 	fieldData = (EmbPField)fieldItems[i];
1858 
1859 	if(!i)
1860 	{
1861 	    lastfd = fieldData->field;
1862 	    lastidnum = 999999999;
1863 	}
1864 
1865 	if(strcmp(lastfd, fieldData->field))
1866 	{
1867 	    embDbiWriteHit(hitFile, fieldData->nid);
1868 	    ajStrAssignC(&dbiFieldStr, lastfd);
1869 	    embDbiWriteTrg(trgFile, maxFieldLen,
1870 			   j, k,dbiFieldStr);
1871 	    j = 1;
1872 	    k = i+1-idup;
1873 	    itoken++;
1874 	    lastfd = fieldData->field;
1875 	    lastidnum=fieldData->nid;
1876 	}
1877 	else if(fieldData->nid != lastidnum) /* lastfd is the same */
1878 	{
1879 	    embDbiWriteHit(hitFile, fieldData->nid);
1880 	    lastidnum = fieldData->nid;
1881 	    j++;
1882 	}
1883 	else
1884 	    idup++;
1885     }
1886 
1887     ajStrAssignC(&dbiFieldStr, lastfd);
1888 
1889     if(fieldCount)
1890     {
1891 	embDbiWriteTrg(trgFile, maxFieldLen, j, k, dbiFieldStr);
1892 	itoken++;
1893     }
1894 
1895     ajDebug("wrote %F %d\n", trgFile, itoken);
1896 
1897     embDbiHeaderSize(trgFile, 300+itoken*(ajuint)alen, itoken);
1898 
1899     ajDebug("finished...\n%7d %F\n%7d %F\n",
1900 	    itoken, trgFile,
1901 	    fieldCount, hitFile);
1902 
1903     ajFileClose(&trgFile);
1904     ajFileClose(&hitFile);
1905 
1906     ajStrDel(&field);
1907     AJFREE(fieldItems);
1908 
1909     return itoken;
1910 }
1911 
1912 
1913 
1914 
1915 /* @func embDbiDateSet ********************************************************
1916 **
1917 ** Sets the date as an integer array from a formatted string.
1918 ** The integer array is the internal format in database index headers
1919 **
1920 ** @param [r] datestr [const AjPStr] Date as a string
1921 ** @param [w] date [char[4]] Data char (1 byte int) array
1922 ** @return [void]
1923 **
1924 ** @release 2.4.0
1925 ******************************************************************************/
1926 
embDbiDateSet(const AjPStr datestr,char date[4])1927 void embDbiDateSet(const AjPStr datestr, char date[4])
1928 {
1929     ajuint i;
1930     ajint j;
1931 
1932     if(!dbiRegDate)
1933 	dbiRegDate = ajRegCompC("^([0-9]+).([0-9]+).([0-9]+)");
1934 
1935     date[3] = 0;
1936 
1937     if(ajRegExec(dbiRegDate, datestr))
1938 	for(i=1; i<4; i++)
1939 	{
1940 	    ajRegSubI(dbiRegDate, i, &dbiTmpStr);
1941 	    ajStrToInt(dbiTmpStr, &j);
1942 	    date[3-i] = j;
1943 	}
1944 
1945     return;
1946 }
1947 
1948 
1949 
1950 
1951 /* @func embDbiMaxlen *********************************************************
1952 **
1953 ** Compares a string to a maximum string length.
1954 **
1955 ** A negative maximum length limits the string to that absolute length.
1956 **
1957 ** A non-negative length is updated if the string is longer
1958 **
1959 ** @param [u] token [AjPStr*] Token string
1960 ** @param [u] maxlen [ajint*] Maximum string length
1961 ** @return [void]
1962 **
1963 ** @release 2.4.0
1964 ******************************************************************************/
1965 
embDbiMaxlen(AjPStr * token,ajint * maxlen)1966 void embDbiMaxlen(AjPStr* token, ajint* maxlen)
1967 {
1968     if(*maxlen < 0)
1969 	ajStrKeepRange(token, 1, -(*maxlen));
1970     else
1971     {
1972 	if((ajint)ajStrGetLen(*token) > *maxlen)
1973 	    *maxlen = ajStrGetLen(*token);
1974     }
1975 
1976     return;
1977 }
1978 
1979 
1980 
1981 
1982 /* @func embDbiLogHeader ******************************************************
1983 **
1984 ** Writes the header to a database indexing logfile
1985 **
1986 ** @param [u] logfile [AjPFile] Log file
1987 ** @param [r] dbname [const AjPStr] Database name
1988 ** @param [r] release [const AjPStr] Release number, name or code
1989 ** @param [r] datestr [const AjPStr] Indexing date as a string dd/mm/yy
1990 ** @param [r] indexdir [const AjPStr] Index directory relative path
1991 ** @param [r] maxindex [ajuint] Maximum index token length (usually zero)
1992 ** @return [void]
1993 **
1994 ** @release 4.0.0
1995 ******************************************************************************/
1996 
embDbiLogHeader(AjPFile logfile,const AjPStr dbname,const AjPStr release,const AjPStr datestr,const AjPStr indexdir,ajuint maxindex)1997 void embDbiLogHeader(AjPFile logfile, const AjPStr dbname,
1998 		     const AjPStr release, const AjPStr datestr,
1999 		     const AjPStr indexdir,
2000 		     ajuint maxindex)
2001 {
2002     AjPStr dirname = NULL;
2003     AjPTime today = NULL;
2004 
2005     today =  ajTimeNewTodayFmt("report");
2006     ajFmtPrintF(logfile, "########################################\n");
2007     ajFmtPrintF(logfile, "# Program: %S\n", ajUtilGetProgram());
2008     ajFmtPrintF(logfile, "# Rundate: %D\n", today);
2009     ajFmtPrintF(logfile, "# Dbname: %S\n", dbname);
2010     ajFmtPrintF(logfile, "# Release: %S\n", release);
2011     ajFmtPrintF(logfile, "# Date: %S\n", datestr);
2012     ajFmtPrintF(logfile, "# CurrentDirectory: %S\n", ajFileValueCwd());
2013     ajFmtPrintF(logfile, "# IndexDirectory: %S\n", indexdir);
2014     ajStrAssignS(&dirname, indexdir);
2015     ajDirnameFillPath(&dirname);
2016     ajFmtPrintF(logfile, "# IndexDirectoryPath: %S\n", dirname);
2017     ajFmtPrintF(logfile, "# Maxindex: %d\n", maxindex);
2018 
2019     ajTimeDel(&today);
2020     ajStrDel(&dirname);
2021 
2022     return;
2023 }
2024 
2025 
2026 
2027 
2028 /* @func embDbiLogFields ******************************************************
2029 **
2030 ** Writes database indexing logfile report of fields selected for indexing
2031 **
2032 ** @param [u] logfile [AjPFile] Log file
2033 ** @param [r] fields [AjPStr const *] Field names
2034 ** @param [r] nfields [ajuint] Number of fields
2035 ** @return [void]
2036 **
2037 ** @release 4.0.0
2038 ******************************************************************************/
2039 
void embDbiLogFields(AjPFile logfile, AjPStr const * fields, ajuint nfields)
{
    ajuint ifield;

    /*
    ** "id" is always reported as field 1, so the total is nfields+1
    ** and the caller's fields are numbered from 2.
    ** All counters are ajuint, hence the %u conversions.
    */
    ajFmtPrintF(logfile, "# Fields: %u\n", nfields+1);
    ajFmtPrintF(logfile, "#   Field 1: id\n");

    for(ifield = 0; ifield < nfields; ifield++)
        ajFmtPrintF(logfile, "#   Field %u: %S\n",
                    ifield+2, fields[ifield]);

    return;
}
2052 
2053 
2054 
2055 
2056 /* @func embDbiLogSource ******************************************************
2057 **
2058 ** Writes database indexing logfile report of source data selected for indexing
2059 **
2060 ** @param [u] logfile [AjPFile] Log file
2061 ** @param [r] directory [const AjPStr] Data directory relative path
2062 ** @param [r] filename [const AjPStr] Selected filenames wildcard
2063 ** @param [r] exclude [const AjPStr] Excluded filenames wildcard
2064 ** @param [r] inputFiles [AjPStr const *] File names
2065 ** @param [r] nfiles [ajuint] Number of files
2066 ** @return [void]
2067 **
2068 ** @release 4.0.0
2069 ******************************************************************************/
2070 
void embDbiLogSource(AjPFile logfile, const AjPStr directory,
                     const AjPStr filename, const AjPStr exclude,
                     AjPStr const * inputFiles, ajuint nfiles)
{
    AjPStr dirpath = NULL;
    ajuint ifile;

    ajFmtPrintF(logfile, "# Directory: %S\n", directory);

    /*
    ** directory is const, so a private copy is passed through
    ** ajDirnameFillPath before being reported as DirectoryPath
    */
    ajStrAssignS(&dirpath, directory);
    ajDirnameFillPath(&dirpath);
    ajFmtPrintF(logfile, "# DirectoryPath: %S\n", dirpath);

    ajFmtPrintF(logfile, "# Filenames: %S\n", filename);
    ajFmtPrintF(logfile, "# Exclude: %S\n", exclude);

    /* nfiles and the file number are ajuint: print them with %u */
    ajFmtPrintF(logfile, "# Files: %u\n", nfiles);

    /* Files are numbered from 1 in the report */
    for(ifile = 0; ifile < nfiles; ifile++)
        ajFmtPrintF(logfile, "#   File %u: %S\n",
                    ifile+1, inputFiles[ifile]);

    ajStrDel(&dirpath);

    return;
}
2093 
2094 
2095 
2096 
2097 /* @func embDbiLogCmdline *****************************************************
2098 **
2099 ** Writes database indexing logfile report of commandline used
2100 **
2101 ** @param [u] logfile [AjPFile] Log file
2102 ** @return [void]
2103 **
2104 ** @release 4.0.0
2105 ******************************************************************************/
2106 
void embDbiLogCmdline(AjPFile logfile)
{
    AjPStr text = NULL;

    /* Banner and program name */
    ajFmtPrintF(logfile, "########################################\n");
    ajFmtPrintF(logfile, "# Commandline: %S\n", ajUtilGetProgram());

    /*
    ** First the command line, then the inputs. Each is copied into a
    ** working string and, when non-empty, written with every line
    ** prefixed by "#    ". Newlines are first swapped for a "\1"
    ** placeholder plus the prefix, and the placeholder is then turned
    ** back into a newline.
    */
    ajStrAssignS(&text, ajUtilGetCmdline());

    if(ajStrGetLen(text) > 0)
    {
        ajStrExchangeCC(&text, "\n", "\1#    ");
        ajStrExchangeCC(&text, "\1", "\n");
        ajFmtPrintF(logfile, "#    %S\n", text);
    }

    /* The same working string is reused for the inputs */
    ajStrAssignS(&text, ajUtilGetInputs());

    if(ajStrGetLen(text) > 0)
    {
        ajStrExchangeCC(&text, "\n", "\1#    ");
        ajStrExchangeCC(&text, "\1", "\n");
        ajFmtPrintF(logfile, "#    %S\n", text);
    }

    /* Closing banner followed by a blank line */
    ajFmtPrintF(logfile, "########################################\n\n");

    ajStrDel(&text);

    return;
}
2136 
2137 
2138 
2139 
2140 /* @func embDbiLogFile ********************************************************
2141 **
2142 ** Writes database indexing logfile report of a single source file
2143 **
2144 ** @param [u] logfile [AjPFile] Log file
2145 ** @param [r] curfilename [const AjPStr] Source filename
2146 ** @param [r] idCountFile [ajuint] Number of IDs in file
2147 ** @param [r] fields [AjPStr const *] Field names
2148 ** @param [r] countField [const ajuint*] Number of field tokens in this file
2149 ** @param [r] nfields [ajuint] Number of fields
2150 ** @return [void]
2151 **
2152 ** @release 4.0.0
2153 ******************************************************************************/
2154 
void embDbiLogFile(AjPFile logfile, const AjPStr curfilename,
                   ajuint idCountFile, AjPStr const * fields,
                   const ajuint* countField,
                   ajuint nfields)
{
    ajuint ifield;

    ajFmtPrintF(logfile, "filename: '%S'\n", curfilename);

    /* idCountFile and countField[] are ajuint: print them with %u */
    ajFmtPrintF(logfile, "    id: %u\n", idCountFile);

    /* One line per field: field name, then its count for this file */
    for(ifield = 0; ifield < nfields; ifield++)
        ajFmtPrintF(logfile, "   %3S: %u\n",
                    fields[ifield], countField[ifield]);

    return;
}
2170 
2171 
2172 
2173 
2174 /* @func embDbiLogFinal *******************************************************
2175 **
2176 ** Writes database indexing logfile report of final totals
2177 **
2178 ** @param [u] logfile [AjPFile] Log file
2179 ** @param [r] maxindex [ajuint] User defined maximum index token length
2180 **                             (usually zero)
2181 ** @param [r] maxFieldLen [const ajint*] Maximum index token length
2182 **                                       for each field. Negative values
2183 **                                       were upper limits. Positive values
2184 **                                       are the maximum in the data
2185 ** @param [r] fields [AjPStr const *] Field names
2186 ** @param [r] fieldTot [const ajuint*] Number of unique field tokens
2187 ** @param [r] nfields [ajuint] Number of fields
2188 ** @param [r] nfiles [ajuint] Number of input files
2189 ** @param [r] idDone [ajuint] Number of unique IDs indexed
2190 ** @param [r] idCount [ajuint] Total number of IDs indexed
2191 ** @return [void]
2192 **
2193 ** @release 4.0.0
2194 ******************************************************************************/
2195 
void embDbiLogFinal(AjPFile logfile, ajuint maxindex,
                    const ajint* maxFieldLen,
                    AjPStr const * fields, const ajuint* fieldTot,
                    ajuint nfields, ajuint nfiles, ajuint idDone,
                    ajuint idCount)
{
    ajuint ifield;
    ajint maxlen;       /* signed: maxFieldLen[] entries may be negative */

    ajFmtPrintF(logfile, "\n");

    for(ifield = 0; ifield < nfields; ifield++)
    {
        /*
        ** A non-zero user-defined maxindex is reported for every field;
        ** otherwise the per-field value is reported as stored, keeping
        ** its sign.
        */
        if(maxindex)
            maxlen = (ajint) maxindex;
        else
            maxlen = maxFieldLen[ifield];

        /* maxlen is signed (%d); the token count is an ajuint (%u) */
        ajFmtPrintF(logfile, "Index %S: maxlen %d items %u\n",
                    fields[ifield], maxlen, fieldTot[ifield]);
    }

    /* Duplicates are reported as total IDs minus unique IDs */
    ajFmtPrintF(logfile, "\nTotal %u files %u entries (%u duplicates)\n",
                nfiles, idCount, (idCount-idDone));

    return;
}
2222 
2223 
2224 
2225 
2226 /* @func embDbiExit ***********************************************************
2227 **
2228 ** Cleanup database indexing internals on exit
2229 **
2230 ** @return [void]
2231 **
2232 ** @release 4.0.0
2233 ******************************************************************************/
2234 
embDbiExit(void)2235 void embDbiExit(void)
2236 {
2237     ajStrDel(&dbiCmdStr);
2238     ajStrDel(&dbiCmdStr2);
2239     ajStrDel(&dbiDirFix);
2240     ajStrDel(&dbiWildFname);
2241     ajStrDel(&dbiInFname);
2242     ajStrDel(&dbiOutFname);
2243     ajStrDel(&dbiOutRecord);
2244     ajStrDel(&dbiSortExt);
2245     ajStrDel(&dbiLastId);
2246     ajStrDel(&dbiFieldId);
2247     ajStrDel(&dbiIdStr);
2248     ajStrDel(&dbiTmpStr);
2249     ajStrDel(&dbiRdLine);
2250     ajStrDel(&dbiIdLine);
2251     ajStrDel(&dbiFieldSort);
2252     ajStrDel(&dbiFieldSort2);
2253     ajStrDel(&dbiFieldStr);
2254     ajStrDel(&dbiFieldName);
2255     ajStrDel(&dbiFieldId2);
2256     ajStrDel(&dbiCurrentId);
2257 
2258     ajRegFree(&dbiRegFieldIdSort);
2259     ajRegFree(&dbiRegFieldTokSort);
2260     ajRegFree(&dbiRegFieldTokIdSort);
2261     ajRegFree(&dbiRegEntryIdSort);
2262     ajRegFree(&dbiRegDate);
2263 
2264     return;
2265 }
2266