1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file dnahash.c
29  * <pre>
30  *
31  *      DnaHash creation, destruction
32  *          L_DNAHASH   *l_dnaHashCreate()
33  *          void         l_dnaHashDestroy()
34  *
35  *      DnaHash: Accessors and modifiers                      *
36  *          l_int32      l_dnaHashGetCount()
37  *          l_int32      l_dnaHashGetTotalCount()
38  *          L_DNA       *l_dnaHashGetDna()
39  *          void         l_dnaHashAdd()
40  *
41  *      DnaHash: Operations on Dna
42  *          L_DNAHASH   *l_dnaHashCreateFromDna()
43  *          l_int32      l_dnaRemoveDupsByHash()
44  *          l_int32      l_dnaMakeHistoByHash()
45  *          L_DNA       *l_dnaIntersectionByHash()
46  *          l_int32      l_dnaFindValByHash()
47  *
48  *    (1) The DnaHash is an array of Dna.  It is useful for fast
49  *        storage and lookup for sets and maps.  If the set or map
50  *        is on a Dna itself, the hash is a simple function that
51  *        maps a double to a l_uint64; otherwise the function will
52  *        map a string or a (x,y) point to a l_uint64.  The result of
53  *        the map is the "key", which is then used with the mod
54  *        function to select which Dna array is to be used.  The
55  *        number of arrays in a DnaHash should be a prime number.
56  *        If there are N items, we set up the DnaHash array to have
57  *        approximately N/20 Dna, so the average size of these arrays
58  *        will be about 20 when fully populated.  The number 20 was
59  *        found empirically to be in a broad maximum of efficiency.
60  *    (2) Note that the word "hash" is overloaded.  There are actually
61  *        two hashing steps: the first hashes the object to a l_uint64,
62  *        called the "key", and the second uses the mod function to
63  *        "hash" the "key" to the index of a particular Dna in the
64  *        DnaHash array.
65  *    (3) Insertion and lookup time for DnaHash is O(1).  Hash collisions
66  *        are easily handled (we expect an average of 20 for each key),
67  *        so we can use simple (fast) hash functions: we deal with
68  *        collisions by storing an array for each hash key.
69  *        This can be contrasted with using rbtree for sets and
70  *        maps, where insertion and lookup are O(logN) and hash functions
71  *        are slower because they must be good enough (i.e, random
72  *        enough with arbitrary input) to avoid collisions.
73  *    (4) Hash functions that map points, strings and floats to l_uint64
74  *        are given in utils.c.
75  *    (5) The use of the DnaHash (and RBTree) with strings and
76  *        (x,y) points can be found in string2.c and ptafunc2.c, rsp.
77  *        This file has similar hash set functions, using DnaHash on
78  *        two input Dna, for removing duplicates and finding the
79  *        intersection.  It also uses DnaHash as a hash map to find
80  *        a histogram of counts from an input Dna.
81  *    (6) Comparisons in running time, between DnaHash and RBTree, for
82  *        large sets of strings and points, are given in prog/hashtest.c.
83  *    (7) This is a very simple implementation, that expects that you
84  *        know approximately (i.e., within a factor of 2 or 3) how many
85  *        items are to be stored when you initialize the DnaHash.
86  *        (It would be nice to modify the l_dnaHashAdd() function
87  *        to increase the number of bins when the average occupation
88  *        exceeds 40 or so.)
89  *    (8) Useful rule of thumb for hashing collisions:
90  *        For a random hashing function (say, from strings to l_uint64),
91  *        the probability of a collision increases as N^2 for N much
92  *        less than 2^32.  The quadratic behavior switches over to
93  *        approaching 1.0 around 2^32, which is the square root of 2^64.
94  *        So, for example, if you have 10^7 strings, the probability
95  *        of a single collision using an l_uint64 key is on the order of
96  *            (10^7/10^9)^2 ~ 10^-4.
97  *        For a million strings you don't need to worry about collisons
98  *        (~10-6 probability), and for most applications can use the
99  *        RBTree (sorting) implementation with confidence.
100  * </pre>
101  */
102 
103 #include "allheaders.h"
104 
105 /*--------------------------------------------------------------------------*
106  *                     Dna hash: Creation and destruction                   *
107  *--------------------------------------------------------------------------*/
108 /*!
109  * \brief   l_dnaHashCreate()
110  *
111  * \param[in]   nbuckets the number of buckets in the hash table,
112  *                       which should be prime.
113  * \param[in]   initsize initial size of each allocated dna; 0 for default
114  * \return  ptr to new dnahash, or NULL on error
115  *
116  * <pre>
117  * Notes:
118  *      (1) Actual dna are created only as required by l_dnaHashAdd()
119  * </pre>
120  */
121 L_DNAHASH *
l_dnaHashCreate(l_int32 nbuckets,l_int32 initsize)122 l_dnaHashCreate(l_int32  nbuckets,
123                 l_int32  initsize)
124 {
125 L_DNAHASH  *dahash;
126 
127     PROCNAME("l_dnaHashCreate");
128 
129     if (nbuckets <= 0)
130         return (L_DNAHASH *)ERROR_PTR("negative hash size", procName, NULL);
131     if ((dahash = (L_DNAHASH *)LEPT_CALLOC(1, sizeof(L_DNAHASH))) == NULL)
132         return (L_DNAHASH *)ERROR_PTR("dahash not made", procName, NULL);
133     if ((dahash->dna = (L_DNA **)LEPT_CALLOC(nbuckets, sizeof(L_DNA *)))
134         == NULL) {
135         LEPT_FREE(dahash);
136         return (L_DNAHASH *)ERROR_PTR("dna ptr array not made", procName, NULL);
137     }
138 
139     dahash->nbuckets = nbuckets;
140     dahash->initsize = initsize;
141     return dahash;
142 }
143 
144 
145 /*!
146  * \brief   l_dnaHashDestroy()
147  *
148  * \param[in,out]   pdahash to be nulled, if it exists
149  * \return  void
150  */
151 void
l_dnaHashDestroy(L_DNAHASH ** pdahash)152 l_dnaHashDestroy(L_DNAHASH **pdahash)
153 {
154 L_DNAHASH  *dahash;
155 l_int32    i;
156 
157     PROCNAME("l_dnaHashDestroy");
158 
159     if (pdahash == NULL) {
160         L_WARNING("ptr address is NULL!\n", procName);
161         return;
162     }
163 
164     if ((dahash = *pdahash) == NULL)
165         return;
166 
167     for (i = 0; i < dahash->nbuckets; i++)
168         l_dnaDestroy(&dahash->dna[i]);
169     LEPT_FREE(dahash->dna);
170     LEPT_FREE(dahash);
171     *pdahash = NULL;
172 }
173 
174 
175 /*--------------------------------------------------------------------------*
176  *                   Dna hash: Accessors and modifiers                      *
177  *--------------------------------------------------------------------------*/
178 /*!
179  * \brief   l_dnaHashGetCount()
180  *
181  * \param[in]    dahash
182  * \return  nbuckets allocated, or 0 on error
183  */
184 l_int32
l_dnaHashGetCount(L_DNAHASH * dahash)185 l_dnaHashGetCount(L_DNAHASH  *dahash)
186 {
187 
188     PROCNAME("l_dnaHashGetCount");
189 
190     if (!dahash)
191         return ERROR_INT("dahash not defined", procName, 0);
192     return dahash->nbuckets;
193 }
194 
195 
196 /*!
197  * \brief   l_dnaHashGetTotalCount()
198  *
199  * \param[in]    dahash
200  * \return  n number of numbers in all dna, or 0 on error
201  */
202 l_int32
l_dnaHashGetTotalCount(L_DNAHASH * dahash)203 l_dnaHashGetTotalCount(L_DNAHASH  *dahash)
204 {
205 l_int32  i, n;
206 L_DNA   *da;
207 
208     PROCNAME("l_dnaHashGetTotalCount");
209 
210     if (!dahash)
211         return ERROR_INT("dahash not defined", procName, 0);
212 
213     for (i = 0, n = 0; i < dahash->nbuckets; i++) {
214         da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
215         if (da)
216             n += l_dnaGetCount(da);
217     }
218 
219     return n;
220 }
221 
222 
223 /*!
224  * \brief   l_dnaHashGetDna()
225  *
226  * \param[in]    dahash
227  * \param[in]    key  key to be hashed into a bucket number
228  * \param[in]    copyflag L_NOCOPY, L_COPY, L_CLONE
229  * \return  ptr to dna
230  */
231 L_DNA *
l_dnaHashGetDna(L_DNAHASH * dahash,l_uint64 key,l_int32 copyflag)232 l_dnaHashGetDna(L_DNAHASH  *dahash,
233                 l_uint64    key,
234                 l_int32     copyflag)
235 {
236 l_int32  bucket;
237 L_DNA   *da;
238 
239     PROCNAME("l_dnaHashGetDna");
240 
241     if (!dahash)
242         return (L_DNA *)ERROR_PTR("dahash not defined", procName, NULL);
243     bucket = key % dahash->nbuckets;
244     da = dahash->dna[bucket];
245     if (da) {
246         if (copyflag == L_NOCOPY)
247             return da;
248         else if (copyflag == L_COPY)
249             return l_dnaCopy(da);
250         else
251             return l_dnaClone(da);
252     }
253     else
254         return NULL;
255 }
256 
257 
258 /*!
259  * \brief   l_dnaHashAdd()
260  *
261  * \param[in]    dahash
262  * \param[in]    key  key to be hashed into a bucket number
263  * \param[in]    value  float value to be appended to the specific dna
264  * \return  0 if OK; 1 on error
265  */
266 l_int32
l_dnaHashAdd(L_DNAHASH * dahash,l_uint64 key,l_float64 value)267 l_dnaHashAdd(L_DNAHASH  *dahash,
268              l_uint64    key,
269              l_float64   value)
270 {
271 l_int32  bucket;
272 L_DNA   *da;
273 
274     PROCNAME("l_dnaHashAdd");
275 
276     if (!dahash)
277         return ERROR_INT("dahash not defined", procName, 1);
278     bucket = key % dahash->nbuckets;
279     da = dahash->dna[bucket];
280     if (!da) {
281         if ((da = l_dnaCreate(dahash->initsize)) == NULL)
282             return ERROR_INT("da not made", procName, 1);
283         dahash->dna[bucket] = da;
284     }
285     l_dnaAddNumber(da, value);
286     return 0;
287 }
288 
289 
290 /*--------------------------------------------------------------------------*
291  *                      DnaHash: Operations on Dna                          *
292  *--------------------------------------------------------------------------*/
293 /*!
294  * \brief   l_dnaHashCreateFromDna()
295  *
296  * \param[in]    da
297  * \return  dahash if OK; 1 on error
298  *
299  * <pre>
300  * Notes:
301  *      (1) The values stored in the %dahash are indices into %da;
302  *          %dahash has no use without %da.
303  * </pre>
304  */
305 L_DNAHASH *
l_dnaHashCreateFromDna(L_DNA * da)306 l_dnaHashCreateFromDna(L_DNA  *da)
307 {
308 l_int32     i, n;
309 l_uint32    nsize;
310 l_uint64    key;
311 l_float64   val;
312 L_DNAHASH  *dahash;
313 
314     PROCNAME("l_dnaHashCreateFromDna");
315 
316     if (!da)
317         return (L_DNAHASH *)ERROR_PTR("da not defined", procName, NULL);
318 
319     n = l_dnaGetCount(da);
320     findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
321 
322     dahash = l_dnaHashCreate(nsize, 8);
323     for (i = 0; i < n; i++) {
324         l_dnaGetDValue(da, i, &val);
325         l_hashFloat64ToUint64(nsize, val, &key);
326         l_dnaHashAdd(dahash, key, (l_float64)i);
327     }
328 
329     return dahash;
330 }
331 
332 
333 /*!
334  * \brief   l_dnaRemoveDupsByHash()
335  *
336  * \param[in]    das
337  * \param[out]   pdad hash set
338  * \param[out]   pdahash [optional] dnahash used for lookup
339  * \return  0 if OK; 1 on error
340  *
341  * <pre>
342  * Notes:
343  *      (1) Generates a dna with unique values.
344  *      (2) The dnahash is built up with dad to assure uniqueness.
345  *          It can be used to find if an element is in the set:
346  *              l_dnaFindValByHash(dad, dahash, val, &index)
347  * </pre>
348  */
349 l_int32
l_dnaRemoveDupsByHash(L_DNA * das,L_DNA ** pdad,L_DNAHASH ** pdahash)350 l_dnaRemoveDupsByHash(L_DNA       *das,
351                       L_DNA      **pdad,
352                       L_DNAHASH  **pdahash)
353 {
354 l_int32     i, n, index, items;
355 l_uint32    nsize;
356 l_uint64    key;
357 l_float64   val;
358 L_DNA      *dad;
359 L_DNAHASH  *dahash;
360 
361     PROCNAME("l_dnaRemoveDupsByHash");
362 
363     if (pdahash) *pdahash = NULL;
364     if (!pdad)
365         return ERROR_INT("&dad not defined", procName, 1);
366     *pdad = NULL;
367     if (!das)
368         return ERROR_INT("das not defined", procName, 1);
369 
370     n = l_dnaGetCount(das);
371     findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
372     dahash = l_dnaHashCreate(nsize, 8);
373     dad = l_dnaCreate(n);
374     *pdad = dad;
375     for (i = 0, items = 0; i < n; i++) {
376         l_dnaGetDValue(das, i, &val);
377         l_dnaFindValByHash(dad, dahash, val, &index);
378         if (index < 0) {  /* not found */
379             l_hashFloat64ToUint64(nsize, val, &key);
380             l_dnaHashAdd(dahash, key, (l_float64)items);
381             l_dnaAddNumber(dad, val);
382             items++;
383         }
384     }
385 
386     if (pdahash)
387         *pdahash = dahash;
388     else
389         l_dnaHashDestroy(&dahash);
390     return 0;
391 }
392 
393 
394 /*!
395  * \brief   l_dnaMakeHistoByHash()
396  *
397  * \param[in]    das
398  * \param[out]   pdahash hash map: val --> index
399  * \param[out]   pdav array of values: index --> val
400  * \param[out]   pdac histo array of counts: index --> count
401  * \return  0 if OK; 1 on error
402  *
403  * <pre>
404  * Notes:
405  *      (1) Generates and returns a dna of occurrences (histogram),
406  *          an aligned dna of values, and an associated hashmap.
407  *          The hashmap takes %dav and a value, and points into the
408  *          histogram in %dac.
409  *      (2) The dna of values, %dav, is aligned with the histogram %dac,
410  *          and is needed for fast lookup.  It is a hash set, because
411  *          the values are unique.
412  *      (3) Lookup is simple:
413  *              l_dnaFindValByHash(dav, dahash, val, &index);
414  *              if (index >= 0)
415  *                  l_dnaGetIValue(dac, index, &icount);
416  *              else
417  *                  icount = 0;
418  * </pre>
419  */
420 l_int32
l_dnaMakeHistoByHash(L_DNA * das,L_DNAHASH ** pdahash,L_DNA ** pdav,L_DNA ** pdac)421 l_dnaMakeHistoByHash(L_DNA       *das,
422                      L_DNAHASH  **pdahash,
423                      L_DNA      **pdav,
424                      L_DNA      **pdac)
425 {
426 l_int32     i, n, nitems, index, count;
427 l_uint32    nsize;
428 l_uint64    key;
429 l_float64   val;
430 L_DNA      *dac, *dav;
431 L_DNAHASH  *dahash;
432 
433     PROCNAME("l_dnaMakeHistoByHash");
434 
435     if (pdahash) *pdahash = NULL;
436     if (pdac) *pdac = NULL;
437     if (pdav) *pdav = NULL;
438     if (!pdahash || !pdac || !pdav)
439         return ERROR_INT("&dahash, &dac, &dav not all defined", procName, 1);
440     if (!das)
441         return ERROR_INT("das not defined", procName, 1);
442     if ((n = l_dnaGetCount(das)) == 0)
443         return ERROR_INT("no data in das", procName, 1);
444 
445     findNextLargerPrime(n / 20, &nsize);  /* buckets in hash table */
446     dahash = l_dnaHashCreate(nsize, 8);
447     dac = l_dnaCreate(n);  /* histogram */
448     dav = l_dnaCreate(n);  /* the values */
449     for (i = 0, nitems = 0; i < n; i++) {
450         l_dnaGetDValue(das, i, &val);
451             /* Is this value already stored in dav? */
452         l_dnaFindValByHash(dav, dahash, val, &index);
453         if (index >= 0) {  /* found */
454             l_dnaGetIValue(dac, (l_float64)index, &count);
455             l_dnaSetValue(dac, (l_float64)index, count + 1);
456         } else {  /* not found */
457             l_hashFloat64ToUint64(nsize, val, &key);
458             l_dnaHashAdd(dahash, key, (l_float64)nitems);
459             l_dnaAddNumber(dav, val);
460             l_dnaAddNumber(dac, 1);
461             nitems++;
462         }
463     }
464 
465     *pdahash = dahash;
466     *pdac = dac;
467     *pdav = dav;
468     return 0;
469 }
470 
471 
472 /*!
473  * \brief   l_dnaIntersectionByHash()
474  *
475  * \param[in]    da1, da2
476  * \return  dad intersection of the number arrays, or NULL on error
477  *
478  * <pre>
479  * Notes:
480  *      (1) This uses the same method for building the intersection set
481  *          as ptaIntersectionByHash() and sarrayIntersectionByHash().
482  * </pre>
483  */
484 L_DNA *
l_dnaIntersectionByHash(L_DNA * da1,L_DNA * da2)485 l_dnaIntersectionByHash(L_DNA  *da1,
486                         L_DNA  *da2)
487 {
488 l_int32     n1, n2, nsmall, nbuckets, i, index1, index2;
489 l_uint32    nsize2;
490 l_uint64    key;
491 l_float64   val;
492 L_DNAHASH  *dahash1, *dahash2;
493 L_DNA      *da_small, *da_big, *dad;
494 
495     PROCNAME("l_dnaIntersectionByHash");
496 
497     if (!da1)
498         return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
499     if (!da2)
500         return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);
501 
502         /* Put the elements of the biggest array into a dnahash */
503     n1 = l_dnaGetCount(da1);
504     n2 = l_dnaGetCount(da2);
505     da_small = (n1 < n2) ? da1 : da2;   /* do not destroy da_small */
506     da_big = (n1 < n2) ? da2 : da1;   /* do not destroy da_big */
507     dahash1 = l_dnaHashCreateFromDna(da_big);
508 
509         /* Build up the intersection of numbers.  Add to %dad
510          * if the number is in da_big (using dahash1) but hasn't
511          * yet been seen in the traversal of da_small (using dahash2). */
512     dad = l_dnaCreate(0);
513     nsmall = l_dnaGetCount(da_small);
514     findNextLargerPrime(nsmall / 20, &nsize2);  /* buckets in hash table */
515     dahash2 = l_dnaHashCreate(nsize2, 0);
516     nbuckets = l_dnaHashGetCount(dahash2);
517     for (i = 0; i < nsmall; i++) {
518         l_dnaGetDValue(da_small, i, &val);
519         l_dnaFindValByHash(da_big, dahash1, val, &index1);
520         if (index1 >= 0) {  /* found */
521             l_dnaFindValByHash(da_small, dahash2, val, &index2);
522             if (index2 == -1) {  /* not found */
523                 l_dnaAddNumber(dad, val);
524                 l_hashFloat64ToUint64(nbuckets, val, &key);
525                 l_dnaHashAdd(dahash2, key, (l_float64)i);
526             }
527         }
528     }
529 
530     l_dnaHashDestroy(&dahash1);
531     l_dnaHashDestroy(&dahash2);
532     return dad;
533 }
534 
535 
536 /*!
537  * \brief   l_dnaFindValByHash()
538  *
539  * \param[in]    da
540  * \param[in]    dahash containing indices into %da
541  * \param[in]    val  searching for this number in %da
542  * \param[out]   pindex index into da if found; -1 otherwise
543  * \return  0 if OK; 1 on error
544  *
545  * <pre>
546  * Notes:
547  *      (1) Algo: hash %val into a key; hash the key to get the dna
548  *                in %dahash (that holds indices into %da); traverse
549  *                the dna of indices looking for %val in %da.
550  * </pre>
551  */
552 l_int32
l_dnaFindValByHash(L_DNA * da,L_DNAHASH * dahash,l_float64 val,l_int32 * pindex)553 l_dnaFindValByHash(L_DNA      *da,
554                    L_DNAHASH  *dahash,
555                    l_float64   val,
556                    l_int32    *pindex)
557 {
558 l_int32    i, nbuckets, nvals, indexval;
559 l_float64  vali;
560 l_uint64   key;
561 L_DNA     *da1;
562 
563     PROCNAME("l_dnaFindValByHash");
564 
565     if (!pindex)
566         return ERROR_INT("&index not defined", procName, 1);
567     *pindex = -1;
568     if (!da)
569         return ERROR_INT("da not defined", procName, 1);
570     if (!dahash)
571         return ERROR_INT("dahash not defined", procName, 1);
572 
573     nbuckets = l_dnaHashGetCount(dahash);
574     l_hashFloat64ToUint64(nbuckets, val, &key);
575     da1 = l_dnaHashGetDna(dahash, key, L_NOCOPY);
576     if (!da1) return 0;
577 
578         /* Run through da1, looking for this %val */
579     nvals = l_dnaGetCount(da1);
580     for (i = 0; i < nvals; i++) {
581         l_dnaGetIValue(da1, i, &indexval);
582         l_dnaGetDValue(da, indexval, &vali);
583         if (val == vali) {
584             *pindex = indexval;
585             return 0;
586         }
587     }
588 
589     return 0;
590 }
591