1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file dnahash.c
29 * <pre>
30 *
31 * DnaHash creation, destruction
32 * L_DNAHASH *l_dnaHashCreate()
33 * void l_dnaHashDestroy()
34 *
35 * DnaHash: Accessors and modifiers *
36 * l_int32 l_dnaHashGetCount()
37 * l_int32 l_dnaHashGetTotalCount()
38 * L_DNA *l_dnaHashGetDna()
39 * void l_dnaHashAdd()
40 *
41 * DnaHash: Operations on Dna
42 * L_DNAHASH *l_dnaHashCreateFromDna()
43 * l_int32 l_dnaRemoveDupsByHash()
44 * l_int32 l_dnaMakeHistoByHash()
45 * L_DNA *l_dnaIntersectionByHash()
46 * l_int32 l_dnaFindValByHash()
47 *
48 * (1) The DnaHash is an array of Dna. It is useful for fast
49 * storage and lookup for sets and maps. If the set or map
50 * is on a Dna itself, the hash is a simple function that
51 * maps a double to a l_uint64; otherwise the function will
52 * map a string or a (x,y) point to a l_uint64. The result of
53 * the map is the "key", which is then used with the mod
54 * function to select which Dna array is to be used. The
55 * number of arrays in a DnaHash should be a prime number.
56 * If there are N items, we set up the DnaHash array to have
57 * approximately N/20 Dna, so the average size of these arrays
58 * will be about 20 when fully populated. The number 20 was
59 * found empirically to be in a broad maximum of efficiency.
60 * (2) Note that the word "hash" is overloaded. There are actually
61 * two hashing steps: the first hashes the object to a l_uint64,
62 * called the "key", and the second uses the mod function to
63 * "hash" the "key" to the index of a particular Dna in the
64 * DnaHash array.
65 * (3) Insertion and lookup time for DnaHash is O(1). Hash collisions
66 * are easily handled (we expect an average of 20 for each key),
67 * so we can use simple (fast) hash functions: we deal with
68 * collisions by storing an array for each hash key.
69 * This can be contrasted with using rbtree for sets and
70 * maps, where insertion and lookup are O(logN) and hash functions
71 * are slower because they must be good enough (i.e, random
72 * enough with arbitrary input) to avoid collisions.
73 * (4) Hash functions that map points, strings and floats to l_uint64
74 * are given in utils.c.
75 * (5) The use of the DnaHash (and RBTree) with strings and
76 * (x,y) points can be found in string2.c and ptafunc2.c, rsp.
77 * This file has similar hash set functions, using DnaHash on
78 * two input Dna, for removing duplicates and finding the
79 * intersection. It also uses DnaHash as a hash map to find
80 * a histogram of counts from an input Dna.
81 * (6) Comparisons in running time, between DnaHash and RBTree, for
82 * large sets of strings and points, are given in prog/hashtest.c.
83 * (7) This is a very simple implementation, that expects that you
84 * know approximately (i.e., within a factor of 2 or 3) how many
85 * items are to be stored when you initialize the DnaHash.
86 * (It would be nice to modify the l_dnaHashAdd() function
87 * to increase the number of bins when the average occupation
88 * exceeds 40 or so.)
89 * (8) Useful rule of thumb for hashing collisions:
90 * For a random hashing function (say, from strings to l_uint64),
91 * the probability of a collision increases as N^2 for N much
92 * less than 2^32. The quadratic behavior switches over to
93 * approaching 1.0 around 2^32, which is the square root of 2^64.
94 * So, for example, if you have 10^7 strings, the probability
95 * of a single collision using an l_uint64 key is on the order of
96 * (10^7/10^9)^2 ~ 10^-4.
97 * For a million strings you don't need to worry about collisons
98 * (~10-6 probability), and for most applications can use the
99 * RBTree (sorting) implementation with confidence.
100 * </pre>
101 */
102
103 #include "allheaders.h"
104
105 /*--------------------------------------------------------------------------*
106 * Dna hash: Creation and destruction *
107 *--------------------------------------------------------------------------*/
108 /*!
109 * \brief l_dnaHashCreate()
110 *
111 * \param[in] nbuckets the number of buckets in the hash table,
112 * which should be prime.
113 * \param[in] initsize initial size of each allocated dna; 0 for default
114 * \return ptr to new dnahash, or NULL on error
115 *
116 * <pre>
117 * Notes:
118 * (1) Actual dna are created only as required by l_dnaHashAdd()
119 * </pre>
120 */
121 L_DNAHASH *
l_dnaHashCreate(l_int32 nbuckets,l_int32 initsize)122 l_dnaHashCreate(l_int32 nbuckets,
123 l_int32 initsize)
124 {
125 L_DNAHASH *dahash;
126
127 PROCNAME("l_dnaHashCreate");
128
129 if (nbuckets <= 0)
130 return (L_DNAHASH *)ERROR_PTR("negative hash size", procName, NULL);
131 if ((dahash = (L_DNAHASH *)LEPT_CALLOC(1, sizeof(L_DNAHASH))) == NULL)
132 return (L_DNAHASH *)ERROR_PTR("dahash not made", procName, NULL);
133 if ((dahash->dna = (L_DNA **)LEPT_CALLOC(nbuckets, sizeof(L_DNA *)))
134 == NULL) {
135 LEPT_FREE(dahash);
136 return (L_DNAHASH *)ERROR_PTR("dna ptr array not made", procName, NULL);
137 }
138
139 dahash->nbuckets = nbuckets;
140 dahash->initsize = initsize;
141 return dahash;
142 }
143
144
145 /*!
146 * \brief l_dnaHashDestroy()
147 *
148 * \param[in,out] pdahash to be nulled, if it exists
149 * \return void
150 */
151 void
l_dnaHashDestroy(L_DNAHASH ** pdahash)152 l_dnaHashDestroy(L_DNAHASH **pdahash)
153 {
154 L_DNAHASH *dahash;
155 l_int32 i;
156
157 PROCNAME("l_dnaHashDestroy");
158
159 if (pdahash == NULL) {
160 L_WARNING("ptr address is NULL!\n", procName);
161 return;
162 }
163
164 if ((dahash = *pdahash) == NULL)
165 return;
166
167 for (i = 0; i < dahash->nbuckets; i++)
168 l_dnaDestroy(&dahash->dna[i]);
169 LEPT_FREE(dahash->dna);
170 LEPT_FREE(dahash);
171 *pdahash = NULL;
172 }
173
174
175 /*--------------------------------------------------------------------------*
176 * Dna hash: Accessors and modifiers *
177 *--------------------------------------------------------------------------*/
178 /*!
179 * \brief l_dnaHashGetCount()
180 *
181 * \param[in] dahash
182 * \return nbuckets allocated, or 0 on error
183 */
184 l_int32
l_dnaHashGetCount(L_DNAHASH * dahash)185 l_dnaHashGetCount(L_DNAHASH *dahash)
186 {
187
188 PROCNAME("l_dnaHashGetCount");
189
190 if (!dahash)
191 return ERROR_INT("dahash not defined", procName, 0);
192 return dahash->nbuckets;
193 }
194
195
196 /*!
197 * \brief l_dnaHashGetTotalCount()
198 *
199 * \param[in] dahash
200 * \return n number of numbers in all dna, or 0 on error
201 */
202 l_int32
l_dnaHashGetTotalCount(L_DNAHASH * dahash)203 l_dnaHashGetTotalCount(L_DNAHASH *dahash)
204 {
205 l_int32 i, n;
206 L_DNA *da;
207
208 PROCNAME("l_dnaHashGetTotalCount");
209
210 if (!dahash)
211 return ERROR_INT("dahash not defined", procName, 0);
212
213 for (i = 0, n = 0; i < dahash->nbuckets; i++) {
214 da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
215 if (da)
216 n += l_dnaGetCount(da);
217 }
218
219 return n;
220 }
221
222
223 /*!
224 * \brief l_dnaHashGetDna()
225 *
226 * \param[in] dahash
227 * \param[in] key key to be hashed into a bucket number
228 * \param[in] copyflag L_NOCOPY, L_COPY, L_CLONE
229 * \return ptr to dna
230 */
231 L_DNA *
l_dnaHashGetDna(L_DNAHASH * dahash,l_uint64 key,l_int32 copyflag)232 l_dnaHashGetDna(L_DNAHASH *dahash,
233 l_uint64 key,
234 l_int32 copyflag)
235 {
236 l_int32 bucket;
237 L_DNA *da;
238
239 PROCNAME("l_dnaHashGetDna");
240
241 if (!dahash)
242 return (L_DNA *)ERROR_PTR("dahash not defined", procName, NULL);
243 bucket = key % dahash->nbuckets;
244 da = dahash->dna[bucket];
245 if (da) {
246 if (copyflag == L_NOCOPY)
247 return da;
248 else if (copyflag == L_COPY)
249 return l_dnaCopy(da);
250 else
251 return l_dnaClone(da);
252 }
253 else
254 return NULL;
255 }
256
257
258 /*!
259 * \brief l_dnaHashAdd()
260 *
261 * \param[in] dahash
262 * \param[in] key key to be hashed into a bucket number
263 * \param[in] value float value to be appended to the specific dna
264 * \return 0 if OK; 1 on error
265 */
266 l_int32
l_dnaHashAdd(L_DNAHASH * dahash,l_uint64 key,l_float64 value)267 l_dnaHashAdd(L_DNAHASH *dahash,
268 l_uint64 key,
269 l_float64 value)
270 {
271 l_int32 bucket;
272 L_DNA *da;
273
274 PROCNAME("l_dnaHashAdd");
275
276 if (!dahash)
277 return ERROR_INT("dahash not defined", procName, 1);
278 bucket = key % dahash->nbuckets;
279 da = dahash->dna[bucket];
280 if (!da) {
281 if ((da = l_dnaCreate(dahash->initsize)) == NULL)
282 return ERROR_INT("da not made", procName, 1);
283 dahash->dna[bucket] = da;
284 }
285 l_dnaAddNumber(da, value);
286 return 0;
287 }
288
289
290 /*--------------------------------------------------------------------------*
291 * DnaHash: Operations on Dna *
292 *--------------------------------------------------------------------------*/
293 /*!
294 * \brief l_dnaHashCreateFromDna()
295 *
296 * \param[in] da
297 * \return dahash if OK; 1 on error
298 *
299 * <pre>
300 * Notes:
301 * (1) The values stored in the %dahash are indices into %da;
302 * %dahash has no use without %da.
303 * </pre>
304 */
305 L_DNAHASH *
l_dnaHashCreateFromDna(L_DNA * da)306 l_dnaHashCreateFromDna(L_DNA *da)
307 {
308 l_int32 i, n;
309 l_uint32 nsize;
310 l_uint64 key;
311 l_float64 val;
312 L_DNAHASH *dahash;
313
314 PROCNAME("l_dnaHashCreateFromDna");
315
316 if (!da)
317 return (L_DNAHASH *)ERROR_PTR("da not defined", procName, NULL);
318
319 n = l_dnaGetCount(da);
320 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
321
322 dahash = l_dnaHashCreate(nsize, 8);
323 for (i = 0; i < n; i++) {
324 l_dnaGetDValue(da, i, &val);
325 l_hashFloat64ToUint64(nsize, val, &key);
326 l_dnaHashAdd(dahash, key, (l_float64)i);
327 }
328
329 return dahash;
330 }
331
332
333 /*!
334 * \brief l_dnaRemoveDupsByHash()
335 *
336 * \param[in] das
337 * \param[out] pdad hash set
338 * \param[out] pdahash [optional] dnahash used for lookup
339 * \return 0 if OK; 1 on error
340 *
341 * <pre>
342 * Notes:
343 * (1) Generates a dna with unique values.
344 * (2) The dnahash is built up with dad to assure uniqueness.
345 * It can be used to find if an element is in the set:
346 * l_dnaFindValByHash(dad, dahash, val, &index)
347 * </pre>
348 */
349 l_int32
l_dnaRemoveDupsByHash(L_DNA * das,L_DNA ** pdad,L_DNAHASH ** pdahash)350 l_dnaRemoveDupsByHash(L_DNA *das,
351 L_DNA **pdad,
352 L_DNAHASH **pdahash)
353 {
354 l_int32 i, n, index, items;
355 l_uint32 nsize;
356 l_uint64 key;
357 l_float64 val;
358 L_DNA *dad;
359 L_DNAHASH *dahash;
360
361 PROCNAME("l_dnaRemoveDupsByHash");
362
363 if (pdahash) *pdahash = NULL;
364 if (!pdad)
365 return ERROR_INT("&dad not defined", procName, 1);
366 *pdad = NULL;
367 if (!das)
368 return ERROR_INT("das not defined", procName, 1);
369
370 n = l_dnaGetCount(das);
371 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
372 dahash = l_dnaHashCreate(nsize, 8);
373 dad = l_dnaCreate(n);
374 *pdad = dad;
375 for (i = 0, items = 0; i < n; i++) {
376 l_dnaGetDValue(das, i, &val);
377 l_dnaFindValByHash(dad, dahash, val, &index);
378 if (index < 0) { /* not found */
379 l_hashFloat64ToUint64(nsize, val, &key);
380 l_dnaHashAdd(dahash, key, (l_float64)items);
381 l_dnaAddNumber(dad, val);
382 items++;
383 }
384 }
385
386 if (pdahash)
387 *pdahash = dahash;
388 else
389 l_dnaHashDestroy(&dahash);
390 return 0;
391 }
392
393
394 /*!
395 * \brief l_dnaMakeHistoByHash()
396 *
397 * \param[in] das
398 * \param[out] pdahash hash map: val --> index
399 * \param[out] pdav array of values: index --> val
400 * \param[out] pdac histo array of counts: index --> count
401 * \return 0 if OK; 1 on error
402 *
403 * <pre>
404 * Notes:
405 * (1) Generates and returns a dna of occurrences (histogram),
406 * an aligned dna of values, and an associated hashmap.
407 * The hashmap takes %dav and a value, and points into the
408 * histogram in %dac.
409 * (2) The dna of values, %dav, is aligned with the histogram %dac,
410 * and is needed for fast lookup. It is a hash set, because
411 * the values are unique.
412 * (3) Lookup is simple:
413 * l_dnaFindValByHash(dav, dahash, val, &index);
414 * if (index >= 0)
415 * l_dnaGetIValue(dac, index, &icount);
416 * else
417 * icount = 0;
418 * </pre>
419 */
420 l_int32
l_dnaMakeHistoByHash(L_DNA * das,L_DNAHASH ** pdahash,L_DNA ** pdav,L_DNA ** pdac)421 l_dnaMakeHistoByHash(L_DNA *das,
422 L_DNAHASH **pdahash,
423 L_DNA **pdav,
424 L_DNA **pdac)
425 {
426 l_int32 i, n, nitems, index, count;
427 l_uint32 nsize;
428 l_uint64 key;
429 l_float64 val;
430 L_DNA *dac, *dav;
431 L_DNAHASH *dahash;
432
433 PROCNAME("l_dnaMakeHistoByHash");
434
435 if (pdahash) *pdahash = NULL;
436 if (pdac) *pdac = NULL;
437 if (pdav) *pdav = NULL;
438 if (!pdahash || !pdac || !pdav)
439 return ERROR_INT("&dahash, &dac, &dav not all defined", procName, 1);
440 if (!das)
441 return ERROR_INT("das not defined", procName, 1);
442 if ((n = l_dnaGetCount(das)) == 0)
443 return ERROR_INT("no data in das", procName, 1);
444
445 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
446 dahash = l_dnaHashCreate(nsize, 8);
447 dac = l_dnaCreate(n); /* histogram */
448 dav = l_dnaCreate(n); /* the values */
449 for (i = 0, nitems = 0; i < n; i++) {
450 l_dnaGetDValue(das, i, &val);
451 /* Is this value already stored in dav? */
452 l_dnaFindValByHash(dav, dahash, val, &index);
453 if (index >= 0) { /* found */
454 l_dnaGetIValue(dac, (l_float64)index, &count);
455 l_dnaSetValue(dac, (l_float64)index, count + 1);
456 } else { /* not found */
457 l_hashFloat64ToUint64(nsize, val, &key);
458 l_dnaHashAdd(dahash, key, (l_float64)nitems);
459 l_dnaAddNumber(dav, val);
460 l_dnaAddNumber(dac, 1);
461 nitems++;
462 }
463 }
464
465 *pdahash = dahash;
466 *pdac = dac;
467 *pdav = dav;
468 return 0;
469 }
470
471
472 /*!
473 * \brief l_dnaIntersectionByHash()
474 *
475 * \param[in] da1, da2
476 * \return dad intersection of the number arrays, or NULL on error
477 *
478 * <pre>
479 * Notes:
480 * (1) This uses the same method for building the intersection set
481 * as ptaIntersectionByHash() and sarrayIntersectionByHash().
482 * </pre>
483 */
484 L_DNA *
l_dnaIntersectionByHash(L_DNA * da1,L_DNA * da2)485 l_dnaIntersectionByHash(L_DNA *da1,
486 L_DNA *da2)
487 {
488 l_int32 n1, n2, nsmall, nbuckets, i, index1, index2;
489 l_uint32 nsize2;
490 l_uint64 key;
491 l_float64 val;
492 L_DNAHASH *dahash1, *dahash2;
493 L_DNA *da_small, *da_big, *dad;
494
495 PROCNAME("l_dnaIntersectionByHash");
496
497 if (!da1)
498 return (L_DNA *)ERROR_PTR("da1 not defined", procName, NULL);
499 if (!da2)
500 return (L_DNA *)ERROR_PTR("da2 not defined", procName, NULL);
501
502 /* Put the elements of the biggest array into a dnahash */
503 n1 = l_dnaGetCount(da1);
504 n2 = l_dnaGetCount(da2);
505 da_small = (n1 < n2) ? da1 : da2; /* do not destroy da_small */
506 da_big = (n1 < n2) ? da2 : da1; /* do not destroy da_big */
507 dahash1 = l_dnaHashCreateFromDna(da_big);
508
509 /* Build up the intersection of numbers. Add to %dad
510 * if the number is in da_big (using dahash1) but hasn't
511 * yet been seen in the traversal of da_small (using dahash2). */
512 dad = l_dnaCreate(0);
513 nsmall = l_dnaGetCount(da_small);
514 findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
515 dahash2 = l_dnaHashCreate(nsize2, 0);
516 nbuckets = l_dnaHashGetCount(dahash2);
517 for (i = 0; i < nsmall; i++) {
518 l_dnaGetDValue(da_small, i, &val);
519 l_dnaFindValByHash(da_big, dahash1, val, &index1);
520 if (index1 >= 0) { /* found */
521 l_dnaFindValByHash(da_small, dahash2, val, &index2);
522 if (index2 == -1) { /* not found */
523 l_dnaAddNumber(dad, val);
524 l_hashFloat64ToUint64(nbuckets, val, &key);
525 l_dnaHashAdd(dahash2, key, (l_float64)i);
526 }
527 }
528 }
529
530 l_dnaHashDestroy(&dahash1);
531 l_dnaHashDestroy(&dahash2);
532 return dad;
533 }
534
535
536 /*!
537 * \brief l_dnaFindValByHash()
538 *
539 * \param[in] da
540 * \param[in] dahash containing indices into %da
541 * \param[in] val searching for this number in %da
542 * \param[out] pindex index into da if found; -1 otherwise
543 * \return 0 if OK; 1 on error
544 *
545 * <pre>
546 * Notes:
547 * (1) Algo: hash %val into a key; hash the key to get the dna
548 * in %dahash (that holds indices into %da); traverse
549 * the dna of indices looking for %val in %da.
550 * </pre>
551 */
552 l_int32
l_dnaFindValByHash(L_DNA * da,L_DNAHASH * dahash,l_float64 val,l_int32 * pindex)553 l_dnaFindValByHash(L_DNA *da,
554 L_DNAHASH *dahash,
555 l_float64 val,
556 l_int32 *pindex)
557 {
558 l_int32 i, nbuckets, nvals, indexval;
559 l_float64 vali;
560 l_uint64 key;
561 L_DNA *da1;
562
563 PROCNAME("l_dnaFindValByHash");
564
565 if (!pindex)
566 return ERROR_INT("&index not defined", procName, 1);
567 *pindex = -1;
568 if (!da)
569 return ERROR_INT("da not defined", procName, 1);
570 if (!dahash)
571 return ERROR_INT("dahash not defined", procName, 1);
572
573 nbuckets = l_dnaHashGetCount(dahash);
574 l_hashFloat64ToUint64(nbuckets, val, &key);
575 da1 = l_dnaHashGetDna(dahash, key, L_NOCOPY);
576 if (!da1) return 0;
577
578 /* Run through da1, looking for this %val */
579 nvals = l_dnaGetCount(da1);
580 for (i = 0; i < nvals; i++) {
581 l_dnaGetIValue(da1, i, &indexval);
582 l_dnaGetDValue(da, indexval, &vali);
583 if (val == vali) {
584 *pindex = indexval;
585 return 0;
586 }
587 }
588
589 return 0;
590 }
591