1#include <config.h>
2
3#include <sys/types.h>
4#include <sys/stat.h>
5
6#include <errno.h>
7#include <stdio.h>
8#include <stdlib.h>
9#include <string.h>
10#include <unistd.h>
11
12#include "cunit/cyrunit.h"
13#include "lib/util.h"
14#include "lib/strhash.h"
15
16extern int verbose;
17
18static const char *repeat(int c, unsigned n)
19{
20    static char buf[1024];
21    char *p = buf;
22
23    /* always leave room for a \0 */
24    if (n >= sizeof(buf))
25        n = sizeof(buf) - 1;
26
27    memset(buf, 0, sizeof(buf));
28    while (n--) {
29        *p++ = c;
30    }
31
32    return buf;
33}
34
35static void test_repeated(void)
36{
37    /* repeated chars on the end should not obliterate earlier input */
38    unsigned suffix_lengths[] = { 15, 31, 63, 127, 255, 511, 1023 };
39    unsigned i;
40
41    for (i = 0; i < sizeof(suffix_lengths) / sizeof(suffix_lengths[0]); i++) {
42        char *cat = strconcat("cat", repeat('a', suffix_lengths[i]), NULL);
43        char *dog = strconcat("dog", repeat('a', suffix_lengths[i]), NULL);
44        char *mouse = strconcat("mouse", repeat('a', suffix_lengths[i]), NULL);
45
46        unsigned xcat = strhash(cat);
47        unsigned xdog = strhash(dog);
48        unsigned xmouse = strhash(mouse);
49
50        CU_ASSERT_NOT_EQUAL(xcat, xdog);
51        CU_ASSERT_NOT_EQUAL(xdog, xmouse);
52        CU_ASSERT_NOT_EQUAL(xmouse, xcat);
53
54        free(cat);
55        free(dog);
56        free(mouse);
57    }
58}
59
60static void test_seeded(void)
61{
62    const char *const words[] = { "lorem", "ipsum", "dolor", "sit", "amet" };
63    const size_t n_words = sizeof(words) / sizeof(words[0]);
64    unsigned hashes[n_words];
65    unsigned i, j;
66
67    memset(hashes, 0, sizeof(hashes));
68
69    /* with no seed, same input should produce same hash */
70    for (i = 0; i < n_words; i++) {
71        unsigned h1 = strhash(words[i]);
72        unsigned h2 = strhash(words[i]);
73        CU_ASSERT_EQUAL(h1, h2);
74    }
75
76    /* with explicit zero seed, same input should produce same hash */
77    for (i = 0; i < n_words; i++) {
78        unsigned h1 = strhash(words[i]);
79        unsigned h2 = strhash_seeded(0, words[i]);
80        unsigned h3 = strhash_seeded(0, words[i]);
81        CU_ASSERT_EQUAL(h1, h2);
82        CU_ASSERT_EQUAL(h2, h3);
83        CU_ASSERT_EQUAL(h3, h1);
84    }
85
86    /* with some seed, same input should produce same hash */
87    for (j = 0; j < 5; j++) {
88        uint32_t seed;
89        do {
90            seed = rand();
91        } while (seed == 0);
92
93        for (i = 0; i < n_words; i++) {
94            unsigned h1 = strhash_seeded(seed, words[i]);
95            unsigned h2 = strhash_seeded(seed, words[i]);
96            CU_ASSERT_EQUAL(h1, h2);
97        }
98    }
99
100    /* with different seed, same input should produce different hash */
101    for (i = 0; i < n_words; i++) {
102        uint32_t seed1, seed2;
103        do {
104            seed1 = rand();
105            seed2 = rand();
106        } while (seed1 == 0 || seed2 == 0 || seed1 == seed2);
107
108        unsigned h1 = strhash_seeded(seed1, words[i]);
109        unsigned h2 = strhash_seeded(seed2, words[i]);
110
111        CU_ASSERT_NOT_EQUAL(h1, h2);
112    }
113}
114
115/* We can't define-out an entire test function when a feature is missing
116 * (in this case getline), because it confuses cunit.pl. So instead we
117 * make sure it will at least compile, but then return early without doing
118 * anything if the feature we wanted was missing.
119 */
120#ifndef HAVE_GETLINE
121#define getline(a,b,c) (((void)(b)), -1)
122#endif
123
124#define NBUCKETS (0x10000)
125static void test_quality(void)
126{
127    const char *wordsfile = "/usr/share/dict/words";
128    unsigned buckets[NBUCKETS] = {0};
129    FILE *stream;
130    char *line = NULL;
131    size_t len = 0;
132    ssize_t nread;
133    unsigned i;
134    unsigned inputs = 0;
135    unsigned contains_none = 0;
136    unsigned contains_one = 0;
137    unsigned contains_many = 0;
138    unsigned contains_many_sum = 0;
139    unsigned highest_count = 0;
140    unsigned highest_count_freq = 0;
141    unsigned max_acceptable_count;
142    double load;
143
144#ifndef HAVE_GETLINE
145    /* can't do anything anyway */
146    return;
147#endif
148
149    stream = fopen(wordsfile, "r");
150    if (!stream) {
151        if (verbose)
152            fprintf(stderr, "%s: %s (skipping) ", wordsfile, strerror(errno));
153        return;
154    }
155
156    while ((nread = getline(&line, &len, stream)) != -1) {
157        /* chomp */
158        if (line[nread - 1] == '\n')
159            line[nread - 1] = '\0';
160
161        unsigned hash = strhash_seeded_djb2(0, line) % NBUCKETS;
162
163        buckets[hash]++;
164        inputs++;
165    }
166    free(line);
167
168    /* arbitrary declaration of quality: no buckets should have more
169     * than ten times the expected load
170     */
171    load = inputs * 1.0 / NBUCKETS;
172    max_acceptable_count = load * 10;
173
174    unsigned bucket_counts[max_acceptable_count + 2];
175    memset(bucket_counts, 0, sizeof(bucket_counts));
176
177    for (i = 0; i < NBUCKETS; i++) {
178        switch (buckets[i]) {
179        case 0:
180            contains_none++;
181            break;
182        case 1:
183            contains_one++;
184            break;
185        default:
186            contains_many++;
187            contains_many_sum += buckets[i];
188            break;
189        }
190
191        if (buckets[i] > max_acceptable_count) {
192            bucket_counts[max_acceptable_count+1]++;
193        }
194        else {
195            bucket_counts[buckets[i]]++;
196        }
197
198        if (buckets[i] > highest_count) {
199            highest_count = buckets[i];
200            highest_count_freq = 1;
201        }
202        else if (buckets[i] == highest_count) {
203            highest_count_freq++;
204        }
205    }
206
207    if (verbose) {
208        putc('\n', stderr);
209        fprintf(stderr, "buckets: %u inputs: %u load: %g\n",
210                        NBUCKETS, inputs, load);
211        fprintf(stderr, "empty: %u unique: %u busy: %u\n",
212                        contains_none, contains_one, contains_many);
213        fprintf(stderr, "avg count in busy buckets: %g\n",
214                        contains_many_sum * 1.0 / contains_many);
215        fprintf(stderr, "busiest %u buckets contain %u each\n",
216                        highest_count_freq, highest_count);
217        fprintf(stderr, "max acceptable count: %u\n", max_acceptable_count);
218        fprintf(stderr, "\nbucket count histogram:\ncount frequency\n");
219        for (i = 0; i <= max_acceptable_count; i++) {
220            fprintf(stderr, "%4u  %u\n", i, bucket_counts[i]);
221        }
222        fprintf(stderr, "%4u+ %u\n", max_acceptable_count + 1,
223                                     bucket_counts[max_acceptable_count + 1]);
224    }
225
226    CU_ASSERT_EQUAL(bucket_counts[max_acceptable_count + 1], 0);
227}
228#undef NBUCKETS
229
230/* vim: set ft=c: */
231