1 /*
2 * Copyright (c) 2021 by Farsight Security, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <stdbool.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21
22 #include "deduper.h"
23
24 struct chainlink;
25 typedef struct chainlink *chainlink_t;
26
27 struct chainlink {
28 chainlink_t next;
29 char str[];
30 };
31
32 /* keep this adjacent to the related 'struct' (chainlink). */
33 static inline size_t
chainlink_size(size_t length)34 chainlink_size(size_t length) {
35 return sizeof(struct chainlink) + length + 1;
36 }
37
38 struct deduper {
39 size_t buckets;
40 chainlink_t chains[];
41 };
42
43 /* keep this adjacent to the related 'struct' (deduper). */
44 static inline size_t
deduper_size(size_t buckets)45 deduper_size(size_t buckets) {
46 return sizeof(struct deduper) + buckets * sizeof(chainlink_t);
47 }
48
49 static unsigned long hash_djb2(const char *);
50
51 /* deduper_new(buckets) -- create a deduper having a set number of buckets
52 */
53 deduper_t
deduper_new(size_t buckets)54 deduper_new(size_t buckets) {
55 deduper_t ret = malloc(deduper_size(buckets));
56 if (ret == NULL)
57 abort();
58 memset(ret, 0x00, deduper_size(buckets));
59 ret->buckets = buckets;
60 return ret;
61 }
62
63 /* deduper_tas(str) -- test and maybe set this string in a deduper
64 */
65 bool
deduper_tas(deduper_t me,const char * str)66 deduper_tas(deduper_t me, const char *str) {
67 size_t bucket = hash_djb2(str) % me->buckets;
68 for (chainlink_t chainlink = me->chains[bucket];
69 chainlink != NULL;
70 chainlink = chainlink->next)
71 if (strcmp(str, chainlink->str) == 0)
72 return true;
73 size_t len = strlen(str);
74 chainlink_t chainlink = malloc(chainlink_size(len));
75 if (chainlink == NULL)
76 abort();
77 memset(chainlink, 0, chainlink_size(len));
78 chainlink->next = me->chains[bucket];
79 strcpy(chainlink->str, str);
80 me->chains[bucket] = chainlink;
81 return false;
82 }
83
84 /* deduper_dump(out) -- for debugging, render a deduper's contents to an output
85 */
86 void
deduper_dump(deduper_t me,FILE * out)87 deduper_dump(deduper_t me, FILE *out) {
88 for (size_t bucket = 0; bucket < me->buckets; bucket++)
89 if (me->chains[bucket] != NULL) {
90 fprintf(out, "[%zu]", bucket);
91 for (chainlink_t chainlink = me->chains[bucket];
92 chainlink != NULL;
93 chainlink = chainlink->next)
94 fprintf(out, " \"%s\"", chainlink->str);
95 fprintf(out, ".\n");
96 }
97 }
98
99 /* deduper_destroy() -- release all heap storage used by a deduper
100 */
101 void
deduper_destroy(deduper_t * me)102 deduper_destroy(deduper_t *me) {
103 for (size_t bucket = 0; bucket < (*me)->buckets; bucket++) {
104 chainlink_t next = (*me)->chains[bucket];
105 if (next != NULL) {
106 (*me)->chains[bucket] = NULL;
107 for (chainlink_t chainlink = next;
108 chainlink != NULL;
109 chainlink = next) {
110 next = chainlink->next;
111 chainlink->next = NULL;
112 chainlink->str[0] = '\0';
113 free(chainlink);
114 }
115 }
116 }
117 memset(*me, 0, deduper_size((*me)->buckets));
118 free(*me);
119 *me = NULL;
120 }
121
122 /* hash_djb2() -- compute daniel j. bernstein (djb2) hash #2 over a string
123 */
124 static unsigned long
hash_djb2(const char * str)125 hash_djb2(const char *str) {
126 unsigned long hash = 5381;
127 unsigned int c;
128
129 while ((c = (unsigned char) *str++) != 0)
130 hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
131
132 return hash;
133 }
134