1 /*
2  * Copyright (c) 2021 by Farsight Security, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *    http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdbool.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include "deduper.h"
23 
24 struct chainlink;
25 typedef struct chainlink *chainlink_t;
26 
27 struct chainlink {
28 	chainlink_t next;
29 	char str[];
30 };
31 
32 /* keep this adjacent to the related 'struct' (chainlink). */
33 static inline size_t
chainlink_size(size_t length)34 chainlink_size(size_t length) {
35 	return sizeof(struct chainlink) + length + 1;
36 }
37 
38 struct deduper {
39 	size_t buckets;
40 	chainlink_t chains[];
41 };
42 
43 /* keep this adjacent to the related 'struct' (deduper). */
44 static inline size_t
deduper_size(size_t buckets)45 deduper_size(size_t buckets) {
46 	return sizeof(struct deduper) + buckets * sizeof(chainlink_t);
47 }
48 
49 static unsigned long hash_djb2(const char *);
50 
51 /* deduper_new(buckets) -- create a deduper having a set number of buckets
52  */
53 deduper_t
deduper_new(size_t buckets)54 deduper_new(size_t buckets) {
55 	deduper_t ret = malloc(deduper_size(buckets));
56 	if (ret == NULL)
57 		abort();
58 	memset(ret, 0x00, deduper_size(buckets));
59 	ret->buckets = buckets;
60 	return ret;
61 }
62 
63 /* deduper_tas(str) -- test and maybe set this string in a deduper
64  */
65 bool
deduper_tas(deduper_t me,const char * str)66 deduper_tas(deduper_t me, const char *str) {
67 	size_t bucket = hash_djb2(str) % me->buckets;
68 	for (chainlink_t chainlink = me->chains[bucket];
69 	     chainlink != NULL;
70 	     chainlink = chainlink->next)
71 		if (strcmp(str, chainlink->str) == 0)
72 			return true;
73 	size_t len = strlen(str);
74 	chainlink_t chainlink = malloc(chainlink_size(len));
75 	if (chainlink == NULL)
76 		abort();
77 	memset(chainlink, 0, chainlink_size(len));
78 	chainlink->next = me->chains[bucket];
79 	strcpy(chainlink->str, str);
80 	me->chains[bucket] = chainlink;
81 	return false;
82 }
83 
84 /* deduper_dump(out) -- for debugging, render a deduper's contents to an output
85  */
86 void
deduper_dump(deduper_t me,FILE * out)87 deduper_dump(deduper_t me, FILE *out) {
88 	for (size_t bucket = 0; bucket < me->buckets; bucket++)
89 		if (me->chains[bucket] != NULL) {
90 			fprintf(out, "[%zu]", bucket);
91 			for (chainlink_t chainlink = me->chains[bucket];
92 			     chainlink != NULL;
93 			     chainlink = chainlink->next)
94 				fprintf(out, " \"%s\"", chainlink->str);
95 			fprintf(out, ".\n");
96 		}
97 }
98 
99 /* deduper_destroy() -- release all heap storage used by a deduper
100  */
101 void
deduper_destroy(deduper_t * me)102 deduper_destroy(deduper_t *me) {
103 	for (size_t bucket = 0; bucket < (*me)->buckets; bucket++) {
104 		chainlink_t next = (*me)->chains[bucket];
105 		if (next != NULL) {
106 		       (*me)->chains[bucket] = NULL;
107 			for (chainlink_t chainlink = next;
108 			     chainlink != NULL;
109 			     chainlink = next) {
110 				next = chainlink->next;
111 				chainlink->next = NULL;
112 				chainlink->str[0] = '\0';
113 				free(chainlink);
114 			}
115 		}
116 	}
117 	memset(*me, 0, deduper_size((*me)->buckets));
118 	free(*me);
119 	*me = NULL;
120 }
121 
122 /* hash_djb2() -- compute daniel j. bernstein (djb2) hash #2 over a string
123  */
124 static unsigned long
hash_djb2(const char * str)125 hash_djb2(const char *str) {
126 	unsigned long hash = 5381;
127 	unsigned int c;
128 
129 	while ((c = (unsigned char) *str++) != 0)
130 		hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
131 
132 	return hash;
133 }
134