1 /* Copyright (C) 2011 G.P. Halkes
2    This program is free software: you can redistribute it and/or modify
3    it under the terms of the GNU General Public License version 3, as
4    published by the Free Software Foundation.
5 
6    This program is distributed in the hope that it will be useful,
7    but WITHOUT ANY WARRANTY; without even the implied warranty of
8    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
9    GNU General Public License for more details.
10 
11    You should have received a copy of the GNU General Public License
12    along with this program.  If not, see <http://www.gnu.org/licenses/>.
13 */
14 
15 #include <stdlib.h>
16 #include <string.h>
17 #include <stddef.h>
18 #include "definitions.h"
19 #include "hashtable.h"
20 
21 #define MEMBLOCK_SIZE 32768
22 #define ALIGNOF(type) offsetof(struct { char c; type member; }, member)
23 #define ROUNDUP(x, to) (((x + (to - 1)) / to) * to)
24 
25 typedef struct MemBlock {
26 	struct MemBlock *next;
27 	size_t size;
28 	size_t idx;
29 } MemBlock;
30 
31 static MemBlock *head;
32 
33 #define HASHTABLE_SIZE 8091
34 
35 typedef struct Tuple {
36 	ValueType value;
37 	size_t stringLength;
38 	struct Tuple *next;
39 	char string[1];
40 } Tuple;
41 
42 static Tuple *hashtable[HASHTABLE_SIZE];
43 static ValueType nextValue;
44 
45 ValueType baseHashMax;
46 
allocFromBlock(size_t size)47 static Tuple *allocFromBlock(size_t size) {
48 	Tuple *result;
49 
50 	size = ROUNDUP(size, ALIGNOF(Tuple));
51 	if (head == NULL || size > head->size - head->idx) {
52 		MemBlock *newBlock;
53 		if (size > 2048) {
54 			newBlock = safe_malloc(size + sizeof(MemBlock));
55 			newBlock->size = size + sizeof(MemBlock);
56 		} else {
57 			newBlock = safe_malloc(MEMBLOCK_SIZE);
58 			newBlock->size = MEMBLOCK_SIZE;
59 		}
60 		newBlock->idx = ROUNDUP(sizeof(MemBlock), ALIGNOF(Tuple));
61 		newBlock->next = head;
62 		head = newBlock;
63 	}
64 	result = (Tuple *)(((char *) head) + head->idx);
65 	head->idx += size;
66 	return result;
67 }
68 
freeBlocks(void)69 static void freeBlocks(void) {
70 	MemBlock *ptr;
71 	while (head != NULL) {
72 		ptr = head;
73 		head = head->next;
74 		free(ptr);
75 	}
76 }
77 
78 #ifdef PROFILE_HASH
79 static int collisions;
80 static int hits;
81 
printHashStatistics(void)82 void printHashStatistics(void) {
83 	fprintf(stderr, "Hash statistics: unique words: %d, collisions %d (%.2f%%), hits %d\n",
84 		(int) nextValue, collisions, (double) collisions * 100.0 / nextValue, hits);
85 	collisions = 0;
86 	hits = 0;
87 }
88 #endif
89 
90 /** Calculate a hash value for a string.
91 	@param key The string to hash.
92 	@return The hash value associated with the string.
93 
94 	This function uses the djb2 hash function.
95 */
hash(void * data,size_t size)96 static unsigned int hash(void *data, size_t size) {
97 	const unsigned char *ptr;
98 	size_t i;
99 	unsigned int hashValue = 5381;
100 
101 	ptr = data;
102 
103 	for (i = 0; i < size; i++)
104 		hashValue = (hashValue << 5) + hashValue + (unsigned int) *ptr++;
105 
106 	return hashValue;
107 }
108 
109 /** Get the value associated with a word.
110 	@param word The word to get the value for.
111 
112 	This function gets the value associated with a word. If the word has not been
113 	seen before a new value will be given.
114 */
getValueFromContext(CharBuffer * word)115 ValueType getValueFromContext(CharBuffer *word) {
116 	return getValue(word->data, word->used);
117 }
118 
getValue(void * data,size_t size)119 ValueType getValue(void *data, size_t size) {
120 	Tuple *tuple;
121 	unsigned int hashValue = hash(data, size) % HASHTABLE_SIZE;
122 
123 	tuple = hashtable[hashValue];
124 	/* Search the singly linked list. */
125 	while (tuple != NULL && !(size == tuple->stringLength && memcmp(data, tuple->string, size) == 0))
126 		tuple = tuple->next;
127 
128 	if (tuple == NULL) {
129 		ASSERT(nextValue != VALUE_MAX);
130 		#ifdef PROFILE_HASH
131 		if (hashtable[hashValue] != NULL)
132 			collisions++;
133 		#endif
134 		tuple = allocFromBlock(sizeof(Tuple) - 1 + size);
135 
136 		tuple->value = nextValue++;
137 		tuple->stringLength = size;
138 		memcpy(tuple->string, data, size);
139 		tuple->next = hashtable[hashValue];
140 		hashtable[hashValue] = tuple;
141 	}
142 	#ifdef PROFILE_HASH
143 	else
144 		hits++;
145 	#endif
146 	return tuple->value;
147 }
148 
getHashMax(void)149 ValueType getHashMax(void) {
150 	ValueType hashMax = nextValue;
151 	int i;
152 
153 	#ifdef PROFILE_HASH
154 	printHashStatistics();
155 	#endif
156 
157 	/* Reset the hashtable for the next iteration. */
158 	for (i = 0; i < HASHTABLE_SIZE; i++)
159 		hashtable[i] = NULL;
160 	freeBlocks();
161 
162 	nextValue = 0;
163 
164 	return hashMax;
165 }
166