1 /* prefix_string.c --- implement strings based on a prefix tree
2  *
3  * ====================================================================
4  *    Licensed to the Apache Software Foundation (ASF) under one
5  *    or more contributor license agreements.  See the NOTICE file
6  *    distributed with this work for additional information
7  *    regarding copyright ownership.  The ASF licenses this file
8  *    to you under the Apache License, Version 2.0 (the
9  *    "License"); you may not use this file except in compliance
10  *    with the License.  You may obtain a copy of the License at
11  *
12  *      http://www.apache.org/licenses/LICENSE-2.0
13  *
14  *    Unless required by applicable law or agreed to in writing,
15  *    software distributed under the License is distributed on an
16  *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17  *    KIND, either express or implied.  See the License for the
18  *    specific language governing permissions and limitations
19  *    under the License.
20  * ====================================================================
21  */
22 
23 #include <assert.h>
24 #include "private/svn_string_private.h"
25 
26 /* A node in the tree represents a common prefix.  The root node is the
27  * empty prefix.  Nodes may have up to 256 sub-nodes, each starting with
28  * a different character (possibly '\0').
29  *
30  * The nodes in the tree store only up to 8 chars of the respective common
31  * prefix, i.e. longer common prefixes must be drawn out over multiple
32  * hierarchy levels.  This is a space <-> efficiency trade-off.
33  *
34  * Strings are the leaf nodes in the tree and use a specialized, smaller
35  * data structure.  They may add 0 to 7 extra chars to the prefix.  Both
36  * data types can be discerned by the last char in the data buffer.  This
37  * must be 0 for strings (leaves) and non-0 otherwise.  Please note that
38  * ordinary nodes have a length information so that no terminating 0 is
39  * required for them.
40  */
41 
42 /* forward declaration */
43 typedef struct node_t node_t;
44 
45 /* String type and tree leaf.
46  */
47 struct svn_prefix_string__t
48 {
49   /* mandatory prefix */
50   node_t *prefix;
51 
52   /* 0 ..7 chars to add the prefix.
53    *
54    * NUL-terminated, if this is indeed a tree leaf.  We use the same struct
55    * within node_t for inner tree nodes, too.  There, DATA[7] is not NUL,
56    * meaning DATA may or may not be NUL terminated.  The actual length is
57    * provided by the node_t.length field (minus parent node length). */
58   char data[8];
59 };
60 
61 /* A node inside the tree, i.e. not a string and not a leaf (unless this is
62  * the root node).
63  *
64  * Note: keep the ordering to minimize size / alignment overhead on 64 bit
65  * machines.
66  */
67 struct node_t
68 {
69   /* pointer to the parent prefix plus the 1 .. 8 extra chars.
70    * Only the root will provide 0 extra chars. */
71   svn_prefix_string__t key;
72 
73   /* Length of the prefix from the root down to and including this one.
74    * 0 for the root node.  Only then will key.prefix be NULL. */
75   apr_uint32_t length;
76 
77   /* Number of entries used in SUB_NODES. */
78   apr_uint32_t sub_node_count;
79 
80   /* The sub-nodes, ordered by first char.  node_t and svn_prefix_string__t
81    * may be mixed here.  May be NULL.
82    * The number of allocated entries is always a power-of-two and only
83    * given implicitly by SUB_NODE_COUNT. */
84   struct node_t **sub_nodes;
85 };
86 
87 /* The actual tree structure. */
88 struct svn_prefix_tree__t
89 {
90   /* the common tree root (represents the empty prefix). */
91   node_t *root;
92 
93   /* all sub-nodes & strings will be allocated from this pool */
94   apr_pool_t *pool;
95 };
96 
97 /* Return TRUE, iff NODE is a leaf node.
98  */
99 static svn_boolean_t
is_leaf(node_t * node)100 is_leaf(node_t *node)
101 {
102   /* If this NOT a leaf node and this node has ...
103    *    ... 8 chars, data[7] will not be NUL because we only support
104    *        NUL-*terminated* strings.
105    *    ... less than 8 chars, this will be set to 0xff
106    *        (any other non-NUL would do as well but this is not valid UTF8
107    *         making it easy to recognize during debugging etc.) */
108   return node->key.data[7] == 0;
109 }
110 
111 /* Ensure that the sub-nodes array of NODE within TREE has at least one
112  * unused entry.  Re-allocate as necessary.
113  */
114 static void
auto_realloc_sub_nodes(svn_prefix_tree__t * tree,node_t * node)115 auto_realloc_sub_nodes(svn_prefix_tree__t *tree,
116                        node_t *node)
117 {
118   if (node->sub_node_count & (node->sub_node_count - 1))
119     return;
120 
121   if (node->sub_node_count == 0)
122     {
123       node->sub_nodes = apr_pcalloc(tree->pool, sizeof(*node->sub_nodes));
124     }
125   else
126     {
127       node_t **sub_nodes
128         = apr_pcalloc(tree->pool,
129                       2 * node->sub_node_count * sizeof(*sub_nodes));
130       memcpy(sub_nodes, node->sub_nodes,
131              node->sub_node_count * sizeof(*sub_nodes));
132       node->sub_nodes = sub_nodes;
133     }
134 }
135 
136 /* Given the COUNT pointers in the SUB_NODES array, return the location at
137  * which KEY is either located or would be inserted.
138  */
139 static int
search_lower_bound(node_t ** sub_nodes,unsigned char key,int count)140 search_lower_bound(node_t **sub_nodes,
141                    unsigned char key,
142                    int count)
143 {
144   int lower = 0;
145   int upper = count - 1;
146 
147   /* Binary search for the lowest position at which to insert KEY. */
148   while (lower <= upper)
149     {
150       int current = lower + (upper - lower) / 2;
151 
152       if ((unsigned char)sub_nodes[current]->key.data[0] < key)
153         lower = current + 1;
154       else
155         upper = current - 1;
156     }
157 
158   return lower;
159 }
160 
161 svn_prefix_tree__t *
svn_prefix_tree__create(apr_pool_t * pool)162 svn_prefix_tree__create(apr_pool_t *pool)
163 {
164   svn_prefix_tree__t *tree = apr_pcalloc(pool, sizeof(*tree));
165   tree->pool = pool;
166 
167   tree->root = apr_pcalloc(pool, sizeof(*tree->root));
168   tree->root->key.data[7] = '\xff'; /* This is not a leaf. See is_leaf(). */
169 
170   return tree;
171 }
172 
173 svn_prefix_string__t *
svn_prefix_string__create(svn_prefix_tree__t * tree,const char * s)174 svn_prefix_string__create(svn_prefix_tree__t *tree,
175                           const char *s)
176 {
177   svn_prefix_string__t *new_string;
178   apr_size_t len = strlen(s);
179   node_t *node = tree->root;
180   node_t *new_node;
181   int idx;
182 
183   /* walk the existing tree until we either find S or the node at which S
184    * has to be inserted */
185   while (TRUE)
186     {
187       node_t *sub_node;
188       int match = 1;
189 
190       /* index of the matching sub-node */
191       idx = node->sub_node_count
192           ? search_lower_bound(node->sub_nodes,
193                                (unsigned char)s[node->length],
194                                node->sub_node_count)
195           : 0;
196 
197       /* any (partially) matching sub-nodes? */
198       if (idx == (int)node->sub_node_count
199           || node->sub_nodes[idx]->key.data[0] != s[node->length])
200         break;
201 
202       /* Yes, it matches - at least the first character does. */
203       sub_node = node->sub_nodes[idx];
204 
205       /* fully matching sub-node? */
206       if (is_leaf(sub_node))
207         {
208           if (strcmp(sub_node->key.data, s + node->length) == 0)
209             return &sub_node->key;
210         }
211       else
212         {
213           /* The string formed by the path from the root down to
214            * SUB_NODE differs from S.
215            *
216            * Is it a prefix?  In that case, the chars added by SUB_NODE
217            * must fully match the respective chars in S. */
218           apr_size_t sub_node_len = sub_node->length - node->length;
219           if (strncmp(sub_node->key.data, s + node->length,
220                       sub_node_len) == 0)
221             {
222               node = sub_node;
223               continue;
224             }
225         }
226 
227       /* partial match -> split
228        *
229        * At this point, S may either be a prefix to the string represented
230        * by SUB_NODE, or they may diverge at some offset with
231        * SUB_NODE->KEY.DATA .
232        *
233        * MATCH starts with 1 here b/c we already know that at least one
234        * char matches.  Also, the loop will terminate because the strings
235        * differ before SUB_NODE->KEY.DATA - either at the NUL terminator
236        * of S or some char before that.
237        */
238       while (sub_node->key.data[match] == s[node->length + match])
239         ++match;
240 
241       new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
242       new_node->key = sub_node->key;
243       new_node->length = node->length + match;
244       new_node->key.data[7] = '\xff';  /* This is not a leaf. See is_leaf(). */
245       new_node->sub_node_count = 1;
246       new_node->sub_nodes = apr_palloc(tree->pool, sizeof(node_t *));
247       new_node->sub_nodes[0] = sub_node;
248 
249       memmove(sub_node->key.data, sub_node->key.data + match, 8 - match);
250 
251       /* replace old sub-node with new one and continue lookup */
252       sub_node->key.prefix = new_node;
253       node->sub_nodes[idx] = new_node;
254       node = new_node;
255     }
256 
257   /* add sub-node(s) and final string */
258   while (len - node->length > 7)
259     {
260       new_node = apr_pcalloc(tree->pool, sizeof(*new_node));
261       new_node->key.prefix = node;
262       new_node->length = node->length + 8;
263       memcpy(new_node->key.data, s + node->length, 8);
264 
265       auto_realloc_sub_nodes(tree, node);
266       memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
267               (node->sub_node_count - idx) * sizeof(node_t *));
268 
269       /* replace old sub-node with new one and continue lookup */
270       node->sub_nodes[idx] = new_node;
271       node->sub_node_count++;
272       node = new_node;
273       idx = 0;
274     }
275 
276   new_string = apr_pcalloc(tree->pool, sizeof(*new_string));
277   new_string->prefix = node;
278   memcpy(new_string->data, s + node->length, len - node->length);
279 
280   auto_realloc_sub_nodes(tree, node);
281   memmove(node->sub_nodes + idx + 1, node->sub_nodes + idx,
282           (node->sub_node_count - idx) * sizeof(node_t *));
283 
284   node->sub_nodes[idx] = (node_t *)new_string;
285   node->sub_node_count++;
286   return new_string;
287 }
288 
289 svn_string_t *
svn_prefix_string__expand(const svn_prefix_string__t * s,apr_pool_t * pool)290 svn_prefix_string__expand(const svn_prefix_string__t *s,
291                           apr_pool_t *pool)
292 {
293   apr_size_t s_len = strlen(s->data);
294   apr_size_t len = s->prefix->length + s_len;
295   char *buffer = apr_palloc(pool, len + 1);
296 
297   svn_string_t *result = apr_pcalloc(pool, sizeof(*result));
298   result->data = buffer;
299   result->len = len;
300   buffer[len] = '\0';
301 
302   while (s->prefix)
303     {
304       memcpy(buffer + s->prefix->length, s->data, len - s->prefix->length);
305       len = s->prefix->length;
306       s = &s->prefix->key;
307     }
308 
309   return result;
310 }
311 
312 int
svn_prefix_string__compare(const svn_prefix_string__t * lhs,const svn_prefix_string__t * rhs)313 svn_prefix_string__compare(const svn_prefix_string__t *lhs,
314                            const svn_prefix_string__t *rhs)
315 {
316   const node_t *lhs_parent = lhs->prefix;
317   const node_t *rhs_parent = rhs->prefix;
318 
319   if (lhs == rhs)
320     return 0;
321 
322   /* find the common root */
323   while (lhs_parent != rhs_parent)
324     {
325       if (lhs_parent->length <= rhs_parent->length)
326         {
327           rhs = &rhs_parent->key;
328           rhs_parent = rhs_parent->key.prefix;
329         }
330       else if (rhs_parent->length <= lhs_parent->length)
331         {
332           lhs = &lhs_parent->key;
333           lhs_parent = lhs_parent->key.prefix;
334         }
335 
336       /* same tree? */
337       assert(lhs_parent && rhs_parent);
338     }
339 
340   /* at the common root, strings will differ in the first follow-up char */
341   return (int)(unsigned char)lhs->data[0] - (int)(unsigned char)rhs->data[0];
342 }
343