1 /*
2
3 K-Way Merge Template
4 By Jordan Zimmerman
5
6 */
7
8 #ifndef KMERGE_TREE_H
9 #define KMERGE_TREE_H
10
11 /*
12
13 K-Way Merge
14
15 An implementation of "k-Way Merging" as described in "Fundamentals of Data Structures" by Horowitz/Sahni.
16
17 The idea is to merge k sorted arrays limiting the number of comparisons. A tree is built containing the
18 results of comparing the heads of each array. The top most node is always the smallest entry. Then, its
19 corresponding leaf in the tree is refilled and the tree is processed again. It's easier to see in the
20 following example:
21
22 Imagine 4 sorted arrays:
23 {5, 10, 15, 20}
24 {10, 13, 16, 19}
25 {2, 19, 26, 40}
26 {18, 22, 23, 24}
27
28 The initial tree looks like this:
29
30
31 2
32 / \
33 2 5
34 / \ / \
35 18 2 10 5
36
37 The '/' and '\' represent links. The bottom row has the leaves and they contain the heads of the arrays.
38 The rows above the leaves represent the smaller of the two child nodes. Thus, the top node is the smallest.
39 To process the next iteration, the top node gets popped and its leaf gets refilled. Then, the new leaf's
40 associated nodes are processed. So, after the 2 is taken, it is filled with 19 (the next head of its array).
41 After processing, the tree looks like this:
42
43
44 5
45 / \
46 18 5
47 / \ / \
48 18 19 10 5
49
50 So, you can see how the number of comparisons is reduced.
51
52 A good use of this is when you have a very large array that needs to be sorted. Break it up into n small
53 arrays and sort those. Then use this merge sort for the final sort. This can also be done with files. If
54 you have n sorted files, you can merge them into one sorted file. K Way Merging works best when comparing
55 is somewhat expensive.
56
57 */
58
59 #include <math.h>
60
61 #include <functional>
62
63 template <class T, class owner_t, class iterator_t, class comparitor = std::less<T> >
64 class kmerge_tree_c
65 {
66
67 public:
68 // create the tree with the given number of buckets. Call add() for each of the buckets
69 // and then call execute() to build things. Call get_top() then next() until get_top returns
70 // false.
71 kmerge_tree_c(long bucket_qty);
72 ~kmerge_tree_c();
73
74 // add a sorted collection to the tree
75 //
76 // begin/end - start end of a collection
77 void add(owner_t *owner, iterator_t begin, iterator_t end);
78
79 // process the first sort
80 void execute(void);
81
82 // advance to the next entry
83 void next(void);
84
85 // return the next entry without re-processing the tree
86 // if false is returned, the merge is complete
get_top(owner_t * & owner,iterator_t & iterator)87 bool get_top(owner_t *&owner, iterator_t& iterator)
88 {
89 if (top_node_ptr_mbr->has_iterator) {
90 owner = top_node_ptr_mbr->owner_ptr;
91 iterator = top_node_ptr_mbr->current_iterator;
92 return iterator != top_node_ptr_mbr->end_iterator;
93 }
94 else {
95 return false;
96 }
97 }
98
99 private:
100 class node_rec
101 {
102
103 public:
node_rec(void)104 node_rec(void) :
105 left_child_ptr(NULL),
106 right_child_ptr(NULL),
107 parent_ptr(NULL),
108 next_ptr(NULL),
109 previous_ptr(NULL),
110 next_leaf_ptr(NULL),
111 source_node_ptr(NULL),
112 owner_ptr(NULL),
113 has_iterator(false)
114 {
115 }
~node_rec()116 ~node_rec()
117 {
118 delete left_child_ptr;
119 delete right_child_ptr;
120 }
121
122 node_rec* left_child_ptr; // owned the left child of this node
123 node_rec* right_child_ptr; // owned the right child of this node
124 node_rec* parent_ptr; // copy the parent of this node
125 node_rec* next_ptr; // copy the next sibling
126 node_rec* previous_ptr; // copy the previous sibling
127 node_rec* next_leaf_ptr; // copy only for the bottom rows, a linked list of the buckets
128 node_rec* source_node_ptr; // copy ptr back to the node from whence this iterator came
129 owner_t *owner_ptr;
130 int has_iterator;
131 iterator_t current_iterator;
132 iterator_t end_iterator;
133
134 private:
135 node_rec(const node_rec&);
136 node_rec& operator=(const node_rec&);
137
138 };
139
140 void build_tree(void);
141 node_rec* build_levels(long number_of_levels);
142 void build_left_siblings(kmerge_tree_c::node_rec* node_ptr);
143 void build_right_siblings(kmerge_tree_c::node_rec* node_ptr);
144 void compare_nodes(kmerge_tree_c::node_rec* node_ptr);
145
146 comparitor comparitor_mbr;
147 long bucket_qty_mbr;
148 long number_of_levels_mbr;
149 node_rec* top_node_ptr_mbr; // owned
150 node_rec* first_leaf_ptr; // copy
151 node_rec* last_leaf_ptr; // copy
152
153 };
154
kmerge_tree_brute_log2(long value)155 inline long kmerge_tree_brute_log2(long value)
156 {
157
158 long square = 2;
159 long count = 1;
160 while ( square < value )
161 {
162 square *= 2;
163 ++count;
164 }
165
166 return count;
167
168 } // kmerge_tree_brute_log2
169
170 //~~~~~~~~~~class kmerge_tree_c
171
172 template <class T, class owner_t, class iterator_t, class comparitor>
kmerge_tree_c(long bucket_qty)173 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::kmerge_tree_c(long bucket_qty) :
174 bucket_qty_mbr(bucket_qty),
175 number_of_levels_mbr(bucket_qty ? ::kmerge_tree_brute_log2(bucket_qty_mbr) : 0), // don't add one - build_levels is zero based
176 top_node_ptr_mbr(NULL),
177 first_leaf_ptr(NULL),
178 last_leaf_ptr(NULL)
179 {
180
181 build_tree();
182
183 }
184
185 template <class T, class owner_t, class iterator_t, class comparitor>
~kmerge_tree_c()186 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::~kmerge_tree_c()
187 {
188
189 delete top_node_ptr_mbr;
190
191 }
192
193 /*
194 Unlike the book, I'm going to make my life easy
195 by maintaining every possible link to each node that I might need
196 */
197 template <class T, class owner_t, class iterator_t, class comparitor>
build_tree(void)198 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_tree(void)
199 {
200
201 // the book says that the number of levels is (log2 * k) + 1
202 top_node_ptr_mbr = build_levels(number_of_levels_mbr);
203 if ( top_node_ptr_mbr )
204 {
205 build_left_siblings(top_node_ptr_mbr);
206 build_right_siblings(top_node_ptr_mbr);
207 }
208
209 }
210
211 /*
212 highly recursive tree builder
213 as long as number_of_levels isn't zero, each node builds its own children
214 and updates the parent link for them.
215
216 If no children are to be built (i.e. number_of_levels is 0), then the leaf linked list is created
217 */
218 template <class T, class owner_t, class iterator_t, class comparitor>
219 typename kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec *
build_levels(long number_of_levels)220 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_levels(long number_of_levels)
221 {
222
223 node_rec* node_ptr = new node_rec;
224
225 if ( number_of_levels )
226 {
227 node_ptr->left_child_ptr = build_levels(number_of_levels - 1);
228 if ( node_ptr->left_child_ptr )
229 {
230 node_ptr->left_child_ptr->parent_ptr = node_ptr;
231 node_ptr->right_child_ptr = build_levels(number_of_levels - 1);
232 if ( node_ptr->right_child_ptr )
233 {
234 node_ptr->right_child_ptr->parent_ptr = node_ptr;
235 }
236 }
237 }
238 else
239 {
240 if ( last_leaf_ptr )
241 {
242 last_leaf_ptr->next_leaf_ptr = node_ptr;
243 last_leaf_ptr = node_ptr;
244 }
245 else
246 {
247 first_leaf_ptr = last_leaf_ptr = node_ptr;
248 }
249 }
250
251 return node_ptr;
252
253 }
254
255 /*
256 when we process a comparison, we'll have to compare two siblings
257 this code matches each link with its sibling
258
259 To get every node correct, I had to write two routines: one which works
260 with left nodes and one which works with right nodes. build_tree() starts it
261 off with the top node's children and then these two swing back and forth
262 between each other.
263 */
264
265 template <class T, class owner_t, class iterator_t, class comparitor>
build_left_siblings(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)266 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_left_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
267 {
268
269 if ( node_ptr->parent_ptr )
270 {
271 if ( node_ptr->parent_ptr->right_child_ptr )
272 {
273 node_ptr->next_ptr = node_ptr->parent_ptr->right_child_ptr;
274 node_ptr->parent_ptr->right_child_ptr->previous_ptr = node_ptr;
275 }
276 }
277
278 if ( node_ptr->left_child_ptr )
279 {
280 build_left_siblings(node_ptr->left_child_ptr);
281 }
282
283 if ( node_ptr->right_child_ptr )
284 {
285 build_right_siblings(node_ptr->right_child_ptr);
286 }
287
288 }
289
290 template <class T, class owner_t, class iterator_t, class comparitor>
build_right_siblings(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)291 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_right_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
292 {
293
294 if ( node_ptr->parent_ptr )
295 {
296 if ( node_ptr->parent_ptr->left_child_ptr )
297 {
298 node_ptr->previous_ptr = node_ptr->parent_ptr->left_child_ptr;
299 node_ptr->parent_ptr->left_child_ptr->next_ptr = node_ptr;
300 }
301 }
302
303 if ( node_ptr->right_child_ptr )
304 {
305 build_right_siblings(node_ptr->right_child_ptr);
306 }
307
308 if ( node_ptr->left_child_ptr )
309 {
310 build_left_siblings(node_ptr->left_child_ptr);
311 }
312
313 }
314
315 // fill the leaf linked list
316 template <class T, class owner_t, class iterator_t, class comparitor>
add(owner_t * owner,iterator_t begin,iterator_t end)317 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::add(owner_t *owner, iterator_t begin, iterator_t end)
318 {
319
320 if ( begin == end )
321 {
322 return;
323 }
324
325 node_rec* node_ptr = first_leaf_ptr;
326 while ( node_ptr )
327 {
328 if ( node_ptr->has_iterator )
329 {
330 node_ptr = node_ptr->next_leaf_ptr;
331 }
332 else
333 {
334 node_ptr->has_iterator = true;
335 node_ptr->owner_ptr = owner;
336 node_ptr->current_iterator = begin;
337 node_ptr->end_iterator = end;
338 break;
339 }
340 }
341 }
342
343 /*
344 fill the initial tree by comparing each sibling in the
345 leaf linked list and then factoring that up to the parents
346 This is only done once so it doesn't have to be that efficient
347 */
348 template <class T, class owner_t, class iterator_t, class comparitor>
execute(void)349 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::execute(void)
350 {
351
352 for ( long parent_level = 0; parent_level < number_of_levels_mbr; ++parent_level )
353 {
354 // the number of nodes to skip is (parent level + 1) ^ 2 - 1
355 long skip_nodes = 2;
356 for ( long skip = 0; skip < parent_level; ++skip )
357 {
358 skip_nodes *= 2;
359 }
360 --skip_nodes;
361
362 node_rec* node_ptr = first_leaf_ptr;
363 while ( node_ptr )
364 {
365 node_rec* parent_ptr = node_ptr;
366 long i;
367 for ( i = 0; i < parent_level; ++i )
368 {
369 parent_ptr = parent_ptr->parent_ptr;
370 }
371
372 compare_nodes(parent_ptr);
373
374 for ( i = 0; i < skip_nodes; ++i )
375 {
376 node_ptr = node_ptr->next_leaf_ptr;
377 }
378
379 node_ptr = node_ptr->next_leaf_ptr;
380 }
381 }
382
383 }
384
385 // compare the given node and its sibling and bubble the result up to the parent
386 template <class T, class owner_t, class iterator_t, class comparitor>
compare_nodes(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)387 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::compare_nodes(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
388 {
389
390 // assert(node_ptr->parent_ptr);
391
392 node_rec* node1_ptr = NULL;
393 node_rec* node2_ptr = NULL;
394 if ( node_ptr->next_ptr )
395 {
396 node1_ptr = node_ptr;
397 node2_ptr = node_ptr->next_ptr;
398 }
399 else
400 {
401 node1_ptr = node_ptr->previous_ptr;
402 node2_ptr = node_ptr;
403 }
404
405 long result;
406 if ( !node2_ptr->has_iterator ) {
407 result = -1;
408 }
409 else if ( !node1_ptr->has_iterator) {
410 result = 1;
411 }
412 else if ( (node1_ptr->current_iterator != node1_ptr->end_iterator) && (node2_ptr->current_iterator != node2_ptr->end_iterator) )
413 {
414 // no need to check for exact equality - we just want the lesser of the two
415 result = comparitor_mbr(*node1_ptr->current_iterator, *node2_ptr->current_iterator) ? -1 : 1;
416 }
417 else if ( node1_ptr->current_iterator != node1_ptr->end_iterator )
418 {
419 result = -1;
420 }
421 else // if ( node2_ptr->current_iterator != node2_ptr->end_iterator )
422 {
423 result = 1;
424 }
425
426 node_rec* winner_ptr = (result <= 0) ? node1_ptr : node2_ptr;
427 node_rec* parent_ptr = node_ptr->parent_ptr;
428
429 parent_ptr->owner_ptr = winner_ptr->owner_ptr;
430 parent_ptr->has_iterator = winner_ptr->has_iterator;
431 parent_ptr->current_iterator = winner_ptr->current_iterator;
432 parent_ptr->end_iterator = winner_ptr->end_iterator;
433 parent_ptr->source_node_ptr = winner_ptr;
434
435 }
436
437 /*
438 pop the top node, factor it down the list to find its
439 leaf, replace its leaf and then factor that back up
440 */
441 template <class T, class owner_t, class iterator_t, class comparitor>
next(void)442 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::next(void)
443 {
444
445 if ( top_node_ptr_mbr->current_iterator == top_node_ptr_mbr->end_iterator )
446 {
447 return;
448 }
449
450 // find the source leaf ptr
451 node_rec* node_ptr = top_node_ptr_mbr;
452 while ( node_ptr->source_node_ptr )
453 {
454 node_ptr = node_ptr->source_node_ptr;
455 }
456
457 // assert(!node_ptr->left_child_ptr && !node_ptr->right_child_ptr);
458 // assert(top_node_ptr_mbr->current_iterator == node_ptr->current_iterator);
459
460 ++node_ptr->current_iterator;
461
462 while ( node_ptr->parent_ptr )
463 {
464 compare_nodes(node_ptr);
465 node_ptr = node_ptr->parent_ptr;
466 }
467
468 }
469
470 #endif // KMERGE_TREE_H
471
472