1 /*
2 
3 	K-Way Merge Template
4 	By Jordan Zimmerman
5 
6 */
7 
8 #ifndef KMERGE_TREE_H
9 #define KMERGE_TREE_H
10 
11 /*
12 
13 K-Way Merge
14 
15 An implementation of "k-Way Merging" as described in "Fundamentals of Data Structures" by Horowitz/Sahni.
16 
17 The idea is to merge k sorted arrays limiting the number of comparisons. A tree is built containing the
18 results of comparing the heads of each array. The top most node is always the smallest entry. Then, its
19 corresponding leaf in the tree is refilled and the tree is processed again. It's easier to see in the
20 following example:
21 
22 Imagine 4 sorted arrays:
23 {5, 10, 15, 20}
24 {10, 13, 16, 19}
25 {2, 19, 26, 40}
26 {18, 22, 23, 24}
27 
28 The initial tree looks like this:
29 
30 
31            2
32        /       \
33      2           5
34    /   \       /   \
35  18     2    10     5
36 
37 The '/' and '\' represent links. The bottom row has the leaves and they contain the heads of the arrays.
38 The rows above the leaves represent the smaller of the two child nodes. Thus, the top node is the smallest.
39 To process the next iteration, the top node gets popped and its leaf gets refilled. Then, the new leaf's
40 associated nodes are processed. So, after the 2 is taken, it is filled with 19 (the next head of its array).
41 After processing, the tree looks like this:
42 
43 
44            5
45        /       \
46      18          5
47    /   \       /   \
48  18    19    10     5
49 
50 So, you can see how the number of comparisons is reduced.
51 
52 A good use of this is when you have a very large array that needs to be sorted. Break it up into n small
53 arrays and sort those. Then use this merge sort for the final sort. This can also be done with files. If
54 you have n sorted files, you can merge them into one sorted file. K Way Merging works best when comparing
55 is somewhat expensive.
56 
57 */
58 
59 #include <math.h>
60 
61 #include <functional>
62 
63 template <class T, class owner_t, class iterator_t, class comparitor = std::less<T> >
64 class kmerge_tree_c
65 {
66 
67 public:
68 	// create the tree with the given number of buckets. Call add() for each of the buckets
69 	// and then call execute() to build things. Call get_top() then next() until get_top returns
70 	// false.
71 	kmerge_tree_c(long bucket_qty);
72 	~kmerge_tree_c();
73 
74 	// add a sorted collection to the tree
75 	//
76 	// begin/end - start end of a collection
77 	void			add(owner_t *owner, iterator_t begin, iterator_t end);
78 
79 	// process the first sort
80 	void			execute(void);
81 
82 	// advance to the next entry
83 	void			next(void);
84 
85 	// return the next entry without re-processing the tree
86 	// if false is returned, the merge is complete
get_top(owner_t * & owner,iterator_t & iterator)87 	bool			get_top(owner_t *&owner, iterator_t& iterator)
88 	{
89         if (top_node_ptr_mbr->has_iterator) {
90             owner = top_node_ptr_mbr->owner_ptr;
91             iterator = top_node_ptr_mbr->current_iterator;
92             return iterator != top_node_ptr_mbr->end_iterator;
93         }
94         else {
95             return false;
96         }
97 	}
98 
99 private:
100 	class node_rec
101 	{
102 
103 	public:
node_rec(void)104 		node_rec(void) :
105 			left_child_ptr(NULL),
106 			right_child_ptr(NULL),
107 			parent_ptr(NULL),
108 			next_ptr(NULL),
109 			previous_ptr(NULL),
110 			next_leaf_ptr(NULL),
111 			source_node_ptr(NULL),
112             owner_ptr(NULL),
113 			has_iterator(false)
114 		{
115 		}
~node_rec()116 		~node_rec()
117 		{
118 			delete left_child_ptr;
119 			delete right_child_ptr;
120 		}
121 
122 		node_rec*				left_child_ptr;		// owned	the left child of this node
123 		node_rec*				right_child_ptr;	// owned	the right child of this node
124 		node_rec*				parent_ptr;			// copy		the parent of this node
125 		node_rec*				next_ptr;			// copy		the next sibling
126 		node_rec*				previous_ptr;		// copy		the previous sibling
127 		node_rec*				next_leaf_ptr;		// copy		only for the bottom rows, a linked list of the buckets
128 		node_rec*				source_node_ptr;	// copy		ptr back to the node from whence this iterator came
129         owner_t                 *owner_ptr;
130 		int						has_iterator;
131 		iterator_t				current_iterator;
132 		iterator_t				end_iterator;
133 
134 	private:
135 		node_rec(const node_rec&);
136 		node_rec&	operator=(const node_rec&);
137 
138 	};
139 
140 	void				build_tree(void);
141 	node_rec*			build_levels(long number_of_levels);
142 	void				build_left_siblings(kmerge_tree_c::node_rec* node_ptr);
143 	void				build_right_siblings(kmerge_tree_c::node_rec* node_ptr);
144 	void				compare_nodes(kmerge_tree_c::node_rec* node_ptr);
145 
146 	comparitor	comparitor_mbr;
147 	long		bucket_qty_mbr;
148 	long		number_of_levels_mbr;
149 	node_rec*	top_node_ptr_mbr;	// owned
150 	node_rec*	first_leaf_ptr;		// copy
151 	node_rec*	last_leaf_ptr;		// copy
152 
153 };
154 
kmerge_tree_brute_log2(long value)155 inline long kmerge_tree_brute_log2(long value)
156 {
157 
158         long    square = 2;
159         long    count = 1;
160         while ( square < value )
161         {
162                 square *= 2;
163                 ++count;
164         }
165 
166         return count;
167 
168 } // kmerge_tree_brute_log2
169 
170 //~~~~~~~~~~class kmerge_tree_c
171 
172 template <class T, class owner_t, class iterator_t, class comparitor>
kmerge_tree_c(long bucket_qty)173 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::kmerge_tree_c(long bucket_qty) :
174 	bucket_qty_mbr(bucket_qty),
175     number_of_levels_mbr(bucket_qty ? ::kmerge_tree_brute_log2(bucket_qty_mbr) : 0),    // don't add one - build_levels is zero based
176 	top_node_ptr_mbr(NULL),
177 	first_leaf_ptr(NULL),
178 	last_leaf_ptr(NULL)
179 {
180 
181 	build_tree();
182 
183 }
184 
185 template <class T, class owner_t, class iterator_t, class comparitor>
~kmerge_tree_c()186 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::~kmerge_tree_c()
187 {
188 
189 	delete top_node_ptr_mbr;
190 
191 }
192 
193 /*
194 	Unlike the book, I'm going to make my life easy
195 	by maintaining every possible link to each node that I might need
196 */
197 template <class T, class owner_t, class iterator_t, class comparitor>
build_tree(void)198 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_tree(void)
199 {
200 
201 	// the book says that the number of levels is (log2 * k) + 1
202 	top_node_ptr_mbr = build_levels(number_of_levels_mbr);
203 	if ( top_node_ptr_mbr )
204 	{
205 		build_left_siblings(top_node_ptr_mbr);
206 		build_right_siblings(top_node_ptr_mbr);
207 	}
208 
209 }
210 
211 /*
212 	highly recursive tree builder
213 	as long as number_of_levels isn't zero, each node builds its own children
214 	and updates the parent link for them.
215 
216 	If no children are to be built (i.e. number_of_levels is 0), then the leaf linked list is created
217 */
218 template <class T, class owner_t, class iterator_t, class comparitor>
219 typename kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec *
build_levels(long number_of_levels)220 kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_levels(long number_of_levels)
221 {
222 
223 	node_rec*		node_ptr = new node_rec;
224 
225 	if ( number_of_levels )
226 	{
227 		node_ptr->left_child_ptr = build_levels(number_of_levels - 1);
228 		if ( node_ptr->left_child_ptr )
229 		{
230 			node_ptr->left_child_ptr->parent_ptr = node_ptr;
231 			node_ptr->right_child_ptr = build_levels(number_of_levels - 1);
232 			if ( node_ptr->right_child_ptr )
233 			{
234 				node_ptr->right_child_ptr->parent_ptr = node_ptr;
235 			}
236 		}
237 	}
238 	else
239 	{
240 		if ( last_leaf_ptr )
241 		{
242 			last_leaf_ptr->next_leaf_ptr = node_ptr;
243 			last_leaf_ptr = node_ptr;
244 		}
245 		else
246 		{
247 			first_leaf_ptr = last_leaf_ptr = node_ptr;
248 		}
249 	}
250 
251 	return node_ptr;
252 
253 }
254 
255 /*
256 	when we process a comparison, we'll have to compare two siblings
257 	this code matches each link with its sibling
258 
259 	To get every node correct, I had to write two routines: one which works
260 	with left nodes and one which works with right nodes. build_tree() starts it
261 	off with the top node's children and then these two swing back and forth
262 	between each other.
263 */
264 
265 template <class T, class owner_t, class iterator_t, class comparitor>
build_left_siblings(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)266 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_left_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
267 {
268 
269 	if ( node_ptr->parent_ptr )
270 	{
271 		if ( node_ptr->parent_ptr->right_child_ptr )
272 		{
273 			node_ptr->next_ptr = node_ptr->parent_ptr->right_child_ptr;
274 			node_ptr->parent_ptr->right_child_ptr->previous_ptr = node_ptr;
275 		}
276 	}
277 
278 	if ( node_ptr->left_child_ptr )
279 	{
280 		build_left_siblings(node_ptr->left_child_ptr);
281 	}
282 
283 	if ( node_ptr->right_child_ptr )
284 	{
285 		build_right_siblings(node_ptr->right_child_ptr);
286 	}
287 
288 }
289 
290 template <class T, class owner_t, class iterator_t, class comparitor>
build_right_siblings(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)291 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::build_right_siblings(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
292 {
293 
294 	if ( node_ptr->parent_ptr )
295 	{
296 		if ( node_ptr->parent_ptr->left_child_ptr )
297 		{
298 			node_ptr->previous_ptr = node_ptr->parent_ptr->left_child_ptr;
299 			node_ptr->parent_ptr->left_child_ptr->next_ptr = node_ptr;
300 		}
301 	}
302 
303 	if ( node_ptr->right_child_ptr )
304 	{
305 		build_right_siblings(node_ptr->right_child_ptr);
306 	}
307 
308 	if ( node_ptr->left_child_ptr )
309 	{
310 		build_left_siblings(node_ptr->left_child_ptr);
311 	}
312 
313 }
314 
315 // fill the leaf linked list
316 template <class T, class owner_t, class iterator_t, class comparitor>
add(owner_t * owner,iterator_t begin,iterator_t end)317 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::add(owner_t *owner, iterator_t begin, iterator_t end)
318 {
319 
320 	if ( begin == end )
321 	{
322 		return;
323 	}
324 
325 	node_rec*		node_ptr = first_leaf_ptr;
326 	while ( node_ptr )
327 	{
328 		if ( node_ptr->has_iterator )
329 		{
330 			node_ptr = node_ptr->next_leaf_ptr;
331 		}
332 		else
333 		{
334 			node_ptr->has_iterator = true;
335             node_ptr->owner_ptr = owner;
336 			node_ptr->current_iterator = begin;
337 			node_ptr->end_iterator = end;
338 			break;
339 		}
340 	}
341 }
342 
343 /*
344 	fill the initial tree by comparing each sibling in the
345 	leaf linked list and then factoring that up to the parents
346 	This is only done once so it doesn't have to be that efficient
347 */
348 template <class T, class owner_t, class iterator_t, class comparitor>
execute(void)349 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::execute(void)
350 {
351 
352 	for ( long parent_level = 0; parent_level < number_of_levels_mbr; ++parent_level )
353 	{
354 		// the number of nodes to skip is (parent level + 1) ^ 2 - 1
355 		long			skip_nodes = 2;
356 		for ( long skip = 0; skip < parent_level; ++skip )
357 		{
358 			skip_nodes *= 2;
359 		}
360 		--skip_nodes;
361 
362 		node_rec*		node_ptr = first_leaf_ptr;
363 		while ( node_ptr )
364 		{
365 			node_rec*	parent_ptr = node_ptr;
366 			long		i;
367 			for ( i = 0; i < parent_level; ++i )
368 			{
369 				parent_ptr = parent_ptr->parent_ptr;
370 			}
371 
372 			compare_nodes(parent_ptr);
373 
374 			for ( i = 0; i < skip_nodes; ++i )
375 			{
376 				node_ptr = node_ptr->next_leaf_ptr;
377 			}
378 
379 			node_ptr = node_ptr->next_leaf_ptr;
380 		}
381 	}
382 
383 }
384 
385 // compare the given node and its sibling and bubble the result up to the parent
386 template <class T, class owner_t, class iterator_t, class comparitor>
compare_nodes(kmerge_tree_c<T,owner_t,iterator_t,comparitor>::node_rec * node_ptr)387 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::compare_nodes(kmerge_tree_c<T, owner_t, iterator_t, comparitor>::node_rec* node_ptr)
388 {
389 
390 //	assert(node_ptr->parent_ptr);
391 
392 	node_rec*	node1_ptr = NULL;
393 	node_rec*	node2_ptr = NULL;
394 	if ( node_ptr->next_ptr )
395 	{
396 		node1_ptr = node_ptr;
397 		node2_ptr = node_ptr->next_ptr;
398 	}
399 	else
400 	{
401 		node1_ptr = node_ptr->previous_ptr;
402 		node2_ptr = node_ptr;
403 	}
404 
405 	long				result;
406     if ( !node2_ptr->has_iterator ) {
407         result = -1;
408     }
409     else if ( !node1_ptr->has_iterator) {
410         result = 1;
411     }
412 	else if ( (node1_ptr->current_iterator != node1_ptr->end_iterator) && (node2_ptr->current_iterator != node2_ptr->end_iterator) )
413 	{
414 		// no need to check for exact equality - we just want the lesser of the two
415 		result = comparitor_mbr(*node1_ptr->current_iterator, *node2_ptr->current_iterator) ? -1 : 1;
416 	}
417 	else if ( node1_ptr->current_iterator != node1_ptr->end_iterator )
418 	{
419 		result = -1;
420 	}
421 	else // if ( node2_ptr->current_iterator != node2_ptr->end_iterator )
422 	{
423 		result = 1;
424 	}
425 
426 	node_rec*	winner_ptr = (result <= 0) ? node1_ptr : node2_ptr;
427 	node_rec*	parent_ptr = node_ptr->parent_ptr;
428 
429     parent_ptr->owner_ptr = winner_ptr->owner_ptr;
430     parent_ptr->has_iterator = winner_ptr->has_iterator;
431 	parent_ptr->current_iterator = winner_ptr->current_iterator;
432 	parent_ptr->end_iterator = winner_ptr->end_iterator;
433 	parent_ptr->source_node_ptr = winner_ptr;
434 
435 }
436 
437 /*
438 	pop the top node, factor it down the list to find its
439 	leaf, replace its leaf and then factor that back up
440 */
441 template <class T, class owner_t, class iterator_t, class comparitor>
next(void)442 void kmerge_tree_c<T, owner_t, iterator_t, comparitor>::next(void)
443 {
444 
445 	if ( top_node_ptr_mbr->current_iterator == top_node_ptr_mbr->end_iterator )
446 	{
447 		return;
448 	}
449 
450 	// find the source leaf ptr
451 	node_rec*	node_ptr = top_node_ptr_mbr;
452 	while ( node_ptr->source_node_ptr )
453 	{
454 		node_ptr = node_ptr->source_node_ptr;
455 	}
456 
457 //	assert(!node_ptr->left_child_ptr && !node_ptr->right_child_ptr);
458 //	assert(top_node_ptr_mbr->current_iterator == node_ptr->current_iterator);
459 
460 	++node_ptr->current_iterator;
461 
462 	while ( node_ptr->parent_ptr )
463 	{
464 		compare_nodes(node_ptr);
465 		node_ptr = node_ptr->parent_ptr;
466 	}
467 
468 }
469 
470 #endif	// KMERGE_TREE_H
471 
472