1 /*  $Id: esort.h 1649 2009-10-19 14:35:01Z terpstra $
2  *
3  *  esort.h - Public interface to libesort
4  *
5  *  Copyright (C) 2002 - Wesley W. Terpstra
6  *
7  *  License: GPL
8  *
9  *  Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
10  *
11  *    This program is free software; you can redistribute it and/or modify
12  *    it under the terms of the GNU General Public License as published by
13  *    the Free Software Foundation; version 2.1.
14  *
15  *    This program is distributed in the hope that it will be useful,
16  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *    GNU General Public License for more details.
19  *
20  *    You should have received a copy of the GNU General Public License
21  *    along with this program; if not, write to the Free Software
22  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  */
24 
25 #ifndef ESORT_H
26 #define ESORT_H
27 
28 #include <string>
29 #include <memory>
30 
31 /* What is ESort? -- An Online External Sort
32  *
33  * ESort provides a very primitive database API: An insert-only set of keys.
34  *
35  * There are very different trade-offs when compared with a standard database.
36  *
37  * N = number of keys in the set, M = number of operations
38  * The seeks needed to for each repeated operation in one transaction:
39  *
40  *                    ESort           BTree            Hash
41  * Insertion          O(1)            O(M*log(N))      O(M)
42  * Read in sequence   O(1)            O(M)             impossible
43  * Seek to key        O(M*log^2(N))   O(M*log(N))      O(M)
44  * Delete             impossible      O(M*log(N))      O(M)
45  *
46  * From this table, one can conclude that ESort allows blindly fast insertion
47  * and sequential read, but pays in the cost to seek. This is what makes it
48  * highly appropriate for keyword indexes.
49  *
50  * An additional restriction is that ESort only supports a single-writer.
51  * An additional advantage is that ESort readers get snap-shots.
52  */
53 
54 namespace ESort
55 {
56 
57 using std::string;
58 using std::auto_ptr;
59 
60 /** These parameters specify minimum values required for a database.
61  *  If an existing database has too small a value, an error is returned.
62  *  The unique flag toggles whether the database is a set or multiset.
63  */
64 class Parameters
65 {
66  protected:
67 	unsigned int	version_;
68 	unsigned long	blockSize_;
69 	unsigned long	keySize_;
70 	bool		unique_;
71 	bool		synced_;
72 	unsigned int	keyWidth_;
73 
74  public:
75  	// keySize & blockSize are the number of bytes maximum
76 	Parameters(
77 		bool          synced    = true,
78 		bool          unique    = true,
79 		unsigned int  version   = 1,
80 		unsigned long blockSize = 8192,
81 		unsigned long keySize   = 255);
82 
83 	bool isWider(const Parameters& o);
84 
version()85 	unsigned int  version  () const { return version_;   }
blockSize()86 	unsigned long blockSize() const { return blockSize_; }
keySize()87 	unsigned long keySize  () const { return keySize_;   }
unique()88 	bool          unique   () const { return unique_;    }
synced()89 	bool          synced   () const { return synced_;    }
keyWidth()90 	unsigned int  keyWidth () const { return keyWidth_;  }
91 
minimize(const Parameters & x)92 	static Parameters minimize(const Parameters& x)
93 	{
94 		return Parameters(x.synced(), x.unique(), 1, 8, 1);
95 	}
96 };
97 
98 /** The direction in which the Walker walks the database.
99  *
100  *  A Forward  Query will find the first key >= the request
101  *  A Backward Query will find the last  key <  the request
102  *
103  *  The Forward  Query moves the iterator so the key increases
104  *  The Backward Query moves the iterator so the key decreases
105  *
106  *  In this manner Forward + Backward patition the database's keys
107  */
108 enum Direction
109 {
110 	Forward,
111 	Backward
112 };
113 
114 /** The Walker class walks the database in sorted order.
115  *  As from the chart above, each invokation of advance() pays 0 seeks.
116  */
117 class Walker
118 {
119  public:
120  	/** The current key which the walker points to.
121  	 *  This is not valid until advance is called once.
122  	 */
123  	string		key;
124 
125  	/** Advance to the next key in the database.
126  	 *  Returned is the number of bytes in common with the last key.
127  	 *  -1 is returned on error, errno = 0 -> reached EOF.
128  	 */
129 	virtual int advance() = 0;
130 
131  	virtual ~Walker();
132 };
133 
134 /** A Reader object allows you to obtain Walkers.
135  *
136  *  A Reader is atomic. When you obtain a Reader, the contents of the set
137  *  remain fixed for as long as you keep a hold of the handle. Furthermore,
138  *  a Reader will take snapshots of the database only between Writer commits.
139  */
140 class Reader
141 {
142  public:
143  	virtual ~Reader();
144 
145  	/** Obtain a Walker as described in Direction.
146  	 *  This costs O(log^2(N)) seeks, so try not to call it too often.
147  	 *
148  	 *  This operation never fails; you must call advance() before use.
149  	 */
150  	virtual auto_ptr<Walker> seek(const string& k, Direction dir) = 0;
151 
152  	/** Obtain a record Walker over only those records which are
153  	 *  prefixed by 'prefix'. When the last such is passed, eof is
154  	 *  returned.
155  	 *
156  	 *  This operation never failes; you must call advance() before use.
157  	 */
158  	virtual auto_ptr<Walker> seek(const string& prefix, const string& k, Direction dir) = 0;
159 
160  	/** Open an existing database for reading.
161  	 *  0 is returned and errno set on error.
162  	 */
163  	static auto_ptr<Reader> opendb(
164  		const string& db,
165  		const Parameters& p = Parameters::minimize(Parameters()));
166 };
167 
168 /** The Writer object allows you to insert keys.
169  *
170  *  The Writer is atomic. When a key is inserted, it immediately becomes
171  *  available in all Walkers & Readers obtained from the Writer. However,
172  *  all open Readers never see the inserted keys, and new Readers do not
173  *  see them until the Writer calls commit.
174  */
175 class Writer : public Reader
176 {
177  public:
178  	/** Insert a key into the database.
179  	 *  If an error occures, -1 is returned and errno set, else 0.
180  	 *  Be sure the key is smaller than keySize on pain of EINVAL.
181  	 */
182  	virtual int insert(const string& k) = 0;
183 
184  	/** Commit the changes to the database.
185  	 *  If an error occures, -1 is returned and errno set, else 0.
186  	 */
187  	virtual int commit() = 0;
188 
189  	/** Open a database for writing.
190  	 *  It is created if it did not exist.
191  	 *
192  	 *  The mode is identical to open(2).
193  	 *
194  	 *  0 is returned and errno set on error.
195  	 */
196  	static auto_ptr<Writer> opendb(
197  		const string& db, const Parameters& p = Parameters(),
198  		int mode = 0666);
199 };
200 
201 }
202 
203 #endif
204