1 // storage_engine.h
2 
3 
4 /**
5  *    Copyright (C) 2018-present MongoDB, Inc.
6  *
7  *    This program is free software: you can redistribute it and/or modify
8  *    it under the terms of the Server Side Public License, version 1,
9  *    as published by MongoDB, Inc.
10  *
11  *    This program is distributed in the hope that it will be useful,
12  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *    Server Side Public License for more details.
15  *
16  *    You should have received a copy of the Server Side Public License
17  *    along with this program. If not, see
18  *    <http://www.mongodb.com/licensing/server-side-public-license>.
19  *
20  *    As a special exception, the copyright holders give permission to link the
21  *    code of portions of this program with the OpenSSL library under certain
22  *    conditions as described in each individual source file and distribute
23  *    linked combinations including the program with the OpenSSL library. You
24  *    must comply with the Server Side Public License in all respects for
25  *    all of the code used other than as permitted herein. If you modify file(s)
26  *    with this exception, you may extend this exception to your version of the
27  *    file(s), but you are not obligated to do so. If you do not wish to do so,
28  *    delete this exception statement from your version. If you delete this
29  *    exception statement from all source files in the program, then also delete
30  *    it in the license file.
31  */
32 
33 #pragma once
34 
35 #include <string>
36 #include <vector>
37 
38 #include "mongo/base/status.h"
39 #include "mongo/bson/bsonobj.h"
40 #include "mongo/bson/timestamp.h"
41 #include "mongo/util/mongoutils/str.h"
42 
43 namespace mongo {
44 
45 class DatabaseCatalogEntry;
46 class JournalListener;
47 class OperationContext;
48 class RecoveryUnit;
49 class SnapshotManager;
50 struct StorageGlobalParams;
51 class StorageEngineLockFile;
52 class StorageEngineMetadata;
53 
54 /**
55  * The StorageEngine class is the top level interface for creating a new storage
56  * engine.  All StorageEngine(s) must be registered by calling registerFactory in order
57  * to possibly be activated.
58  */
59 class StorageEngine {
60 public:
61     /**
62      * The interface for creating new instances of storage engines.
63      *
64      * A storage engine provides an instance of this class (along with an associated
65      * name) to the global environment, which then sets the global storage engine
66      * according to the provided configuration parameter.
67      */
68     class Factory {
69     public:
~Factory()70         virtual ~Factory() {}
71 
72         /**
73          * Return a new instance of the StorageEngine. The lockFile parameter may be null if
74          * params.readOnly is set. Caller owns the returned pointer.
75          */
76         virtual StorageEngine* create(const StorageGlobalParams& params,
77                                       const StorageEngineLockFile* lockFile) const = 0;
78 
79         /**
80          * Returns the name of the storage engine.
81          *
82          * Implementations that change the value of the returned string can cause
83          * data file incompatibilities.
84          */
85         virtual StringData getCanonicalName() const = 0;
86 
87         /**
88          * Validates creation options for a collection in the StorageEngine.
89          * Returns an error if the creation options are not valid.
90          *
91          * Default implementation only accepts empty objects (no options).
92          */
validateCollectionStorageOptions(const BSONObj & options)93         virtual Status validateCollectionStorageOptions(const BSONObj& options) const {
94             if (options.isEmpty())
95                 return Status::OK();
96             return Status(ErrorCodes::InvalidOptions,
97                           str::stream() << "storage engine " << getCanonicalName()
98                                         << " does not support any collection storage options");
99         }
100 
101         /**
102          * Validates creation options for an index in the StorageEngine.
103          * Returns an error if the creation options are not valid.
104          *
105          * Default implementation only accepts empty objects (no options).
106          */
validateIndexStorageOptions(const BSONObj & options)107         virtual Status validateIndexStorageOptions(const BSONObj& options) const {
108             if (options.isEmpty())
109                 return Status::OK();
110             return Status(ErrorCodes::InvalidOptions,
111                           str::stream() << "storage engine " << getCanonicalName()
112                                         << " does not support any index storage options");
113         }
114 
115         /**
116          * Validates existing metadata in the data directory against startup options.
117          * Returns an error if the storage engine initialization should not proceed
118          * due to any inconsistencies between the current startup options and the creation
119          * options stored in the metadata.
120          */
121         virtual Status validateMetadata(const StorageEngineMetadata& metadata,
122                                         const StorageGlobalParams& params) const = 0;
123 
124         /**
125          * Returns a new document suitable for storing in the data directory metadata.
126          * This document will be used by validateMetadata() to check startup options
127          * on restart.
128          */
129         virtual BSONObj createMetadataOptions(const StorageGlobalParams& params) const = 0;
130 
131         /**
132          * Returns whether the engine supports read-only mode. If read-only mode is enabled, the
133          * engine may be started on a read-only filesystem (either mounted read-only or with
134          * read-only permissions). If readOnly mode is enabled, it is undefined behavior to call
135          * methods that write data (e.g. insertRecord). This method is provided on the Factory
136          * because it must be called before the storageEngine is instantiated.
137          */
supportsReadOnly()138         virtual bool supportsReadOnly() const {
139             return false;
140         }
141     };
142 
143     /**
144      * Called after the globalStorageEngine pointer has been set up, before any other methods
145      * are called. Any initialization work that requires the ability to create OperationContexts
146      * should be done here rather than in the constructor.
147      */
finishInit()148     virtual void finishInit() {}
149 
150     /**
151      * Returns a new interface to the storage engine's recovery unit.  The recovery
152      * unit is the durability interface.  For details, see recovery_unit.h
153      *
154      * Caller owns the returned pointer.
155      */
156     virtual RecoveryUnit* newRecoveryUnit() = 0;
157 
158     /**
159      * List the databases stored in this storage engine.
160      *
161      * XXX: why doesn't this take OpCtx?
162      */
163     virtual void listDatabases(std::vector<std::string>* out) const = 0;
164 
165     /**
166      * Return the DatabaseCatalogEntry that describes the database indicated by 'db'.
167      *
168      * StorageEngine owns returned pointer.
169      * It should not be deleted by any caller.
170      */
171     virtual DatabaseCatalogEntry* getDatabaseCatalogEntry(OperationContext* opCtx,
172                                                           StringData db) = 0;
173 
174     /**
175      * Returns whether the storage engine supports its own locking locking below the collection
176      * level. If the engine returns true, MongoDB will acquire intent locks down to the
177      * collection level and will assume that the engine will ensure consistency at the level of
178      * documents. If false, MongoDB will lock the entire collection in Shared/Exclusive mode
179      * for read/write operations respectively.
180      */
181     virtual bool supportsDocLocking() const = 0;
182 
183     /**
184      * Returns whether the storage engine supports locking at a database level.
185      */
supportsDBLocking()186     virtual bool supportsDBLocking() const {
187         return true;
188     }
189 
190     /**
191      * Returns whether the engine supports a journalling concept or not.
192      */
193     virtual bool isDurable() const = 0;
194 
195     /**
196      * Returns true if the engine does not persist data to disk; false otherwise.
197      */
198     virtual bool isEphemeral() const = 0;
199 
200     /**
201      * Only MMAPv1 should override this and return true to trigger MMAPv1-specific behavior.
202      */
isMmapV1()203     virtual bool isMmapV1() const {
204         return false;
205     }
206 
207     /**
208      * Closes all file handles associated with a database.
209      */
210     virtual Status closeDatabase(OperationContext* opCtx, StringData db) = 0;
211 
212     /**
213      * Deletes all data and metadata for a database.
214      */
215     virtual Status dropDatabase(OperationContext* opCtx, StringData db) = 0;
216 
217     /**
218      * @return number of files flushed
219      */
220     virtual int flushAllFiles(OperationContext* opCtx, bool sync) = 0;
221 
222     /**
223      * Transitions the storage engine into backup mode.
224      *
225      * During backup mode the storage engine must stabilize its on-disk files, and avoid
226      * any internal processing that may involve file I/O, such as online compaction, so
227      * a filesystem level backup may be performed.
228      *
229      * Storage engines that do not support this feature should use the default implementation.
230      * Storage engines that implement this must also implement endBackup().
231      *
232      * For Storage engines that implement beginBackup the _inBackupMode variable is provided
233      * to avoid multiple instance enterting/leaving backup concurrently.
234      *
235      * If this function returns an OK status, MongoDB can call endBackup to signal the storage
236      * engine that filesystem writes may continue. This function should return a non-OK status if
237      * filesystem changes cannot be stopped to allow for online backup. If the function should be
238      * retried, returns a non-OK status. This function may throw a WriteConflictException, which
239      * should trigger a retry by the caller. All other exceptions should be treated as errors.
240      */
beginBackup(OperationContext * opCtx)241     virtual Status beginBackup(OperationContext* opCtx) {
242         return Status(ErrorCodes::CommandNotSupported,
243                       "The current storage engine doesn't support backup mode");
244     }
245 
246     /**
247      * Transitions the storage engine out of backup mode.
248      *
249      * Storage engines that do not support this feature should use the default implementation.
250      *
251      * Storage engines implementing this feature should fassert when unable to leave backup mode.
252      */
endBackup(OperationContext * opCtx)253     virtual void endBackup(OperationContext* opCtx) {
254         return;
255     }
256 
257     /**
258      * Recover as much data as possible from a potentially corrupt RecordStore.
259      * This only recovers the record data, not indexes or anything else.
260      *
261      * Generally, this method should not be called directly except by the repairDatabase()
262      * free function.
263      *
264      * NOTE: MMAPv1 does not support this method and has its own repairDatabase() method.
265      */
266     virtual Status repairRecordStore(OperationContext* opCtx, const std::string& ns) = 0;
267 
268     /**
269      * This method will be called before there is a clean shutdown.  Storage engines should
270      * override this method if they have clean-up to do that is different from unclean shutdown.
271      * MongoDB will not call into the storage subsystem after calling this function.
272      *
273      * On error, the storage engine should assert and crash.
274      * There is intentionally no uncleanShutdown().
275      */
276     virtual void cleanShutdown() = 0;
277 
278     /**
279      * Returns the SnapshotManager for this StorageEngine or NULL if not supported.
280      *
281      * Pointer remains owned by the StorageEngine, not the caller.
282      */
getSnapshotManager()283     virtual SnapshotManager* getSnapshotManager() const {
284         return nullptr;
285     }
286 
287     /**
288      * Sets a new JournalListener, which is used by the storage engine to alert the rest of the
289      * system about journaled write progress.
290      */
291     virtual void setJournalListener(JournalListener* jl) = 0;
292 
293     /**
294      * Returns whether the storage engine supports "recover to stable timestamp". Returns false
295      * if the storage engine supports the "recover to stable timestamp" feature but does not have
296      * a stable timestamp, or if for some reason the storage engine is unable to recover to the
297      * last provided stable timestamp.
298      *
299      * It is illegal to call this concurrently with `setStableTimestamp` or
300      * `setInitialDataTimestamp`.
301      */
supportsRecoverToStableTimestamp()302     virtual bool supportsRecoverToStableTimestamp() const {
303         return false;
304     }
305 
306     /**
307      * Recovers the storage engine state to the last stable timestamp. "Stable" in this case
308      * refers to a timestamp that is guaranteed to never be rolled back. The stable timestamp
309      * used should be one provided by StorageEngine::setStableTimestamp().
310      *
311      * The "local" database is exempt and should not roll back any state except for
312      * "local.replset.minvalid" and "local.replset.checkpointTimestamp" which must roll back to
313      * the last stable timestamp.
314      *
315      * fasserts if StorageEngine::supportsRecoverToStableTimestamp() would return false.
316      */
recoverToStableTimestamp()317     virtual Status recoverToStableTimestamp() {
318         fassertFailed(40547);
319     }
320 
321     /**
322      * Sets the highest timestamp at which the storage engine is allowed to take a checkpoint.
323      * This timestamp can never decrease, and thus should be a timestamp that can never roll back.
324      */
setStableTimestamp(Timestamp timestamp)325     virtual void setStableTimestamp(Timestamp timestamp) {}
326 
327     /**
328      * Tells the storage engine the timestamp of the data at startup. This is necessary because
329      * timestamps are not persisted in the storage layer.
330      */
setInitialDataTimestamp(Timestamp timestamp)331     virtual void setInitialDataTimestamp(Timestamp timestamp) {}
332 
333     /**
334      * Sets the oldest timestamp for which the storage engine must maintain snapshot history
335      * through. Additionally, all future writes must be newer or equal to this value.
336      */
setOldestTimestamp(Timestamp timestamp)337     virtual void setOldestTimestamp(Timestamp timestamp) {}
338 
339     /**
340      *  Notifies the storage engine that a replication batch has completed.
341      *  This means that all the writes associated with the oplog entries in the batch are
342      *  finished and no new writes with timestamps associated with those oplog entries will show
343      *  up in the future.
344      *  This function can be used to ensure oplog visibility rules are not broken, for example.
345      */
replicationBatchIsComplete()346     virtual void replicationBatchIsComplete() const {};
347 
348     // (CollectionName, IndexName)
349     typedef std::pair<std::string, std::string> CollectionIndexNamePair;
350 
351     /**
352      * Drop abandoned idents. In the successful case, returns a list of collection, index name
353      * pairs to rebuild.
354      */
reconcileCatalogAndIdents(OperationContext * opCtx)355     virtual StatusWith<std::vector<CollectionIndexNamePair>> reconcileCatalogAndIdents(
356         OperationContext* opCtx) {
357         return std::vector<CollectionIndexNamePair>();
358     };
359 
360     /**
361      * Returns the all committed timestamp. All transactions with timestamps earlier than the
362      * all committed timestamp are committed. Only storage engines that support document level
363      * locking must provide an implementation. Other storage engines may provide a no-op
364      * implementation.
365      */
366     virtual Timestamp getAllCommittedTimestamp() const = 0;
367 
368     /**
369      * Returns the path to the directory which has the data files of database with `dbName`.
370      */
371     virtual std::string getFilesystemPathForDb(const std::string& dbName) const = 0;
372 
373 protected:
374     /**
375      * The destructor will never be called. See cleanShutdown instead.
376      */
~StorageEngine()377     virtual ~StorageEngine() {}
378 };
379 
380 }  // namespace mongo
381