// storage_engine.h


/**
 * Copyright (C) 2018-present MongoDB, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 * As a special exception, the copyright holders give permission to link the
 * code of portions of this program with the OpenSSL library under certain
 * conditions as described in each individual source file and distribute
 * linked combinations including the program with the OpenSSL library. You
 * must comply with the Server Side Public License in all respects for
 * all of the code used other than as permitted herein. If you modify file(s)
 * with this exception, you may extend this exception to your version of the
 * file(s), but you are not obligated to do so. If you do not wish to do so,
 * delete this exception statement from your version. If you delete this
 * exception statement from all source files in the program, then also delete
 * it in the license file.
 */
31 */ 32 33 #pragma once 34 35 #include <string> 36 #include <vector> 37 38 #include "mongo/base/status.h" 39 #include "mongo/bson/bsonobj.h" 40 #include "mongo/bson/timestamp.h" 41 #include "mongo/util/mongoutils/str.h" 42 43 namespace mongo { 44 45 class DatabaseCatalogEntry; 46 class JournalListener; 47 class OperationContext; 48 class RecoveryUnit; 49 class SnapshotManager; 50 struct StorageGlobalParams; 51 class StorageEngineLockFile; 52 class StorageEngineMetadata; 53 54 /** 55 * The StorageEngine class is the top level interface for creating a new storage 56 * engine. All StorageEngine(s) must be registered by calling registerFactory in order 57 * to possibly be activated. 58 */ 59 class StorageEngine { 60 public: 61 /** 62 * The interface for creating new instances of storage engines. 63 * 64 * A storage engine provides an instance of this class (along with an associated 65 * name) to the global environment, which then sets the global storage engine 66 * according to the provided configuration parameter. 67 */ 68 class Factory { 69 public: ~Factory()70 virtual ~Factory() {} 71 72 /** 73 * Return a new instance of the StorageEngine. The lockFile parameter may be null if 74 * params.readOnly is set. Caller owns the returned pointer. 75 */ 76 virtual StorageEngine* create(const StorageGlobalParams& params, 77 const StorageEngineLockFile* lockFile) const = 0; 78 79 /** 80 * Returns the name of the storage engine. 81 * 82 * Implementations that change the value of the returned string can cause 83 * data file incompatibilities. 84 */ 85 virtual StringData getCanonicalName() const = 0; 86 87 /** 88 * Validates creation options for a collection in the StorageEngine. 89 * Returns an error if the creation options are not valid. 90 * 91 * Default implementation only accepts empty objects (no options). 
92 */ validateCollectionStorageOptions(const BSONObj & options)93 virtual Status validateCollectionStorageOptions(const BSONObj& options) const { 94 if (options.isEmpty()) 95 return Status::OK(); 96 return Status(ErrorCodes::InvalidOptions, 97 str::stream() << "storage engine " << getCanonicalName() 98 << " does not support any collection storage options"); 99 } 100 101 /** 102 * Validates creation options for an index in the StorageEngine. 103 * Returns an error if the creation options are not valid. 104 * 105 * Default implementation only accepts empty objects (no options). 106 */ validateIndexStorageOptions(const BSONObj & options)107 virtual Status validateIndexStorageOptions(const BSONObj& options) const { 108 if (options.isEmpty()) 109 return Status::OK(); 110 return Status(ErrorCodes::InvalidOptions, 111 str::stream() << "storage engine " << getCanonicalName() 112 << " does not support any index storage options"); 113 } 114 115 /** 116 * Validates existing metadata in the data directory against startup options. 117 * Returns an error if the storage engine initialization should not proceed 118 * due to any inconsistencies between the current startup options and the creation 119 * options stored in the metadata. 120 */ 121 virtual Status validateMetadata(const StorageEngineMetadata& metadata, 122 const StorageGlobalParams& params) const = 0; 123 124 /** 125 * Returns a new document suitable for storing in the data directory metadata. 126 * This document will be used by validateMetadata() to check startup options 127 * on restart. 128 */ 129 virtual BSONObj createMetadataOptions(const StorageGlobalParams& params) const = 0; 130 131 /** 132 * Returns whether the engine supports read-only mode. If read-only mode is enabled, the 133 * engine may be started on a read-only filesystem (either mounted read-only or with 134 * read-only permissions). If readOnly mode is enabled, it is undefined behavior to call 135 * methods that write data (e.g. insertRecord). 
This method is provided on the Factory 136 * because it must be called before the storageEngine is instantiated. 137 */ supportsReadOnly()138 virtual bool supportsReadOnly() const { 139 return false; 140 } 141 }; 142 143 /** 144 * Called after the globalStorageEngine pointer has been set up, before any other methods 145 * are called. Any initialization work that requires the ability to create OperationContexts 146 * should be done here rather than in the constructor. 147 */ finishInit()148 virtual void finishInit() {} 149 150 /** 151 * Returns a new interface to the storage engine's recovery unit. The recovery 152 * unit is the durability interface. For details, see recovery_unit.h 153 * 154 * Caller owns the returned pointer. 155 */ 156 virtual RecoveryUnit* newRecoveryUnit() = 0; 157 158 /** 159 * List the databases stored in this storage engine. 160 * 161 * XXX: why doesn't this take OpCtx? 162 */ 163 virtual void listDatabases(std::vector<std::string>* out) const = 0; 164 165 /** 166 * Return the DatabaseCatalogEntry that describes the database indicated by 'db'. 167 * 168 * StorageEngine owns returned pointer. 169 * It should not be deleted by any caller. 170 */ 171 virtual DatabaseCatalogEntry* getDatabaseCatalogEntry(OperationContext* opCtx, 172 StringData db) = 0; 173 174 /** 175 * Returns whether the storage engine supports its own locking locking below the collection 176 * level. If the engine returns true, MongoDB will acquire intent locks down to the 177 * collection level and will assume that the engine will ensure consistency at the level of 178 * documents. If false, MongoDB will lock the entire collection in Shared/Exclusive mode 179 * for read/write operations respectively. 180 */ 181 virtual bool supportsDocLocking() const = 0; 182 183 /** 184 * Returns whether the storage engine supports locking at a database level. 
185 */ supportsDBLocking()186 virtual bool supportsDBLocking() const { 187 return true; 188 } 189 190 /** 191 * Returns whether the engine supports a journalling concept or not. 192 */ 193 virtual bool isDurable() const = 0; 194 195 /** 196 * Returns true if the engine does not persist data to disk; false otherwise. 197 */ 198 virtual bool isEphemeral() const = 0; 199 200 /** 201 * Only MMAPv1 should override this and return true to trigger MMAPv1-specific behavior. 202 */ isMmapV1()203 virtual bool isMmapV1() const { 204 return false; 205 } 206 207 /** 208 * Closes all file handles associated with a database. 209 */ 210 virtual Status closeDatabase(OperationContext* opCtx, StringData db) = 0; 211 212 /** 213 * Deletes all data and metadata for a database. 214 */ 215 virtual Status dropDatabase(OperationContext* opCtx, StringData db) = 0; 216 217 /** 218 * @return number of files flushed 219 */ 220 virtual int flushAllFiles(OperationContext* opCtx, bool sync) = 0; 221 222 /** 223 * Transitions the storage engine into backup mode. 224 * 225 * During backup mode the storage engine must stabilize its on-disk files, and avoid 226 * any internal processing that may involve file I/O, such as online compaction, so 227 * a filesystem level backup may be performed. 228 * 229 * Storage engines that do not support this feature should use the default implementation. 230 * Storage engines that implement this must also implement endBackup(). 231 * 232 * For Storage engines that implement beginBackup the _inBackupMode variable is provided 233 * to avoid multiple instance enterting/leaving backup concurrently. 234 * 235 * If this function returns an OK status, MongoDB can call endBackup to signal the storage 236 * engine that filesystem writes may continue. This function should return a non-OK status if 237 * filesystem changes cannot be stopped to allow for online backup. If the function should be 238 * retried, returns a non-OK status. 
This function may throw a WriteConflictException, which 239 * should trigger a retry by the caller. All other exceptions should be treated as errors. 240 */ beginBackup(OperationContext * opCtx)241 virtual Status beginBackup(OperationContext* opCtx) { 242 return Status(ErrorCodes::CommandNotSupported, 243 "The current storage engine doesn't support backup mode"); 244 } 245 246 /** 247 * Transitions the storage engine out of backup mode. 248 * 249 * Storage engines that do not support this feature should use the default implementation. 250 * 251 * Storage engines implementing this feature should fassert when unable to leave backup mode. 252 */ endBackup(OperationContext * opCtx)253 virtual void endBackup(OperationContext* opCtx) { 254 return; 255 } 256 257 /** 258 * Recover as much data as possible from a potentially corrupt RecordStore. 259 * This only recovers the record data, not indexes or anything else. 260 * 261 * Generally, this method should not be called directly except by the repairDatabase() 262 * free function. 263 * 264 * NOTE: MMAPv1 does not support this method and has its own repairDatabase() method. 265 */ 266 virtual Status repairRecordStore(OperationContext* opCtx, const std::string& ns) = 0; 267 268 /** 269 * This method will be called before there is a clean shutdown. Storage engines should 270 * override this method if they have clean-up to do that is different from unclean shutdown. 271 * MongoDB will not call into the storage subsystem after calling this function. 272 * 273 * On error, the storage engine should assert and crash. 274 * There is intentionally no uncleanShutdown(). 275 */ 276 virtual void cleanShutdown() = 0; 277 278 /** 279 * Returns the SnapshotManager for this StorageEngine or NULL if not supported. 280 * 281 * Pointer remains owned by the StorageEngine, not the caller. 
282 */ getSnapshotManager()283 virtual SnapshotManager* getSnapshotManager() const { 284 return nullptr; 285 } 286 287 /** 288 * Sets a new JournalListener, which is used by the storage engine to alert the rest of the 289 * system about journaled write progress. 290 */ 291 virtual void setJournalListener(JournalListener* jl) = 0; 292 293 /** 294 * Returns whether the storage engine supports "recover to stable timestamp". Returns false 295 * if the storage engine supports the "recover to stable timestamp" feature but does not have 296 * a stable timestamp, or if for some reason the storage engine is unable to recover to the 297 * last provided stable timestamp. 298 * 299 * It is illegal to call this concurrently with `setStableTimestamp` or 300 * `setInitialDataTimestamp`. 301 */ supportsRecoverToStableTimestamp()302 virtual bool supportsRecoverToStableTimestamp() const { 303 return false; 304 } 305 306 /** 307 * Recovers the storage engine state to the last stable timestamp. "Stable" in this case 308 * refers to a timestamp that is guaranteed to never be rolled back. The stable timestamp 309 * used should be one provided by StorageEngine::setStableTimestamp(). 310 * 311 * The "local" database is exempt and should not roll back any state except for 312 * "local.replset.minvalid" and "local.replset.checkpointTimestamp" which must roll back to 313 * the last stable timestamp. 314 * 315 * fasserts if StorageEngine::supportsRecoverToStableTimestamp() would return false. 316 */ recoverToStableTimestamp()317 virtual Status recoverToStableTimestamp() { 318 fassertFailed(40547); 319 } 320 321 /** 322 * Sets the highest timestamp at which the storage engine is allowed to take a checkpoint. 323 * This timestamp can never decrease, and thus should be a timestamp that can never roll back. 324 */ setStableTimestamp(Timestamp timestamp)325 virtual void setStableTimestamp(Timestamp timestamp) {} 326 327 /** 328 * Tells the storage engine the timestamp of the data at startup. 
This is necessary because 329 * timestamps are not persisted in the storage layer. 330 */ setInitialDataTimestamp(Timestamp timestamp)331 virtual void setInitialDataTimestamp(Timestamp timestamp) {} 332 333 /** 334 * Sets the oldest timestamp for which the storage engine must maintain snapshot history 335 * through. Additionally, all future writes must be newer or equal to this value. 336 */ setOldestTimestamp(Timestamp timestamp)337 virtual void setOldestTimestamp(Timestamp timestamp) {} 338 339 /** 340 * Notifies the storage engine that a replication batch has completed. 341 * This means that all the writes associated with the oplog entries in the batch are 342 * finished and no new writes with timestamps associated with those oplog entries will show 343 * up in the future. 344 * This function can be used to ensure oplog visibility rules are not broken, for example. 345 */ replicationBatchIsComplete()346 virtual void replicationBatchIsComplete() const {}; 347 348 // (CollectionName, IndexName) 349 typedef std::pair<std::string, std::string> CollectionIndexNamePair; 350 351 /** 352 * Drop abandoned idents. In the successful case, returns a list of collection, index name 353 * pairs to rebuild. 354 */ reconcileCatalogAndIdents(OperationContext * opCtx)355 virtual StatusWith<std::vector<CollectionIndexNamePair>> reconcileCatalogAndIdents( 356 OperationContext* opCtx) { 357 return std::vector<CollectionIndexNamePair>(); 358 }; 359 360 /** 361 * Returns the all committed timestamp. All transactions with timestamps earlier than the 362 * all committed timestamp are committed. Only storage engines that support document level 363 * locking must provide an implementation. Other storage engines may provide a no-op 364 * implementation. 365 */ 366 virtual Timestamp getAllCommittedTimestamp() const = 0; 367 368 /** 369 * Returns the path to the directory which has the data files of database with `dbName`. 
370 */ 371 virtual std::string getFilesystemPathForDb(const std::string& dbName) const = 0; 372 373 protected: 374 /** 375 * The destructor will never be called. See cleanShutdown instead. 376 */ ~StorageEngine()377 virtual ~StorageEngine() {} 378 }; 379 380 } // namespace mongo 381