1 
2 /**
3  *    Copyright (C) 2018-present MongoDB, Inc.
4  *
5  *    This program is free software: you can redistribute it and/or modify
6  *    it under the terms of the Server Side Public License, version 1,
7  *    as published by MongoDB, Inc.
8  *
9  *    This program is distributed in the hope that it will be useful,
10  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *    Server Side Public License for more details.
13  *
14  *    You should have received a copy of the Server Side Public License
15  *    along with this program. If not, see
16  *    <http://www.mongodb.com/licensing/server-side-public-license>.
17  *
18  *    As a special exception, the copyright holders give permission to link the
19  *    code of portions of this program with the OpenSSL library under certain
20  *    conditions as described in each individual source file and distribute
21  *    linked combinations including the program with the OpenSSL library. You
22  *    must comply with the Server Side Public License in all respects for
23  *    all of the code used other than as permitted herein. If you modify file(s)
24  *    with this exception, you may extend this exception to your version of the
25  *    file(s), but you are not obligated to do so. If you do not wish to do so,
26  *    delete this exception statement from your version. If you delete this
27  *    exception statement from all source files in the program, then also delete
28  *    it in the license file.
29  */
30 
31 #pragma once
32 
33 
34 #include "mongo/base/owned_pointer_vector.h"
35 #include "mongo/bson/timestamp.h"
36 #include "mongo/db/exec/collection_scan.h"
37 #include "mongo/db/exec/plan_stage.h"
38 #include "mongo/db/matcher/expression_leaf.h"
39 #include "mongo/db/record_id.h"
40 #include "mongo/util/timer.h"
41 
42 namespace mongo {
43 
44 class RecordCursor;
45 
46 /**
47  * OplogStart walks a collection backwards to find the first object in the collection that matches
48  * the timestamp.  It's used by replication to efficiently find where the oplog should be replayed
49  * from.
50  *
51  * The oplog is always a capped collection.  In capped collections, documents are oriented on disk
52  * according to insertion order.  The oplog inserts documents with increasing timestamps.  Queries
53  * on the oplog look for entries that are after a certain time.  Therefore if we navigate backwards,
54  * the first document we encounter that is less than or equal to the timestamp is the first document
55  * we should scan.
56  *
57  * Why isn't this a normal reverse table scan, you may ask?  We could be correct if we used a
58  * normal reverse collection scan.  However, that's not fast enough.  Since we know all
59  * documents are oriented on disk in insertion order, we know all documents in one extent were
60  * inserted before documents in a subsequent extent.  As such we can skip through entire extents
61  * looking only at the first document.
62  *
63  * Why is this a stage?  Because we want to yield, and we want to be notified of RecordId
64  * invalidations.  :(
65  */
66 class OplogStart final : public PlanStage {
67 public:
68     // Does not take ownership.
69     OplogStart(OperationContext* opCtx,
70                const Collection* collection,
71                Timestamp timestamp,
72                WorkingSet* ws);
73 
74     StageState doWork(WorkingSetID* out) final;
75     bool isEOF() final;
76 
77     void doInvalidate(OperationContext* opCtx, const RecordId& dl, InvalidationType type) final;
78     void doSaveState() final;
79     void doRestoreState() final;
80     void doDetachFromOperationContext() final;
81     void doReattachToOperationContext() final;
82 
83     // Returns empty PlanStageStats object
84     std::unique_ptr<PlanStageStats> getStats() final;
85 
86     //
87     // Exec stats -- do not call for the oplog start stage.
88     //
89 
getSpecificStats()90     const SpecificStats* getSpecificStats() const final {
91         return NULL;
92     }
93 
stageType()94     StageType stageType() const final {
95         return STAGE_OPLOG_START;
96     }
97 
98     // For testing only.
setBackwardsScanTime(int newTime)99     void setBackwardsScanTime(int newTime) {
100         _backwardsScanTime = newTime;
101     }
isExtentHopping()102     bool isExtentHopping() {
103         return _extentHopping;
104     }
isBackwardsScanning()105     bool isBackwardsScanning() {
106         return _backwardsScanning;
107     }
108 
109     static const char* kStageType;
110 
111 private:
112     StageState workBackwardsScan(WorkingSetID* out);
113 
114     void switchToExtentHopping();
115 
116     StageState workExtentHopping(WorkingSetID* out);
117 
118     // This is only used for the extent hopping scan.
119     std::vector<std::unique_ptr<RecordCursor>> _subIterators;
120 
121     // Have we done our heavy init yet?
122     bool _needInit;
123 
124     // Our first state: going backwards via a collscan.
125     bool _backwardsScanning;
126 
127     // Our second state: hopping backwards extent by extent.
128     bool _extentHopping;
129 
130     // Our final state: done.
131     bool _done;
132 
133     const Collection* _collection;
134 
135     // We only go backwards via a collscan for a few seconds.
136     Timer _timer;
137 
138     // WorkingSet is not owned by us.
139     WorkingSet* _workingSet;
140 
141     std::string _ns;
142 
143     // '_filter' matches documents whose "ts" field is less than or equal to 'timestamp'. Once we
144     // have found a document matching '_filter', we know that we're at or behind the starting point
145     // and can start scanning forwards again.
146     BSONObj _filterBSON;
147     LTEMatchExpression _filter;
148 
149     static int _backwardsScanTime;
150 };
151 
152 }  // namespace mongo
153