1// Copyright 2015 The etcd Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package mvcc 16 17import ( 18 "bytes" 19 "errors" 20 "fmt" 21 22 "github.com/google/btree" 23 "go.uber.org/zap" 24) 25 26var ( 27 ErrRevisionNotFound = errors.New("mvcc: revision not found") 28) 29 30// keyIndex stores the revisions of a key in the backend. 31// Each keyIndex has at least one key generation. 32// Each generation might have several key versions. 33// Tombstone on a key appends an tombstone version at the end 34// of the current generation and creates a new empty generation. 35// Each version of a key has an index pointing to the backend. 36// 37// For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo" 38// generate a keyIndex: 39// key: "foo" 40// rev: 5 41// generations: 42// {empty} 43// {4.0, 5.0(t)} 44// {1.0, 2.0, 3.0(t)} 45// 46// Compact a keyIndex removes the versions with smaller or equal to 47// rev except the largest one. If the generation becomes empty 48// during compaction, it will be removed. if all the generations get 49// removed, the keyIndex should be removed. 50// 51// For example: 52// compact(2) on the previous example 53// generations: 54// {empty} 55// {4.0, 5.0(t)} 56// {2.0, 3.0(t)} 57// 58// compact(4) 59// generations: 60// {empty} 61// {4.0, 5.0(t)} 62// 63// compact(5): 64// generations: 65// {empty} -> key SHOULD be removed. 66// 67// compact(6): 68// generations: 69// {empty} -> key SHOULD be removed. 70type keyIndex struct { 71 key []byte 72 modified revision // the main rev of the last modification 73 generations []generation 74} 75 76// put puts a revision to the keyIndex. 77func (ki *keyIndex) put(lg *zap.Logger, main int64, sub int64) { 78 rev := revision{main: main, sub: sub} 79 80 if !rev.GreaterThan(ki.modified) { 81 if lg != nil { 82 lg.Panic( 83 "'put' with an unexpected smaller revision", 84 zap.Int64("given-revision-main", rev.main), 85 zap.Int64("given-revision-sub", rev.sub), 86 zap.Int64("modified-revision-main", ki.modified.main), 87 zap.Int64("modified-revision-sub", ki.modified.sub), 88 ) 89 } else { 90 plog.Panicf("store.keyindex: put with unexpected smaller revision [%v / %v]", rev, ki.modified) 91 } 92 } 93 if len(ki.generations) == 0 { 94 ki.generations = append(ki.generations, generation{}) 95 } 96 g := &ki.generations[len(ki.generations)-1] 97 if len(g.revs) == 0 { // create a new key 98 keysGauge.Inc() 99 g.created = rev 100 } 101 g.revs = append(g.revs, rev) 102 g.ver++ 103 ki.modified = rev 104} 105 106func (ki *keyIndex) restore(lg *zap.Logger, created, modified revision, ver int64) { 107 if len(ki.generations) != 0 { 108 if lg != nil { 109 lg.Panic( 110 "'restore' got an unexpected non-empty generations", 111 zap.Int("generations-size", len(ki.generations)), 112 ) 113 } else { 114 plog.Panicf("store.keyindex: cannot restore non-empty keyIndex") 115 } 116 } 117 118 ki.modified = modified 119 g := generation{created: created, ver: ver, revs: []revision{modified}} 120 ki.generations = append(ki.generations, g) 121 keysGauge.Inc() 122} 123 124// tombstone puts a revision, pointing to a tombstone, to the keyIndex. 125// It also creates a new empty generation in the keyIndex. 126// It returns ErrRevisionNotFound when tombstone on an empty generation. 127func (ki *keyIndex) tombstone(lg *zap.Logger, main int64, sub int64) error { 128 if ki.isEmpty() { 129 if lg != nil { 130 lg.Panic( 131 "'tombstone' got an unexpected empty keyIndex", 132 zap.String("key", string(ki.key)), 133 ) 134 } else { 135 plog.Panicf("store.keyindex: unexpected tombstone on empty keyIndex %s", string(ki.key)) 136 } 137 } 138 if ki.generations[len(ki.generations)-1].isEmpty() { 139 return ErrRevisionNotFound 140 } 141 ki.put(lg, main, sub) 142 ki.generations = append(ki.generations, generation{}) 143 keysGauge.Dec() 144 return nil 145} 146 147// get gets the modified, created revision and version of the key that satisfies the given atRev. 148// Rev must be higher than or equal to the given atRev. 149func (ki *keyIndex) get(lg *zap.Logger, atRev int64) (modified, created revision, ver int64, err error) { 150 if ki.isEmpty() { 151 if lg != nil { 152 lg.Panic( 153 "'get' got an unexpected empty keyIndex", 154 zap.String("key", string(ki.key)), 155 ) 156 } else { 157 plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key)) 158 } 159 } 160 g := ki.findGeneration(atRev) 161 if g.isEmpty() { 162 return revision{}, revision{}, 0, ErrRevisionNotFound 163 } 164 165 n := g.walk(func(rev revision) bool { return rev.main > atRev }) 166 if n != -1 { 167 return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil 168 } 169 170 return revision{}, revision{}, 0, ErrRevisionNotFound 171} 172 173// since returns revisions since the given rev. Only the revision with the 174// largest sub revision will be returned if multiple revisions have the same 175// main revision. 176func (ki *keyIndex) since(lg *zap.Logger, rev int64) []revision { 177 if ki.isEmpty() { 178 if lg != nil { 179 lg.Panic( 180 "'since' got an unexpected empty keyIndex", 181 zap.String("key", string(ki.key)), 182 ) 183 } else { 184 plog.Panicf("store.keyindex: unexpected get on empty keyIndex %s", string(ki.key)) 185 } 186 } 187 since := revision{rev, 0} 188 var gi int 189 // find the generations to start checking 190 for gi = len(ki.generations) - 1; gi > 0; gi-- { 191 g := ki.generations[gi] 192 if g.isEmpty() { 193 continue 194 } 195 if since.GreaterThan(g.created) { 196 break 197 } 198 } 199 200 var revs []revision 201 var last int64 202 for ; gi < len(ki.generations); gi++ { 203 for _, r := range ki.generations[gi].revs { 204 if since.GreaterThan(r) { 205 continue 206 } 207 if r.main == last { 208 // replace the revision with a new one that has higher sub value, 209 // because the original one should not be seen by external 210 revs[len(revs)-1] = r 211 continue 212 } 213 revs = append(revs, r) 214 last = r.main 215 } 216 } 217 return revs 218} 219 220// compact compacts a keyIndex by removing the versions with smaller or equal 221// revision than the given atRev except the largest one (If the largest one is 222// a tombstone, it will not be kept). 223// If a generation becomes empty during compaction, it will be removed. 224func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]struct{}) { 225 if ki.isEmpty() { 226 if lg != nil { 227 lg.Panic( 228 "'compact' got an unexpected empty keyIndex", 229 zap.String("key", string(ki.key)), 230 ) 231 } else { 232 plog.Panicf("store.keyindex: unexpected compact on empty keyIndex %s", string(ki.key)) 233 } 234 } 235 236 genIdx, revIndex := ki.doCompact(atRev, available) 237 238 g := &ki.generations[genIdx] 239 if !g.isEmpty() { 240 // remove the previous contents. 241 if revIndex != -1 { 242 g.revs = g.revs[revIndex:] 243 } 244 // remove any tombstone 245 if len(g.revs) == 1 && genIdx != len(ki.generations)-1 { 246 delete(available, g.revs[0]) 247 genIdx++ 248 } 249 } 250 251 // remove the previous generations. 252 ki.generations = ki.generations[genIdx:] 253} 254 255// keep finds the revision to be kept if compact is called at given atRev. 256func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) { 257 if ki.isEmpty() { 258 return 259 } 260 261 genIdx, revIndex := ki.doCompact(atRev, available) 262 g := &ki.generations[genIdx] 263 if !g.isEmpty() { 264 // remove any tombstone 265 if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 { 266 delete(available, g.revs[revIndex]) 267 } 268 } 269} 270 271func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (genIdx int, revIndex int) { 272 // walk until reaching the first revision smaller or equal to "atRev", 273 // and add the revision to the available map 274 f := func(rev revision) bool { 275 if rev.main <= atRev { 276 available[rev] = struct{}{} 277 return false 278 } 279 return true 280 } 281 282 genIdx, g := 0, &ki.generations[0] 283 // find first generation includes atRev or created after atRev 284 for genIdx < len(ki.generations)-1 { 285 if tomb := g.revs[len(g.revs)-1].main; tomb > atRev { 286 break 287 } 288 genIdx++ 289 g = &ki.generations[genIdx] 290 } 291 292 revIndex = g.walk(f) 293 294 return genIdx, revIndex 295} 296 297func (ki *keyIndex) isEmpty() bool { 298 return len(ki.generations) == 1 && ki.generations[0].isEmpty() 299} 300 301// findGeneration finds out the generation of the keyIndex that the 302// given rev belongs to. If the given rev is at the gap of two generations, 303// which means that the key does not exist at the given rev, it returns nil. 304func (ki *keyIndex) findGeneration(rev int64) *generation { 305 lastg := len(ki.generations) - 1 306 cg := lastg 307 308 for cg >= 0 { 309 if len(ki.generations[cg].revs) == 0 { 310 cg-- 311 continue 312 } 313 g := ki.generations[cg] 314 if cg != lastg { 315 if tomb := g.revs[len(g.revs)-1].main; tomb <= rev { 316 return nil 317 } 318 } 319 if g.revs[0].main <= rev { 320 return &ki.generations[cg] 321 } 322 cg-- 323 } 324 return nil 325} 326 327func (ki *keyIndex) Less(b btree.Item) bool { 328 return bytes.Compare(ki.key, b.(*keyIndex).key) == -1 329} 330 331func (ki *keyIndex) equal(b *keyIndex) bool { 332 if !bytes.Equal(ki.key, b.key) { 333 return false 334 } 335 if ki.modified != b.modified { 336 return false 337 } 338 if len(ki.generations) != len(b.generations) { 339 return false 340 } 341 for i := range ki.generations { 342 ag, bg := ki.generations[i], b.generations[i] 343 if !ag.equal(bg) { 344 return false 345 } 346 } 347 return true 348} 349 350func (ki *keyIndex) String() string { 351 var s string 352 for _, g := range ki.generations { 353 s += g.String() 354 } 355 return s 356} 357 358// generation contains multiple revisions of a key. 359type generation struct { 360 ver int64 361 created revision // when the generation is created (put in first revision). 362 revs []revision 363} 364 365func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 } 366 367// walk walks through the revisions in the generation in descending order. 368// It passes the revision to the given function. 369// walk returns until: 1. it finishes walking all pairs 2. the function returns false. 370// walk returns the position at where it stopped. If it stopped after 371// finishing walking, -1 will be returned. 372func (g *generation) walk(f func(rev revision) bool) int { 373 l := len(g.revs) 374 for i := range g.revs { 375 ok := f(g.revs[l-i-1]) 376 if !ok { 377 return l - i - 1 378 } 379 } 380 return -1 381} 382 383func (g *generation) String() string { 384 return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs) 385} 386 387func (g generation) equal(b generation) bool { 388 if g.ver != b.ver { 389 return false 390 } 391 if len(g.revs) != len(b.revs) { 392 return false 393 } 394 395 for i := range g.revs { 396 ar, br := g.revs[i], b.revs[i] 397 if ar != br { 398 return false 399 } 400 } 401 return true 402} 403