1package overlay 2 3import ( 4 "context" 5 "fmt" 6 "net" 7 "sync" 8 "syscall" 9 10 "github.com/docker/libnetwork/common" 11 "github.com/docker/libnetwork/osl" 12 "github.com/sirupsen/logrus" 13) 14 15const ovPeerTable = "overlay_peer_table" 16 17type peerKey struct { 18 peerIP net.IP 19 peerMac net.HardwareAddr 20} 21 22type peerEntry struct { 23 eid string 24 vtep net.IP 25 peerIPMask net.IPMask 26 isLocal bool 27} 28 29func (p *peerEntry) MarshalDB() peerEntryDB { 30 ones, bits := p.peerIPMask.Size() 31 return peerEntryDB{ 32 eid: p.eid, 33 vtep: p.vtep.String(), 34 peerIPMaskOnes: ones, 35 peerIPMaskBits: bits, 36 isLocal: p.isLocal, 37 } 38} 39 40// This the structure saved into the set (SetMatrix), due to the implementation of it 41// the value inserted in the set has to be Hashable so the []byte had to be converted into 42// strings 43type peerEntryDB struct { 44 eid string 45 vtep string 46 peerIPMaskOnes int 47 peerIPMaskBits int 48 isLocal bool 49} 50 51func (p *peerEntryDB) UnMarshalDB() peerEntry { 52 return peerEntry{ 53 eid: p.eid, 54 vtep: net.ParseIP(p.vtep), 55 peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), 56 isLocal: p.isLocal, 57 } 58} 59 60type peerMap struct { 61 // set of peerEntry, note they have to be objects and not pointers to maintain the proper equality checks 62 mp common.SetMatrix 63 sync.Mutex 64} 65 66type peerNetworkMap struct { 67 // map with key peerKey 68 mp map[string]*peerMap 69 sync.Mutex 70} 71 72func (pKey peerKey) String() string { 73 return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac) 74} 75 76func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error { 77 ipB, err := state.Token(true, nil) 78 if err != nil { 79 return err 80 } 81 82 pKey.peerIP = net.ParseIP(string(ipB)) 83 84 macB, err := state.Token(true, nil) 85 if err != nil { 86 return err 87 } 88 89 pKey.peerMac, err = net.ParseMAC(string(macB)) 90 return err 91} 92 93func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error { 94 d.peerDb.Lock() 95 nids := []string{} 96 for nid := range d.peerDb.mp { 97 nids = append(nids, nid) 98 } 99 d.peerDb.Unlock() 100 101 for _, nid := range nids { 102 d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 103 return f(nid, pKey, pEntry) 104 }) 105 } 106 return nil 107} 108 109func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error { 110 d.peerDb.Lock() 111 pMap, ok := d.peerDb.mp[nid] 112 d.peerDb.Unlock() 113 114 if !ok { 115 return nil 116 } 117 118 mp := map[string]peerEntry{} 119 pMap.Lock() 120 for _, pKeyStr := range pMap.mp.Keys() { 121 entryDBList, ok := pMap.mp.Get(pKeyStr) 122 if ok { 123 peerEntryDB := entryDBList[0].(peerEntryDB) 124 mp[pKeyStr] = peerEntryDB.UnMarshalDB() 125 } 126 } 127 pMap.Unlock() 128 129 for pKeyStr, pEntry := range mp { 130 var pKey peerKey 131 if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil { 132 logrus.Warnf("Peer key scan on network %s failed: %v", nid, err) 133 } 134 if f(&pKey, &pEntry) { 135 return nil 136 } 137 } 138 139 return nil 140} 141 142func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) { 143 var pKeyMatched *peerKey 144 var pEntryMatched *peerEntry 145 err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 146 if pKey.peerIP.Equal(peerIP) { 147 pKeyMatched = pKey 148 pEntryMatched = pEntry 149 return true 150 } 151 152 return false 153 }) 154 155 if err != nil { 156 return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) 157 } 158 159 if pKeyMatched == nil || pEntryMatched == nil { 160 return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) 161 } 162 163 return pKeyMatched, pEntryMatched, nil 164} 165 166func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 167 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 168 169 d.peerDb.Lock() 170 pMap, ok := d.peerDb.mp[nid] 171 if !ok { 172 d.peerDb.mp[nid] = &peerMap{ 173 mp: common.NewSetMatrix(), 174 } 175 176 pMap = d.peerDb.mp[nid] 177 } 178 d.peerDb.Unlock() 179 180 pKey := peerKey{ 181 peerIP: peerIP, 182 peerMac: peerMac, 183 } 184 185 pEntry := peerEntry{ 186 eid: eid, 187 vtep: vtep, 188 peerIPMask: peerIPMask, 189 isLocal: isLocal, 190 } 191 192 pMap.Lock() 193 defer pMap.Unlock() 194 b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB()) 195 if i != 1 { 196 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 197 s, _ := pMap.mp.String(pKey.String()) 198 logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 199 } 200 return b, i 201} 202 203func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 204 peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { 205 206 d.peerDb.Lock() 207 pMap, ok := d.peerDb.mp[nid] 208 if !ok { 209 d.peerDb.Unlock() 210 return false, 0 211 } 212 d.peerDb.Unlock() 213 214 pKey := peerKey{ 215 peerIP: peerIP, 216 peerMac: peerMac, 217 } 218 219 pEntry := peerEntry{ 220 eid: eid, 221 vtep: vtep, 222 peerIPMask: peerIPMask, 223 isLocal: isLocal, 224 } 225 226 pMap.Lock() 227 defer pMap.Unlock() 228 b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB()) 229 if i != 0 { 230 // Transient case, there is more than one endpoint that is using the same IP,MAC pair 231 s, _ := pMap.mp.String(pKey.String()) 232 logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) 233 } 234 return b, i 235} 236 237// The overlay uses a lazy initialization approach, this means that when a network is created 238// and the driver registered the overlay does not allocate resources till the moment that a 239// sandbox is actually created. 240// At the moment of this call, that happens when a sandbox is initialized, is possible that 241// networkDB has already delivered some events of peers already available on remote nodes, 242// these peers are saved into the peerDB and this function is used to properly configure 243// the network sandbox with all those peers that got previously notified. 244// Note also that this method sends a single message on the channel and the go routine on the 245// other side, will atomically loop on the whole table of peers and will program their state 246// in one single atomic operation. This is fundamental to guarantee consistency, and avoid that 247// new peerAdd or peerDelete gets reordered during the sandbox init. 248func (d *driver) initSandboxPeerDB(nid string) { 249 d.peerInit(nid) 250} 251 252type peerOperationType int32 253 254const ( 255 peerOperationINIT peerOperationType = iota 256 peerOperationADD 257 peerOperationDELETE 258 peerOperationFLUSH 259) 260 261type peerOperation struct { 262 opType peerOperationType 263 networkID string 264 endpointID string 265 peerIP net.IP 266 peerIPMask net.IPMask 267 peerMac net.HardwareAddr 268 vtepIP net.IP 269 l2Miss bool 270 l3Miss bool 271 localPeer bool 272 callerName string 273} 274 275func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { 276 var err error 277 for { 278 select { 279 case <-ctx.Done(): 280 return 281 case op := <-ch: 282 switch op.opType { 283 case peerOperationINIT: 284 err = d.peerInitOp(op.networkID) 285 case peerOperationADD: 286 err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) 287 case peerOperationDELETE: 288 err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) 289 case peerOperationFLUSH: 290 err = d.peerFlushOp(op.networkID) 291 } 292 if err != nil { 293 logrus.Warnf("Peer operation failed:%s op:%v", err, op) 294 } 295 } 296 } 297} 298 299func (d *driver) peerInit(nid string) { 300 callerName := common.CallerName(1) 301 d.peerOpCh <- &peerOperation{ 302 opType: peerOperationINIT, 303 networkID: nid, 304 callerName: callerName, 305 } 306} 307 308func (d *driver) peerInitOp(nid string) error { 309 return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { 310 // Local entries do not need to be added 311 if pEntry.isLocal { 312 return false 313 } 314 315 d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal) 316 // return false to loop on all entries 317 return false 318 }) 319} 320 321func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 322 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) { 323 d.peerOpCh <- &peerOperation{ 324 opType: peerOperationADD, 325 networkID: nid, 326 endpointID: eid, 327 peerIP: peerIP, 328 peerIPMask: peerIPMask, 329 peerMac: peerMac, 330 vtepIP: vtep, 331 l2Miss: l2Miss, 332 l3Miss: l3Miss, 333 localPeer: localPeer, 334 callerName: common.CallerName(1), 335 } 336} 337 338func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 339 peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error { 340 341 if err := validateID(nid, eid); err != nil { 342 return err 343 } 344 345 var dbEntries int 346 var inserted bool 347 if updateDB { 348 inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 349 if !inserted { 350 logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 351 nid, eid, peerIP, peerMac, localPeer, vtep) 352 } 353 } 354 355 // Local peers do not need any further configuration 356 if localPeer { 357 return nil 358 } 359 360 n := d.network(nid) 361 if n == nil { 362 return nil 363 } 364 365 sbox := n.sandbox() 366 if sbox == nil { 367 // We are hitting this case for all the events that are arriving before that the sandbox 368 // is being created. The peer got already added into the database and the sanbox init will 369 // call the peerDbUpdateSandbox that will configure all these peers from the database 370 return nil 371 } 372 373 IP := &net.IPNet{ 374 IP: peerIP, 375 Mask: peerIPMask, 376 } 377 378 s := n.getSubnetforIP(IP) 379 if s == nil { 380 return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id) 381 } 382 383 if err := n.obtainVxlanID(s); err != nil { 384 return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err) 385 } 386 387 if err := n.joinSubnetSandbox(s, false); err != nil { 388 return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err) 389 } 390 391 if err := d.checkEncryption(nid, vtep, n.vxlanID(s), false, true); err != nil { 392 logrus.Warn(err) 393 } 394 395 // Add neighbor entry for the peer IP 396 if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { 397 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { 398 // We are in the transient case so only the first configuration is programmed into the kernel 399 // Upon deletion if the active configuration is deleted the next one from the database will be restored 400 // Note we are skipping also the next configuration 401 return nil 402 } 403 return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 404 } 405 406 // Add fdb entry to the bridge for the peer mac 407 if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName), 408 sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil { 409 return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 410 } 411 412 return nil 413} 414 415func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 416 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { 417 d.peerOpCh <- &peerOperation{ 418 opType: peerOperationDELETE, 419 networkID: nid, 420 endpointID: eid, 421 peerIP: peerIP, 422 peerIPMask: peerIPMask, 423 peerMac: peerMac, 424 vtepIP: vtep, 425 callerName: common.CallerName(1), 426 localPeer: localPeer, 427 } 428} 429 430func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, 431 peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error { 432 433 if err := validateID(nid, eid); err != nil { 434 return err 435 } 436 437 deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) 438 if !deleted { 439 logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", 440 nid, eid, peerIP, peerMac, localPeer, vtep) 441 } 442 443 n := d.network(nid) 444 if n == nil { 445 return nil 446 } 447 448 sbox := n.sandbox() 449 if sbox == nil { 450 return nil 451 } 452 453 if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil { 454 logrus.Warn(err) 455 } 456 457 // Local peers do not have any local configuration to delete 458 if !localPeer { 459 // Remove fdb entry to the bridge for the peer mac 460 if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { 461 if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { 462 // We fall in here if there is a transient state and if the neighbor that is being deleted 463 // was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping) 464 return nil 465 } 466 return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 467 } 468 469 // Delete neighbor entry for the peer IP 470 if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { 471 return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) 472 } 473 } 474 475 if dbEntries == 0 { 476 return nil 477 } 478 479 // If there is still an entry into the database and the deletion went through without errors means that there is now no 480 // configuration active in the kernel. 481 // Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one 482 peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP) 483 if err != nil { 484 logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) 485 return err 486 } 487 return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) 488} 489 490func (d *driver) peerFlush(nid string) { 491 d.peerOpCh <- &peerOperation{ 492 opType: peerOperationFLUSH, 493 networkID: nid, 494 callerName: common.CallerName(1), 495 } 496} 497 498func (d *driver) peerFlushOp(nid string) error { 499 d.peerDb.Lock() 500 defer d.peerDb.Unlock() 501 _, ok := d.peerDb.mp[nid] 502 if !ok { 503 return fmt.Errorf("Unable to find the peerDB for nid:%s", nid) 504 } 505 delete(d.peerDb.mp, nid) 506 return nil 507} 508 509func (d *driver) pushLocalDb() { 510 d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool { 511 if pEntry.isLocal { 512 d.pushLocalEndpointEvent("join", nid, pEntry.eid) 513 } 514 return false 515 }) 516} 517 518func (d *driver) peerDBUpdateSelf() { 519 d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool { 520 if pEntry.isLocal { 521 pEntry.vtep = net.ParseIP(d.advertiseAddress) 522 } 523 return false 524 }) 525} 526