1package overlay
2
3import (
4	"context"
5	"fmt"
6	"net"
7	"sync"
8	"syscall"
9
10	"github.com/docker/libnetwork/internal/caller"
11	"github.com/docker/libnetwork/internal/setmatrix"
12	"github.com/docker/libnetwork/osl"
13	"github.com/sirupsen/logrus"
14)
15
// ovPeerTable is the table name "overlay_peer_table".
// NOTE(review): it is not referenced in this part of the file; presumably it names the
// networkDB table that carries overlay peer records — confirm against its users.
const ovPeerTable = "overlay_peer_table"
17
// peerKey identifies a peer by its <IP, MAC> pair. Its String form is what peerDbAdd and
// peerDbDelete use as the key of the per-network peer set, and Scan parses it back.
type peerKey struct {
	peerIP  net.IP
	peerMac net.HardwareAddr
}
22
// peerEntry is the in-memory description of a peer as handled by the driver.
type peerEntry struct {
	// eid is the ID of the endpoint the peer belongs to.
	eid string
	// vtep is the vtep address recorded for the peer.
	vtep net.IP
	// peerIPMask is the mask that goes with the peer IP.
	peerIPMask net.IPMask
	// isLocal records whether the peer was added as a local peer.
	isLocal bool
}

// MarshalDB converts the entry into its comparable peerEntryDB form: the vtep becomes its
// textual representation and the mask is reduced to its (ones, bits) size.
func (p *peerEntry) MarshalDB() peerEntryDB {
	db := peerEntryDB{
		eid:     p.eid,
		vtep:    p.vtep.String(),
		isLocal: p.isLocal,
	}
	db.peerIPMaskOnes, db.peerIPMaskBits = p.peerIPMask.Size()
	return db
}

// peerEntryDB is the structure saved into the set (SetMatrix). Because of the set
// implementation the inserted value has to be hashable, so the []byte based fields of
// peerEntry are stored as a string and a pair of ints instead.
type peerEntryDB struct {
	eid            string
	vtep           string
	peerIPMaskOnes int
	peerIPMaskBits int
	isLocal        bool
}

// UnMarshalDB rebuilds a peerEntry from its stored form by parsing the vtep string and
// regenerating the mask from its (ones, bits) size.
func (p *peerEntryDB) UnMarshalDB() peerEntry {
	var entry peerEntry
	entry.eid = p.eid
	entry.vtep = net.ParseIP(p.vtep)
	entry.peerIPMask = net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits)
	entry.isLocal = p.isLocal
	return entry
}
60
// peerMap holds the peers recorded for a single network.
type peerMap struct {
	// mp is keyed by peerKey.String(); each key maps to a set of peerEntryDB values (what
	// peerDbAdd inserts). Note they have to be objects and not pointers to maintain the
	// proper equality checks.
	mp setmatrix.SetMatrix
	// The embedded mutex is taken around every access to mp in this file.
	sync.Mutex
}
66
// peerNetworkMap groups one peerMap per network.
// NOTE(review): d.peerDb, which this file indexes by network ID under its lock, is
// presumably of this type — its declaration is not visible here.
type peerNetworkMap struct {
	// mp maps a network ID to that network's peerMap. (The peerKey string is the key used
	// inside each peerMap, not the key of this map.)
	mp map[string]*peerMap
	// The embedded mutex guards mp.
	sync.Mutex
}
72
73func (pKey peerKey) String() string {
74	return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac)
75}
76
77func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
78	ipB, err := state.Token(true, nil)
79	if err != nil {
80		return err
81	}
82
83	pKey.peerIP = net.ParseIP(string(ipB))
84
85	macB, err := state.Token(true, nil)
86	if err != nil {
87		return err
88	}
89
90	pKey.peerMac, err = net.ParseMAC(string(macB))
91	return err
92}
93
94func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
95	d.peerDb.Lock()
96	nids := []string{}
97	for nid := range d.peerDb.mp {
98		nids = append(nids, nid)
99	}
100	d.peerDb.Unlock()
101
102	for _, nid := range nids {
103		d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
104			return f(nid, pKey, pEntry)
105		})
106	}
107	return nil
108}
109
110func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error {
111	d.peerDb.Lock()
112	pMap, ok := d.peerDb.mp[nid]
113	d.peerDb.Unlock()
114
115	if !ok {
116		return nil
117	}
118
119	mp := map[string]peerEntry{}
120	pMap.Lock()
121	for _, pKeyStr := range pMap.mp.Keys() {
122		entryDBList, ok := pMap.mp.Get(pKeyStr)
123		if ok {
124			peerEntryDB := entryDBList[0].(peerEntryDB)
125			mp[pKeyStr] = peerEntryDB.UnMarshalDB()
126		}
127	}
128	pMap.Unlock()
129
130	for pKeyStr, pEntry := range mp {
131		var pKey peerKey
132		if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil {
133			logrus.Warnf("Peer key scan on network %s failed: %v", nid, err)
134		}
135		if f(&pKey, &pEntry) {
136			return nil
137		}
138	}
139
140	return nil
141}
142
143func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
144	var pKeyMatched *peerKey
145	var pEntryMatched *peerEntry
146	err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
147		if pKey.peerIP.Equal(peerIP) {
148			pKeyMatched = pKey
149			pEntryMatched = pEntry
150			return true
151		}
152
153		return false
154	})
155
156	if err != nil {
157		return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
158	}
159
160	if pKeyMatched == nil || pEntryMatched == nil {
161		return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
162	}
163
164	return pKeyMatched, pEntryMatched, nil
165}
166
167func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
168	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
169
170	d.peerDb.Lock()
171	pMap, ok := d.peerDb.mp[nid]
172	if !ok {
173		d.peerDb.mp[nid] = &peerMap{
174			mp: setmatrix.NewSetMatrix(),
175		}
176
177		pMap = d.peerDb.mp[nid]
178	}
179	d.peerDb.Unlock()
180
181	pKey := peerKey{
182		peerIP:  peerIP,
183		peerMac: peerMac,
184	}
185
186	pEntry := peerEntry{
187		eid:        eid,
188		vtep:       vtep,
189		peerIPMask: peerIPMask,
190		isLocal:    isLocal,
191	}
192
193	pMap.Lock()
194	defer pMap.Unlock()
195	b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
196	if i != 1 {
197		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
198		s, _ := pMap.mp.String(pKey.String())
199		logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
200	}
201	return b, i
202}
203
204func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
205	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
206
207	d.peerDb.Lock()
208	pMap, ok := d.peerDb.mp[nid]
209	if !ok {
210		d.peerDb.Unlock()
211		return false, 0
212	}
213	d.peerDb.Unlock()
214
215	pKey := peerKey{
216		peerIP:  peerIP,
217		peerMac: peerMac,
218	}
219
220	pEntry := peerEntry{
221		eid:        eid,
222		vtep:       vtep,
223		peerIPMask: peerIPMask,
224		isLocal:    isLocal,
225	}
226
227	pMap.Lock()
228	defer pMap.Unlock()
229	b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
230	if i != 0 {
231		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
232		s, _ := pMap.mp.String(pKey.String())
233		logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
234	}
235	return b, i
236}
237
// initSandboxPeerDB programs into the sandbox of network nid the peers that are already
// recorded in the peer database.
//
// The overlay uses a lazy initialization approach: when a network is created and the
// driver is registered, the overlay does not allocate resources until a sandbox is
// actually created.
// By the time this is called (when a sandbox is initialized) networkDB may already have
// delivered events for peers available on remote nodes. Those peers were saved into the
// peerDB, and this function is used to configure the network sandbox with all the peers
// that were previously notified.
// Note also that this method sends a single message on the operation channel; the
// goroutine on the other side loops over the whole table of peers and programs their
// state while handling that one message. This is fundamental to guarantee consistency and
// to avoid that a new peerAdd or peerDelete gets reordered during the sandbox init.
func (d *driver) initSandboxPeerDB(nid string) {
	d.peerInit(nid)
}
252
// peerOperationType identifies the kind of work carried by a peerOperation.
type peerOperationType int32

// Operation types understood by peerOpRoutine.
const (
	// peerOperationINIT is dispatched to peerInitOp.
	peerOperationINIT peerOperationType = iota
	// peerOperationADD is dispatched to peerAddOp.
	peerOperationADD
	// peerOperationDELETE is dispatched to peerDeleteOp.
	peerOperationDELETE
	// peerOperationFLUSH is dispatched to peerFlushOp.
	peerOperationFLUSH
)
261
// peerOperation is the message sent on the driver's peer operation channel and handled by
// peerOpRoutine. Which fields are meaningful depends on opType: INIT and FLUSH only set
// networkID (and callerName), ADD and DELETE also describe the peer.
type peerOperation struct {
	// opType selects the handler run by peerOpRoutine.
	opType peerOperationType
	// networkID and endpointID identify the network and the endpoint of the peer.
	networkID  string
	endpointID string
	// peerIP, peerIPMask and peerMac describe the peer address.
	peerIP     net.IP
	peerIPMask net.IPMask
	peerMac    net.HardwareAddr
	// vtepIP is passed to the handlers as the peer's vtep.
	vtepIP net.IP
	// l2Miss and l3Miss are only set by peerAdd and forwarded to peerAddOp.
	l2Miss bool
	l3Miss bool
	// localPeer marks the peer as local.
	localPeer bool
	// callerName is filled with caller.Name(1) by the function that queued the
	// operation; in this file it only shows up when the whole operation is logged.
	callerName string
}
275
276func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
277	var err error
278	for {
279		select {
280		case <-ctx.Done():
281			return
282		case op := <-ch:
283			switch op.opType {
284			case peerOperationINIT:
285				err = d.peerInitOp(op.networkID)
286			case peerOperationADD:
287				err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
288			case peerOperationDELETE:
289				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
290			case peerOperationFLUSH:
291				err = d.peerFlushOp(op.networkID)
292			}
293			if err != nil {
294				logrus.Warnf("Peer operation failed:%s op:%v", err, op)
295			}
296		}
297	}
298}
299
300func (d *driver) peerInit(nid string) {
301	callerName := caller.Name(1)
302	d.peerOpCh <- &peerOperation{
303		opType:     peerOperationINIT,
304		networkID:  nid,
305		callerName: callerName,
306	}
307}
308
309func (d *driver) peerInitOp(nid string) error {
310	return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
311		// Local entries do not need to be added
312		if pEntry.isLocal {
313			return false
314		}
315
316		d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal)
317		// return false to loop on all entries
318		return false
319	})
320}
321
322func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
323	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
324	d.peerOpCh <- &peerOperation{
325		opType:     peerOperationADD,
326		networkID:  nid,
327		endpointID: eid,
328		peerIP:     peerIP,
329		peerIPMask: peerIPMask,
330		peerMac:    peerMac,
331		vtepIP:     vtep,
332		l2Miss:     l2Miss,
333		l3Miss:     l3Miss,
334		localPeer:  localPeer,
335		callerName: caller.Name(1),
336	}
337}
338
// peerAddOp records a peer and, when possible, programs it into the network sandbox.
//
// When updateDB is true the peer is first inserted into the peer database (an entry that
// is already present is only logged). Local peers need nothing more. For the other peers
// the function returns nil without further action when the network or its sandbox does
// not exist; otherwise it resolves the subnet of the peer IP, obtains the vxlan ID, joins
// the subnet sandbox, runs checkEncryption (a failure there is only logged) and finally
// adds the neighbor entry for the peer IP and the fdb entry for the peer MAC.
//
// l3Miss is forwarded to the neighbor entry and l2Miss to the fdb entry.
func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {

	if err := validateID(nid, eid); err != nil {
		return err
	}

	// dbEntries stays 0 when the database is not updated by this call.
	var dbEntries int
	var inserted bool
	if updateDB {
		inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
		if !inserted {
			logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
				nid, eid, peerIP, peerMac, localPeer, vtep)
		}
	}

	// Local peers do not need any further configuration
	if localPeer {
		return nil
	}

	n := d.network(nid)
	if n == nil {
		return nil
	}

	sbox := n.sandbox()
	if sbox == nil {
		// We hit this case for all the events that arrive before the sandbox is created.
		// The peer has already been added to the database (when updateDB is set), and the
		// sandbox init (initSandboxPeerDB -> peerInitOp) will configure all these peers
		// from the database.
		return nil
	}

	IP := &net.IPNet{
		IP:   peerIP,
		Mask: peerIPMask,
	}

	s := n.getSubnetforIP(IP)
	if s == nil {
		return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id)
	}

	if err := n.obtainVxlanID(s); err != nil {
		return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
	}

	if err := n.joinSandbox(s, false, false); err != nil {
		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
	}

	// An encryption check failure is logged but does not abort the peer configuration.
	if err := d.checkEncryption(nid, vtep, n.vxlanID(s), false, true); err != nil {
		logrus.Warn(err)
	}

	// Add neighbor entry for the peer IP
	if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
		if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
			// We are in the transient case, so only the first configuration is programmed into the kernel.
			// Upon deletion, if the active configuration is deleted, the next one from the database will be restored.
			// Note that we also skip the fdb configuration below.
			return nil
		}
		return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
	}

	// Add fdb entry to the bridge for the peer mac
	if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
		sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
		return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
	}

	return nil
}
415
416func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
417	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
418	d.peerOpCh <- &peerOperation{
419		opType:     peerOperationDELETE,
420		networkID:  nid,
421		endpointID: eid,
422		peerIP:     peerIP,
423		peerIPMask: peerIPMask,
424		peerMac:    peerMac,
425		vtepIP:     vtep,
426		callerName: caller.Name(1),
427		localPeer:  localPeer,
428	}
429}
430
// peerDeleteOp removes a peer from the peer database and, when the network has a
// sandbox, from the sandbox as well.
//
// The database entry is removed first (a missing entry is only logged). The function then
// returns nil when the network or its sandbox does not exist. Otherwise it runs
// checkEncryption (a failure there is only logged) and, for non-local peers, deletes the
// fdb entry and then the neighbor entry. If other entries remain in the database under
// the same <ip,mac> key, one configuration is looked up again and re-programmed through
// peerAddOp without updating the database.
func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {

	if err := validateID(nid, eid); err != nil {
		return err
	}

	// dbEntries is the number of entries left under the <ip,mac> key after the removal.
	deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
	if !deleted {
		logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
			nid, eid, peerIP, peerMac, localPeer, vtep)
	}

	n := d.network(nid)
	if n == nil {
		return nil
	}

	sbox := n.sandbox()
	if sbox == nil {
		return nil
	}

	// An encryption check failure is logged but does not abort the deletion.
	if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil {
		logrus.Warn(err)
	}

	// Local peers do not have any local configuration to delete
	if !localPeer {
		// Remove fdb entry to the bridge for the peer mac
		if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
			if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
				// We end up here if there is a transient state and the neighbor being deleted
				// was never configured into the kernel (only 1 configuration at a time is allowed per <ip,mac> mapping)
				return nil
			}
			return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
		}

		// Delete neighbor entry for the peer IP
		if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
			return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
		}
	}

	if dbEntries == 0 {
		return nil
	}

	// If there is still an entry in the database and the deletion went through without errors, there is now no
	// configuration active in the kernel.
	// Restore one configuration for the <ip,mac> directly from the database; at least one entry is expected to exist.
	// NOTE(review): peerDbSearch matches on the peer IP only, so the restored entry uses the MAC of whichever
	// key matched. The two locals below shadow the peerKey and peerEntry types for the rest of the function.
	peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
	if err != nil {
		logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
		return err
	}
	return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
}
490
491func (d *driver) peerFlush(nid string) {
492	d.peerOpCh <- &peerOperation{
493		opType:     peerOperationFLUSH,
494		networkID:  nid,
495		callerName: caller.Name(1),
496	}
497}
498
499func (d *driver) peerFlushOp(nid string) error {
500	d.peerDb.Lock()
501	defer d.peerDb.Unlock()
502	_, ok := d.peerDb.mp[nid]
503	if !ok {
504		return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
505	}
506	delete(d.peerDb.mp, nid)
507	return nil
508}
509
510func (d *driver) pushLocalDb() {
511	d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool {
512		if pEntry.isLocal {
513			d.pushLocalEndpointEvent("join", nid, pEntry.eid)
514		}
515		return false
516	})
517}
518
519func (d *driver) peerDBUpdateSelf() {
520	d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool {
521		if pEntry.isLocal {
522			pEntry.vtep = net.ParseIP(d.advertiseAddress)
523		}
524		return false
525	})
526}
527