1package overlay
2
3import (
4	"context"
5	"fmt"
6	"net"
7	"sync"
8	"syscall"
9
10	"github.com/docker/libnetwork/common"
11	"github.com/docker/libnetwork/osl"
12	"github.com/sirupsen/logrus"
13)
14
15const ovPeerTable = "overlay_peer_table"
16
// peerKey identifies a peer inside a network's peer table by its IP and MAC
// address. Its textual form (see String and Scan) is what peerDbAdd and
// peerDbDelete use as the key of the per-network set matrix.
type peerKey struct {
	peerIP  net.IP
	peerMac net.HardwareAddr
}
21
// peerEntry is the in-memory description of a peer stored under a peerKey.
type peerEntry struct {
	// endpoint ID the peer was added with
	eid        string
	// address used as destination of the peer's bridge fdb entry (see peerAddOp)
	vtep       net.IP
	// mask paired with the peer IP when looking up the subnet (see peerAddOp)
	peerIPMask net.IPMask
	// value of the localPeer flag the peer was added with
	isLocal    bool
}
28
29func (p *peerEntry) MarshalDB() peerEntryDB {
30	ones, bits := p.peerIPMask.Size()
31	return peerEntryDB{
32		eid:            p.eid,
33		vtep:           p.vtep.String(),
34		peerIPMaskOnes: ones,
35		peerIPMaskBits: bits,
36		isLocal:        p.isLocal,
37	}
38}
39
// peerEntryDB is the structure saved into the set (SetMatrix). Because of the
// set implementation, the inserted value has to be hashable, so the []byte
// based fields of peerEntry (addresses and mask) are converted into strings
// and integers.
type peerEntryDB struct {
	eid            string
	vtep           string
	peerIPMaskOnes int
	peerIPMaskBits int
	isLocal        bool
}
50
51func (p *peerEntryDB) UnMarshalDB() peerEntry {
52	return peerEntry{
53		eid:        p.eid,
54		vtep:       net.ParseIP(p.vtep),
55		peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
56		isLocal:    p.isLocal,
57	}
58}
59
// peerMap is the peer table of a single network. Code in this file takes the
// embedded mutex before touching mp.
type peerMap struct {
	// set matrix keyed by peerKey.String() holding peerEntryDB values; note they have to be
	// objects and not pointers to maintain the proper equality checks
	mp common.SetMatrix
	sync.Mutex
}
65
// peerNetworkMap groups the per-network peer tables.
// NOTE(review): presumably this is the type of driver.peerDb, which is
// declared outside this file — confirm.
type peerNetworkMap struct {
	// per-network peer tables; driver.peerDb.mp is indexed by network ID throughout this file
	mp map[string]*peerMap
	sync.Mutex
}
71
72func (pKey peerKey) String() string {
73	return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac)
74}
75
76func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
77	ipB, err := state.Token(true, nil)
78	if err != nil {
79		return err
80	}
81
82	pKey.peerIP = net.ParseIP(string(ipB))
83
84	macB, err := state.Token(true, nil)
85	if err != nil {
86		return err
87	}
88
89	pKey.peerMac, err = net.ParseMAC(string(macB))
90	return err
91}
92
93func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
94	d.peerDb.Lock()
95	nids := []string{}
96	for nid := range d.peerDb.mp {
97		nids = append(nids, nid)
98	}
99	d.peerDb.Unlock()
100
101	for _, nid := range nids {
102		d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
103			return f(nid, pKey, pEntry)
104		})
105	}
106	return nil
107}
108
109func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error {
110	d.peerDb.Lock()
111	pMap, ok := d.peerDb.mp[nid]
112	d.peerDb.Unlock()
113
114	if !ok {
115		return nil
116	}
117
118	mp := map[string]peerEntry{}
119	pMap.Lock()
120	for _, pKeyStr := range pMap.mp.Keys() {
121		entryDBList, ok := pMap.mp.Get(pKeyStr)
122		if ok {
123			peerEntryDB := entryDBList[0].(peerEntryDB)
124			mp[pKeyStr] = peerEntryDB.UnMarshalDB()
125		}
126	}
127	pMap.Unlock()
128
129	for pKeyStr, pEntry := range mp {
130		var pKey peerKey
131		if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil {
132			logrus.Warnf("Peer key scan on network %s failed: %v", nid, err)
133		}
134		if f(&pKey, &pEntry) {
135			return nil
136		}
137	}
138
139	return nil
140}
141
142func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
143	var pKeyMatched *peerKey
144	var pEntryMatched *peerEntry
145	err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
146		if pKey.peerIP.Equal(peerIP) {
147			pKeyMatched = pKey
148			pEntryMatched = pEntry
149			return true
150		}
151
152		return false
153	})
154
155	if err != nil {
156		return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
157	}
158
159	if pKeyMatched == nil || pEntryMatched == nil {
160		return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
161	}
162
163	return pKeyMatched, pEntryMatched, nil
164}
165
166func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
167	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
168
169	d.peerDb.Lock()
170	pMap, ok := d.peerDb.mp[nid]
171	if !ok {
172		d.peerDb.mp[nid] = &peerMap{
173			mp: common.NewSetMatrix(),
174		}
175
176		pMap = d.peerDb.mp[nid]
177	}
178	d.peerDb.Unlock()
179
180	pKey := peerKey{
181		peerIP:  peerIP,
182		peerMac: peerMac,
183	}
184
185	pEntry := peerEntry{
186		eid:        eid,
187		vtep:       vtep,
188		peerIPMask: peerIPMask,
189		isLocal:    isLocal,
190	}
191
192	pMap.Lock()
193	defer pMap.Unlock()
194	b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
195	if i != 1 {
196		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
197		s, _ := pMap.mp.String(pKey.String())
198		logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
199	}
200	return b, i
201}
202
203func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
204	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
205
206	d.peerDb.Lock()
207	pMap, ok := d.peerDb.mp[nid]
208	if !ok {
209		d.peerDb.Unlock()
210		return false, 0
211	}
212	d.peerDb.Unlock()
213
214	pKey := peerKey{
215		peerIP:  peerIP,
216		peerMac: peerMac,
217	}
218
219	pEntry := peerEntry{
220		eid:        eid,
221		vtep:       vtep,
222		peerIPMask: peerIPMask,
223		isLocal:    isLocal,
224	}
225
226	pMap.Lock()
227	defer pMap.Unlock()
228	b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
229	if i != 0 {
230		// Transient case, there is more than one endpoint that is using the same IP,MAC pair
231		s, _ := pMap.mp.String(pKey.String())
232		logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
233	}
234	return b, i
235}
236
// initSandboxPeerDB configures the sandbox of network nid with the peers
// already stored in the peer database.
//
// The overlay uses a lazy initialization approach: when a network is created
// and the driver is registered, the overlay does not allocate resources until
// a sandbox is actually created.
// At the moment of this call, which happens when a sandbox is initialized, it
// is possible that networkDB has already delivered events for peers available
// on remote nodes. Those peers are saved into the peerDB, and this function is
// used to properly configure the network sandbox with all the peers that were
// previously notified.
// Note also that this method sends a single message on the channel, and the
// goroutine on the other side loops over the whole table of peers and programs
// their state in one single atomic operation. This is fundamental to guarantee
// consistency and to avoid that a new peerAdd or peerDelete gets reordered
// during the sandbox init.
func (d *driver) initSandboxPeerDB(nid string) {
	d.peerInit(nid)
}
251
// peerOperationType selects which peer operation peerOpRoutine executes.
type peerOperationType int32

const (
	// peerOperationINIT is dispatched to peerInitOp.
	peerOperationINIT peerOperationType = iota
	// peerOperationADD is dispatched to peerAddOp.
	peerOperationADD
	// peerOperationDELETE is dispatched to peerDeleteOp.
	peerOperationDELETE
	// peerOperationFLUSH is dispatched to peerFlushOp.
	peerOperationFLUSH
)
260
// peerOperation is the message sent on the driver's peer operation channel and
// consumed by peerOpRoutine. INIT and FLUSH only use networkID; ADD and DELETE
// also carry the peer description.
type peerOperation struct {
	opType     peerOperationType
	networkID  string
	endpointID string
	peerIP     net.IP
	peerIPMask net.IPMask
	peerMac    net.HardwareAddr
	vtepIP     net.IP
	// l2Miss and l3Miss are only forwarded by ADD operations
	l2Miss     bool
	l3Miss     bool
	localPeer  bool
	// name reported by common.CallerName(1) when the operation was queued; in this
	// file it is only visible when the whole operation is logged
	callerName string
}
274
275func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
276	var err error
277	for {
278		select {
279		case <-ctx.Done():
280			return
281		case op := <-ch:
282			switch op.opType {
283			case peerOperationINIT:
284				err = d.peerInitOp(op.networkID)
285			case peerOperationADD:
286				err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
287			case peerOperationDELETE:
288				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
289			case peerOperationFLUSH:
290				err = d.peerFlushOp(op.networkID)
291			}
292			if err != nil {
293				logrus.Warnf("Peer operation failed:%s op:%v", err, op)
294			}
295		}
296	}
297}
298
299func (d *driver) peerInit(nid string) {
300	callerName := common.CallerName(1)
301	d.peerOpCh <- &peerOperation{
302		opType:     peerOperationINIT,
303		networkID:  nid,
304		callerName: callerName,
305	}
306}
307
308func (d *driver) peerInitOp(nid string) error {
309	return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
310		// Local entries do not need to be added
311		if pEntry.isLocal {
312			return false
313		}
314
315		d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal)
316		// return false to loop on all entries
317		return false
318	})
319}
320
321func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
322	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
323	d.peerOpCh <- &peerOperation{
324		opType:     peerOperationADD,
325		networkID:  nid,
326		endpointID: eid,
327		peerIP:     peerIP,
328		peerIPMask: peerIPMask,
329		peerMac:    peerMac,
330		vtepIP:     vtep,
331		l2Miss:     l2Miss,
332		l3Miss:     l3Miss,
333		localPeer:  localPeer,
334		callerName: common.CallerName(1),
335	}
336}
337
338func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
339	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
340
341	if err := validateID(nid, eid); err != nil {
342		return err
343	}
344
345	var dbEntries int
346	var inserted bool
347	if updateDB {
348		inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
349		if !inserted {
350			logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
351				nid, eid, peerIP, peerMac, localPeer, vtep)
352		}
353	}
354
355	// Local peers do not need any further configuration
356	if localPeer {
357		return nil
358	}
359
360	n := d.network(nid)
361	if n == nil {
362		return nil
363	}
364
365	sbox := n.sandbox()
366	if sbox == nil {
367		// We are hitting this case for all the events that are arriving before that the sandbox
368		// is being created. The peer got already added into the database and the sanbox init will
369		// call the peerDbUpdateSandbox that will configure all these peers from the database
370		return nil
371	}
372
373	IP := &net.IPNet{
374		IP:   peerIP,
375		Mask: peerIPMask,
376	}
377
378	s := n.getSubnetforIP(IP)
379	if s == nil {
380		return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id)
381	}
382
383	if err := n.obtainVxlanID(s); err != nil {
384		return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
385	}
386
387	if err := n.joinSubnetSandbox(s, false); err != nil {
388		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
389	}
390
391	if err := d.checkEncryption(nid, vtep, n.vxlanID(s), false, true); err != nil {
392		logrus.Warn(err)
393	}
394
395	// Add neighbor entry for the peer IP
396	if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
397		if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
398			// We are in the transient case so only the first configuration is programmed into the kernel
399			// Upon deletion if the active configuration is deleted the next one from the database will be restored
400			// Note we are skipping also the next configuration
401			return nil
402		}
403		return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
404	}
405
406	// Add fdb entry to the bridge for the peer mac
407	if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
408		sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
409		return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
410	}
411
412	return nil
413}
414
415func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
416	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
417	d.peerOpCh <- &peerOperation{
418		opType:     peerOperationDELETE,
419		networkID:  nid,
420		endpointID: eid,
421		peerIP:     peerIP,
422		peerIPMask: peerIPMask,
423		peerMac:    peerMac,
424		vtepIP:     vtep,
425		callerName: common.CallerName(1),
426		localPeer:  localPeer,
427	}
428}
429
430func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
431	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
432
433	if err := validateID(nid, eid); err != nil {
434		return err
435	}
436
437	deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
438	if !deleted {
439		logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
440			nid, eid, peerIP, peerMac, localPeer, vtep)
441	}
442
443	n := d.network(nid)
444	if n == nil {
445		return nil
446	}
447
448	sbox := n.sandbox()
449	if sbox == nil {
450		return nil
451	}
452
453	if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil {
454		logrus.Warn(err)
455	}
456
457	// Local peers do not have any local configuration to delete
458	if !localPeer {
459		// Remove fdb entry to the bridge for the peer mac
460		if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
461			if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
462				// We fall in here if there is a transient state and if the neighbor that is being deleted
463				// was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping)
464				return nil
465			}
466			return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
467		}
468
469		// Delete neighbor entry for the peer IP
470		if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
471			return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
472		}
473	}
474
475	if dbEntries == 0 {
476		return nil
477	}
478
479	// If there is still an entry into the database and the deletion went through without errors means that there is now no
480	// configuration active in the kernel.
481	// Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one
482	peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
483	if err != nil {
484		logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
485		return err
486	}
487	return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
488}
489
490func (d *driver) peerFlush(nid string) {
491	d.peerOpCh <- &peerOperation{
492		opType:     peerOperationFLUSH,
493		networkID:  nid,
494		callerName: common.CallerName(1),
495	}
496}
497
498func (d *driver) peerFlushOp(nid string) error {
499	d.peerDb.Lock()
500	defer d.peerDb.Unlock()
501	_, ok := d.peerDb.mp[nid]
502	if !ok {
503		return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
504	}
505	delete(d.peerDb.mp, nid)
506	return nil
507}
508
509func (d *driver) pushLocalDb() {
510	d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool {
511		if pEntry.isLocal {
512			d.pushLocalEndpointEvent("join", nid, pEntry.eid)
513		}
514		return false
515	})
516}
517
518func (d *driver) peerDBUpdateSelf() {
519	d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool {
520		if pEntry.isLocal {
521			pEntry.vtep = net.ParseIP(d.advertiseAddress)
522		}
523		return false
524	})
525}
526