1/*
2 *
3 * Copyright 2018 gRPC authors.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19package channelz
20
21import (
22	"net"
23	"sync"
24	"sync/atomic"
25	"time"
26
27	"google.golang.org/grpc/connectivity"
28	"google.golang.org/grpc/credentials"
29)
30
// entry represents a node in the channelz database.
type entry interface {
	// addChild adds a child e, whose channelz id is id, to the child list.
	addChild(id int64, e entry)
	// deleteChild deletes the child with the given channelz id from the child list.
	deleteChild(id int64)
	// triggerDelete tries to delete self from the channelz database. However, if
	// the child list is not empty, deletion from the database is put on hold until
	// the last child is deleted from the database.
	triggerDelete()
	// deleteSelfIfReady checks whether triggerDelete() has been called before, and
	// whether the child list is now empty. If both conditions are met, it deletes
	// self from the database.
	deleteSelfIfReady()
	// getParentID returns the parent ID of the entry. A parent ID of 0 means no parent.
	getParentID() int64
}
47
// dummyEntry is a fake entry used to handle the entry-not-found case.
type dummyEntry struct {
	// idNotFound is the id reported in the log messages emitted by this
	// entry's methods.
	idNotFound int64
}
52
// addChild does not record the child; it only logs, at info level, that a child
// was added to a parent that does not currently exist.
func (d *dummyEntry) addChild(id int64, e entry) {
	// Note: a normal program can reach this point under a race condition.
	// For example, there can be a race between the ClientConn.Close() information
	// being propagated to addrConn and to http2Client. ClientConn.Close() cancels
	// the context, which causes http2Client to error. The error is then caught by
	// the transport monitor before addrConn.tearDown() is called inside
	// ClientConn.Close(). Therefore, the addrConn creates a new transport. When the
	// new transport is registered in channelz, its parent addrConn may already have
	// been torn down and deleted from channelz tracking, which leads here.
	logger.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound)
}
64
// deleteChild deletes nothing; it only logs, at info level, that a child was
// deleted from a parent that does not currently exist.
func (d *dummyEntry) deleteChild(id int64) {
	// A normal program can reach this point under a race condition.
	// Refer to the example described in addChild().
	logger.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound)
}
70
// triggerDelete deletes nothing; it logs a warning that the entry identified by
// idNotFound does not currently exist.
func (d *dummyEntry) triggerDelete() {
	logger.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound)
}
74
// deleteSelfIfReady is a no-op that exists to satisfy the entry interface.
func (*dummyEntry) deleteSelfIfReady() {
	// Code should not reach here: deleteSelfIfReady is always called on an
	// existing entry.
}
78
// getParentID always returns 0, the value the entry interface uses for
// "no parent".
func (*dummyEntry) getParentID() int64 {
	return 0
}
82
// ChannelMetric defines the info channelz provides for a specific Channel, which
// includes ChannelInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ChannelMetric struct {
	// ID is the channelz id of this channel.
	ID int64
	// RefName is the human readable reference string of this channel.
	RefName string
	// ChannelData contains the channel internal metric reported by the channel
	// through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this channel in the
	// format of a map from nested channel channelz id to corresponding reference
	// string.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this channel in the format
	// of a map from subchannel channelz id to corresponding reference string.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this channel in the format of a
	// map from socket channelz id to corresponding reference string.
	// Note: the current grpc implementation doesn't allow a channel to have
	// sockets directly; therefore, this field is unused.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
108
// SubChannelMetric defines the info channelz provides for a specific SubChannel,
// which includes ChannelInternalMetric and channelz-specific data, such as
// channelz id, child list, etc.
type SubChannelMetric struct {
	// ID is the channelz id of this subchannel.
	ID int64
	// RefName is the human readable reference string of this subchannel.
	RefName string
	// ChannelData contains the subchannel internal metric reported by the
	// subchannel through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested channel type children of this subchannel in
	// the format of a map from nested channel channelz id to corresponding
	// reference string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// nested channels as children; therefore, this field is unused.
	NestedChans map[int64]string
	// SubChans tracks the subchannel type children of this subchannel in the
	// format of a map from subchannel channelz id to corresponding reference
	// string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// subchannels as children; therefore, this field is unused.
	SubChans map[int64]string
	// Sockets tracks the socket type children of this subchannel in the format of
	// a map from socket channelz id to corresponding reference string.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
136
// ChannelInternalMetric defines the struct that the implementor of the Channel
// interface should return from ChannelzMetric().
type ChannelInternalMetric struct {
	// State is the current connectivity state of the channel.
	State connectivity.State
	// Target is the target this channel originally tried to connect to. May be
	// absent.
	Target string
	// CallsStarted is the number of calls started on the channel.
	CallsStarted int64
	// CallsSucceeded is the number of calls that have completed with an OK status.
	CallsSucceeded int64
	// CallsFailed is the number of calls that have completed with a non-OK status.
	CallsFailed int64
	// LastCallStartedTimestamp is the last time a call was started on the channel.
	LastCallStartedTimestamp time.Time
}
153
// ChannelTrace stores traced events on a channel/subchannel and related info.
type ChannelTrace struct {
	// EventNum is the number of events that ever got traced (i.e. including those
	// that have been deleted).
	EventNum int64
	// CreationTime is the creation time of the trace.
	CreationTime time.Time
	// Events stores the most recent trace events (up to $maxTraceEntry; a newer
	// event overwrites the oldest one).
	Events []*TraceEvent
}
164
// TraceEvent represents a single trace event.
type TraceEvent struct {
	// Desc is a simple description of the trace event.
	Desc string
	// Severity states the severity of this trace event.
	Severity Severity
	// Timestamp is the event time.
	Timestamp time.Time
	// RefID is the id of the entity that gets referenced in the event. RefID is 0
	// if no other entity is involved in this event.
	// e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside [])
	RefID int64
	// RefName is the reference name for the entity that gets referenced in the event.
	RefName string
	// RefType indicates the referenced entity type, i.e. Channel or SubChannel.
	RefType RefChannelType
}
182
// Channel is the interface that should be satisfied in order to be tracked by
// channelz as Channel or SubChannel.
type Channel interface {
	// ChannelzMetric returns the internal metric of the channel or subchannel.
	ChannelzMetric() *ChannelInternalMetric
}
188
189type dummyChannel struct{}
190
191func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric {
192	return &ChannelInternalMetric{}
193}
194
// channel is the channelz database entry for a Channel.
//
// refName is the reference string returned by getRefName. c is the tracked grpc
// object; deleteSelfFromMap swaps it for a dummyChannel when the entry has to
// outlive the grpc object. closeCalled is set by triggerDelete. nestedChans and
// subChans map child ids to their reference strings and are filled by addChild.
// id is this entry's channelz id and pid is its parent's id, where 0 marks a
// top channel. cm is the channelMap the entry is looked up in and deleted from.
// trace holds the channel's trace events.
type channel struct {
	refName     string
	c           Channel
	closeCalled bool
	nestedChans map[int64]string
	subChans    map[int64]string
	id          int64
	pid         int64
	cm          *channelMap
	trace       *channelTrace
	// traceRefCount is the number of trace events that reference this channel.
	// Non-zero traceRefCount means the trace of this channel cannot be deleted.
	traceRefCount int32
}
209
210func (c *channel) addChild(id int64, e entry) {
211	switch v := e.(type) {
212	case *subChannel:
213		c.subChans[id] = v.refName
214	case *channel:
215		c.nestedChans[id] = v.refName
216	default:
217		logger.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e)
218	}
219}
220
// deleteChild removes id from both child maps (deleting a key that is absent is
// a no-op), then re-checks whether the channel itself can now be deleted.
func (c *channel) deleteChild(id int64) {
	delete(c.subChans, id)
	delete(c.nestedChans, id)
	c.deleteSelfIfReady()
}
226
// triggerDelete marks the channel as closed and attempts the deletion, which
// only goes through if deleteSelfIfReady finds its conditions met.
func (c *channel) triggerDelete() {
	c.closeCalled = true
	c.deleteSelfIfReady()
}
231
// getParentID returns the id of the channel's parent; 0 means the channel is a
// top channel.
func (c *channel) getParentID() int64 {
	return c.pid
}
235
236// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means
237// deleting the channel reference from its parent's child list.
238//
239// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the
240// corresponding grpc object has been invoked, and the channel does not have any children left.
241//
242// The returned boolean value indicates whether the channel has been successfully deleted from tree.
243func (c *channel) deleteSelfFromTree() (deleted bool) {
244	if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 {
245		return false
246	}
247	// not top channel
248	if c.pid != 0 {
249		c.cm.findEntry(c.pid).deleteChild(c.id)
250	}
251	return true
252}
253
254// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means
255// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the
256// channel, and its memory will be garbage collected.
257//
258// The trace reference count of the channel must be 0 in order to be deleted from the map. This is
259// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
260// the trace of the referenced entity must not be deleted. In order to release the resource allocated
261// by grpc, the reference to the grpc object is reset to a dummy object.
262//
263// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
264//
265// It returns a bool to indicate whether the channel can be safely deleted from map.
266func (c *channel) deleteSelfFromMap() (delete bool) {
267	if c.getTraceRefCount() != 0 {
268		c.c = &dummyChannel{}
269		return false
270	}
271	return true
272}
273
274// deleteSelfIfReady tries to delete the channel itself from the channelz database.
275// The delete process includes two steps:
276// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its
277//    parent's child list.
278// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id
279//    will return entry not found error.
280func (c *channel) deleteSelfIfReady() {
281	if !c.deleteSelfFromTree() {
282		return
283	}
284	if !c.deleteSelfFromMap() {
285		return
286	}
287	c.cm.deleteEntry(c.id)
288	c.trace.clear()
289}
290
// getChannelTrace returns the trace attached to this channel.
func (c *channel) getChannelTrace() *channelTrace {
	return c.trace
}
294
// incrTraceRefCount atomically adds one to traceRefCount.
func (c *channel) incrTraceRefCount() {
	atomic.AddInt32(&c.traceRefCount, 1)
}
298
// decrTraceRefCount atomically subtracts one from traceRefCount.
func (c *channel) decrTraceRefCount() {
	atomic.AddInt32(&c.traceRefCount, -1)
}
302
303func (c *channel) getTraceRefCount() int {
304	i := atomic.LoadInt32(&c.traceRefCount)
305	return int(i)
306}
307
// getRefName returns the reference string of this channel.
func (c *channel) getRefName() string {
	return c.refName
}
311
// subChannel is the channelz database entry for a SubChannel.
//
// refName is the reference string returned by getRefName. c is the tracked grpc
// object; deleteSelfFromMap swaps it for a dummyChannel when the entry has to
// outlive the grpc object. closeCalled is set by triggerDelete. sockets maps
// child socket ids to their reference strings and is filled by addChild. id is
// this entry's channelz id and pid is its parent's id. cm is the channelMap the
// entry is looked up in and deleted from. trace holds the subchannel's trace
// events, and traceRefCount is read and updated atomically by the
// *TraceRefCount methods.
type subChannel struct {
	refName       string
	c             Channel
	closeCalled   bool
	sockets       map[int64]string
	id            int64
	pid           int64
	cm            *channelMap
	trace         *channelTrace
	traceRefCount int32
}
323
324func (sc *subChannel) addChild(id int64, e entry) {
325	if v, ok := e.(*normalSocket); ok {
326		sc.sockets[id] = v.refName
327	} else {
328		logger.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e)
329	}
330}
331
// deleteChild removes id from the socket map, then re-checks whether the
// subchannel itself can now be deleted.
func (sc *subChannel) deleteChild(id int64) {
	delete(sc.sockets, id)
	sc.deleteSelfIfReady()
}
336
// triggerDelete marks the subchannel as closed and attempts the deletion, which
// only goes through if deleteSelfIfReady finds its conditions met.
func (sc *subChannel) triggerDelete() {
	sc.closeCalled = true
	sc.deleteSelfIfReady()
}
341
// getParentID returns the id of the subchannel's parent.
func (sc *subChannel) getParentID() int64 {
	return sc.pid
}
345
346// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which
347// means deleting the subchannel reference from its parent's child list.
348//
349// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of
350// the corresponding grpc object has been invoked, and the subchannel does not have any children left.
351//
352// The returned boolean value indicates whether the channel has been successfully deleted from tree.
353func (sc *subChannel) deleteSelfFromTree() (deleted bool) {
354	if !sc.closeCalled || len(sc.sockets) != 0 {
355		return false
356	}
357	sc.cm.findEntry(sc.pid).deleteChild(sc.id)
358	return true
359}
360
361// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means
362// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query
363// the subchannel, and its memory will be garbage collected.
364//
365// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is
366// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
367// the trace of the referenced entity must not be deleted. In order to release the resource allocated
368// by grpc, the reference to the grpc object is reset to a dummy object.
369//
370// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
371//
372// It returns a bool to indicate whether the channel can be safely deleted from map.
373func (sc *subChannel) deleteSelfFromMap() (delete bool) {
374	if sc.getTraceRefCount() != 0 {
375		// free the grpc struct (i.e. addrConn)
376		sc.c = &dummyChannel{}
377		return false
378	}
379	return true
380}
381
382// deleteSelfIfReady tries to delete the subchannel itself from the channelz database.
383// The delete process includes two steps:
384// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from
385//    its parent's child list.
386// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup
387//    by id will return entry not found error.
388func (sc *subChannel) deleteSelfIfReady() {
389	if !sc.deleteSelfFromTree() {
390		return
391	}
392	if !sc.deleteSelfFromMap() {
393		return
394	}
395	sc.cm.deleteEntry(sc.id)
396	sc.trace.clear()
397}
398
// getChannelTrace returns the trace attached to this subchannel.
func (sc *subChannel) getChannelTrace() *channelTrace {
	return sc.trace
}
402
// incrTraceRefCount atomically adds one to traceRefCount.
func (sc *subChannel) incrTraceRefCount() {
	atomic.AddInt32(&sc.traceRefCount, 1)
}
406
// decrTraceRefCount atomically subtracts one from traceRefCount.
func (sc *subChannel) decrTraceRefCount() {
	atomic.AddInt32(&sc.traceRefCount, -1)
}
410
411func (sc *subChannel) getTraceRefCount() int {
412	i := atomic.LoadInt32(&sc.traceRefCount)
413	return int(i)
414}
415
// getRefName returns the reference string of this subchannel.
func (sc *subChannel) getRefName() string {
	return sc.refName
}
419
// SocketMetric defines the info channelz provides for a specific Socket, which
// includes SocketInternalMetric and channelz-specific data, such as channelz id, etc.
type SocketMetric struct {
	// ID is the channelz id of this socket.
	ID int64
	// RefName is the human readable reference string of this socket.
	RefName string
	// SocketData contains the socket internal metric reported by the socket
	// through ChannelzMetric().
	SocketData *SocketInternalMetric
}
431
// SocketInternalMetric defines the struct that the implementor of the Socket
// interface should return from ChannelzMetric().
type SocketInternalMetric struct {
	// The number of streams that have been started.
	StreamsStarted int64
	// The number of streams that have ended successfully:
	// On client side, receiving frame with eos bit set.
	// On server side, sending frame with eos bit set.
	StreamsSucceeded int64
	// The number of streams that have ended unsuccessfully:
	// On client side, termination without receiving frame with eos bit set.
	// On server side, termination without sending frame with eos bit set.
	StreamsFailed int64
	// MessagesSent is the number of messages successfully sent on this socket.
	// MessagesReceived has no description of its own in this file; presumably
	// it is the matching count of messages received on this socket.
	MessagesSent     int64
	MessagesReceived int64
	// The number of keep alives sent.  This is typically implemented with HTTP/2
	// ping messages.
	KeepAlivesSent int64
	// The last time a stream was created by this endpoint.  Usually unset for
	// servers.
	LastLocalStreamCreatedTimestamp time.Time
	// The last time a stream was created by the remote endpoint.  Usually unset
	// for clients.
	LastRemoteStreamCreatedTimestamp time.Time
	// The last time a message was sent by this endpoint.
	LastMessageSentTimestamp time.Time
	// The last time a message was received by this endpoint.
	LastMessageReceivedTimestamp time.Time
	// The amount of window granted to the local endpoint by the remote endpoint.
	// This may be slightly out of date due to network latency.  This does NOT
	// include stream level or TCP level flow control info.
	LocalFlowControlWindow int64
	// The amount of window granted to the remote endpoint by the local endpoint.
	// This may be slightly out of date due to network latency.  This does NOT
	// include stream level or TCP level flow control info.
	RemoteFlowControlWindow int64
	// The locally bound address.
	LocalAddr net.Addr
	// The remote bound address.  May be absent.
	RemoteAddr net.Addr
	// RemoteName is optional and represents the name of the remote endpoint, if
	// different from the original target name.
	// NOTE(review): SocketOptions and Security are undocumented here; their types
	// are declared outside this file, so confirm their meaning there.
	RemoteName    string
	SocketOptions *SocketOptionData
	Security      credentials.ChannelzSecurityValue
}
479
// Socket is the interface that should be satisfied in order to be tracked by
// channelz as Socket.
type Socket interface {
	// ChannelzMetric returns the internal metric of the socket.
	ChannelzMetric() *SocketInternalMetric
}
485
// listenSocket is the channelz database entry for a listen socket, which a
// server accepts as a child in its addChild method.
//
// refName is the reference string the parent server records for this socket.
// s is the tracked Socket. id is this entry's channelz id and pid is its
// parent's id. cm is the channelMap the entry is looked up in and deleted from.
type listenSocket struct {
	refName string
	s       Socket
	id      int64
	pid     int64
	cm      *channelMap
}
493
// addChild stores nothing: a listen socket cannot have children, so the attempt
// is reported with an error log.
func (ls *listenSocket) addChild(id int64, e entry) {
	logger.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e)
}
497
// deleteChild deletes nothing: a listen socket cannot have children, so the
// attempt is reported with an error log.
func (ls *listenSocket) deleteChild(id int64) {
	logger.Errorf("cannot delete a child (id = %d) from a listen socket", id)
}
501
// triggerDelete deletes the socket's own entry through cm.deleteEntry, then
// asks the parent entry (looked up by pid) to drop this socket from its child
// list. No condition is checked first; the deletion is immediate.
func (ls *listenSocket) triggerDelete() {
	ls.cm.deleteEntry(ls.id)
	ls.cm.findEntry(ls.pid).deleteChild(ls.id)
}
506
// deleteSelfIfReady is not supported on a listen socket; calling it only logs
// an error. The socket is deleted directly by triggerDelete instead.
func (ls *listenSocket) deleteSelfIfReady() {
	logger.Errorf("cannot call deleteSelfIfReady on a listen socket")
}
510
// getParentID returns the id of the listen socket's parent.
func (ls *listenSocket) getParentID() int64 {
	return ls.pid
}
514
// normalSocket is the channelz database entry for a normal socket, which both
// a subChannel and a server accept as a child in their addChild methods.
//
// refName is the reference string the parent records for this socket. s is the
// tracked Socket. id is this entry's channelz id and pid is its parent's id.
// cm is the channelMap the entry is looked up in and deleted from.
type normalSocket struct {
	refName string
	s       Socket
	id      int64
	pid     int64
	cm      *channelMap
}
522
// addChild stores nothing: a normal socket cannot have children, so the attempt
// is reported with an error log.
func (ns *normalSocket) addChild(id int64, e entry) {
	logger.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e)
}
526
// deleteChild deletes nothing: a normal socket cannot have children, so the
// attempt is reported with an error log.
func (ns *normalSocket) deleteChild(id int64) {
	logger.Errorf("cannot delete a child (id = %d) from a normal socket", id)
}
530
// triggerDelete deletes the socket's own entry through cm.deleteEntry, then
// asks the parent entry (looked up by pid) to drop this socket from its child
// list. No condition is checked first; the deletion is immediate.
func (ns *normalSocket) triggerDelete() {
	ns.cm.deleteEntry(ns.id)
	ns.cm.findEntry(ns.pid).deleteChild(ns.id)
}
535
// deleteSelfIfReady is not supported on a normal socket; calling it only logs
// an error. The socket is deleted directly by triggerDelete instead.
func (ns *normalSocket) deleteSelfIfReady() {
	logger.Errorf("cannot call deleteSelfIfReady on a normal socket")
}
539
// getParentID returns the id of the normal socket's parent.
func (ns *normalSocket) getParentID() int64 {
	return ns.pid
}
543
// ServerMetric defines the info channelz provides for a specific Server, which
// includes ServerInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ServerMetric struct {
	// ID is the channelz id of this server.
	ID int64
	// RefName is the human readable reference string of this server.
	RefName string
	// ServerData contains the server internal metric reported by the server
	// through ChannelzMetric().
	ServerData *ServerInternalMetric
	// ListenSockets tracks the listener socket type children of this server in
	// the format of a map from socket channelz id to corresponding reference
	// string.
	ListenSockets map[int64]string
}
559
// ServerInternalMetric defines the struct that the implementor of the Server
// interface should return from ChannelzMetric().
type ServerInternalMetric struct {
	// CallsStarted is the number of incoming calls started on the server.
	CallsStarted int64
	// CallsSucceeded is the number of incoming calls that have completed with an
	// OK status.
	CallsSucceeded int64
	// CallsFailed is the number of incoming calls that have completed with a
	// non-OK status.
	CallsFailed int64
	// LastCallStartedTimestamp is the last time a call was started on the server.
	LastCallStartedTimestamp time.Time
}
572
// Server is the interface to be satisfied in order to be tracked by channelz as
// Server.
type Server interface {
	// ChannelzMetric returns the internal metric of the server.
	ChannelzMetric() *ServerInternalMetric
}
578
// server is the channelz database entry for a Server.
//
// refName is the server's reference string and s is the tracked Server.
// closeCalled is set by triggerDelete. sockets and listenSockets map child ids
// to their reference strings and are filled by addChild. id is this entry's
// channelz id. cm is the channelMap the entry is deleted from. A server has no
// parent: getParentID always returns 0.
type server struct {
	refName       string
	s             Server
	closeCalled   bool
	sockets       map[int64]string
	listenSockets map[int64]string
	id            int64
	cm            *channelMap
}
588
589func (s *server) addChild(id int64, e entry) {
590	switch v := e.(type) {
591	case *normalSocket:
592		s.sockets[id] = v.refName
593	case *listenSocket:
594		s.listenSockets[id] = v.refName
595	default:
596		logger.Errorf("cannot add a child (id = %d) of type %T to a server", id, e)
597	}
598}
599
// deleteChild removes id from both socket maps (deleting a key that is absent
// is a no-op), then re-checks whether the server itself can now be deleted.
func (s *server) deleteChild(id int64) {
	delete(s.sockets, id)
	delete(s.listenSockets, id)
	s.deleteSelfIfReady()
}
605
// triggerDelete marks the server as closed and attempts the deletion, which
// only goes through if deleteSelfIfReady finds its conditions met.
func (s *server) triggerDelete() {
	s.closeCalled = true
	s.deleteSelfIfReady()
}
610
611func (s *server) deleteSelfIfReady() {
612	if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 {
613		return
614	}
615	s.cm.deleteEntry(s.id)
616}
617
// getParentID always returns 0: a server has no parent.
func (s *server) getParentID() int64 {
	return 0
}
621
// tracedChannel is the set of trace-related methods implemented in this file by
// both channel and subChannel.
type tracedChannel interface {
	// getChannelTrace returns the trace attached to the entity.
	getChannelTrace() *channelTrace
	// incrTraceRefCount adds one to the entity's trace reference count.
	incrTraceRefCount()
	// decrTraceRefCount subtracts one from the entity's trace reference count.
	decrTraceRefCount()
	// getRefName returns the entity's reference string.
	getRefName() string
}
628
// channelTrace holds the trace events recorded for one channel or subchannel.
//
// cm is the channelMap used to release the trace reference of an entity when an
// event referencing it is dropped. createdTime and eventCount are reported by
// dumpData as CreationTime and EventNum. events holds the retained events,
// oldest first. The methods in this file take mu before touching events and
// eventCount.
type channelTrace struct {
	cm          *channelMap
	createdTime time.Time
	eventCount  int64
	mu          sync.Mutex
	events      []*TraceEvent
}
636
637func (c *channelTrace) append(e *TraceEvent) {
638	c.mu.Lock()
639	if len(c.events) == getMaxTraceEntry() {
640		del := c.events[0]
641		c.events = c.events[1:]
642		if del.RefID != 0 {
643			// start recursive cleanup in a goroutine to not block the call originated from grpc.
644			go func() {
645				// need to acquire c.cm.mu lock to call the unlocked attemptCleanup func.
646				c.cm.mu.Lock()
647				c.cm.decrTraceRefCount(del.RefID)
648				c.cm.mu.Unlock()
649			}()
650		}
651	}
652	e.Timestamp = time.Now()
653	c.events = append(c.events, e)
654	c.eventCount++
655	c.mu.Unlock()
656}
657
658func (c *channelTrace) clear() {
659	c.mu.Lock()
660	for _, e := range c.events {
661		if e.RefID != 0 {
662			// caller should have already held the c.cm.mu lock.
663			c.cm.decrTraceRefCount(e.RefID)
664		}
665	}
666	c.mu.Unlock()
667}
668
// Severity is the severity level of a trace event.
// The canonical enumeration of all valid values is here:
// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126.
type Severity int

// The values below are assigned by iota in declaration order, starting at 0.
const (
	// CtUnknown indicates unknown severity of a trace event.
	CtUnknown Severity = iota
	// CtInfo indicates info level severity of a trace event.
	CtInfo
	// CtWarning indicates warning level severity of a trace event.
	CtWarning
	// CtError indicates error level severity of a trace event.
	CtError
)
684
// RefChannelType is the type of the entity being referenced in a trace event.
type RefChannelType int

// The values below are assigned by iota in declaration order, starting at 0.
const (
	// RefChannel indicates the referenced entity is a Channel.
	RefChannel RefChannelType = iota
	// RefSubChannel indicates the referenced entity is a SubChannel.
	RefSubChannel
)
694
695func (c *channelTrace) dumpData() *ChannelTrace {
696	c.mu.Lock()
697	ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime}
698	ct.Events = c.events[:len(c.events)]
699	c.mu.Unlock()
700	return ct
701}
702