1/*
2 *
3 * Copyright 2018 gRPC authors.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 */
18
19package channelz
20
21import (
22	"net"
23	"sync"
24	"sync/atomic"
25	"time"
26
27	"google.golang.org/grpc/connectivity"
28	"google.golang.org/grpc/credentials"
29	"google.golang.org/grpc/grpclog"
30)
31
// entry represents a node in the channelz database.
//
// It is implemented by every entity type in this file (channel, subChannel,
// listenSocket, normalSocket and server) as well as by dummyEntry, the stand-in
// used when an entry cannot be found.
type entry interface {
	// addChild adds a child e, whose channelz id is id, to the child list.
	addChild(id int64, e entry)
	// deleteChild deletes the child whose channelz id is id from the child list.
	deleteChild(id int64)
	// triggerDelete tries to delete self from the channelz database. However, if the
	// child list is not empty, deletion from the database is put on hold until the
	// last child is deleted from the database.
	triggerDelete()
	// deleteSelfIfReady checks whether triggerDelete() has been called before, and
	// whether the child list is now empty. If both conditions are met, it deletes
	// self from the database.
	deleteSelfIfReady()
	// getParentID returns the parent ID of the entry. A parent ID of 0 means no parent.
	getParentID() int64
}
48
// dummyEntry is a fake entry used to handle the "entry not found" case. Its
// methods only log (or do nothing), so callers can invoke entry methods on the
// result of a failed lookup without a nil check.
type dummyEntry struct {
	// idNotFound is the id reported in this entry's log messages as the entry
	// that does not currently exist.
	idNotFound int64
}
53
54func (d *dummyEntry) addChild(id int64, e entry) {
55	// Note: It is possible for a normal program to reach here under race condition.
56	// For example, there could be a race between ClientConn.Close() info being propagated
57	// to addrConn and http2Client. ClientConn.Close() cancel the context and result
58	// in http2Client to error. The error info is then caught by transport monitor
59	// and before addrConn.tearDown() is called in side ClientConn.Close(). Therefore,
60	// the addrConn will create a new transport. And when registering the new transport in
61	// channelz, its parent addrConn could have already been torn down and deleted
62	// from channelz tracking, and thus reach the code here.
63	grpclog.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound)
64}
65
66func (d *dummyEntry) deleteChild(id int64) {
67	// It is possible for a normal program to reach here under race condition.
68	// Refer to the example described in addChild().
69	grpclog.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound)
70}
71
72func (d *dummyEntry) triggerDelete() {
73	grpclog.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound)
74}
75
76func (*dummyEntry) deleteSelfIfReady() {
77	// code should not reach here. deleteSelfIfReady is always called on an existing entry.
78}
79
80func (*dummyEntry) getParentID() int64 {
81	return 0
82}
83
// ChannelMetric defines the info channelz provides for a specific Channel, which
// includes ChannelInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ChannelMetric struct {
	// ID is the channelz id of this channel.
	ID int64
	// RefName is the human readable reference string of this channel.
	RefName string
	// ChannelData contains the channel's internal metrics, as reported by the
	// channel through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested-channel children of this channel as a map
	// from nested channel channelz id to the corresponding reference string.
	NestedChans map[int64]string
	// SubChans tracks the subchannel children of this channel as a map from
	// subchannel channelz id to the corresponding reference string.
	SubChans map[int64]string
	// Sockets tracks the socket children of this channel as a map from socket
	// channelz id to the corresponding reference string.
	// Note: the current grpc implementation doesn't allow a channel to have
	// sockets directly, therefore this field is unused.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
109
// SubChannelMetric defines the info channelz provides for a specific SubChannel,
// which includes ChannelInternalMetric and channelz-specific data, such as
// channelz id, child list, etc.
type SubChannelMetric struct {
	// ID is the channelz id of this subchannel.
	ID int64
	// RefName is the human readable reference string of this subchannel.
	RefName string
	// ChannelData contains the subchannel's internal metrics, as reported by the
	// subchannel through ChannelzMetric().
	ChannelData *ChannelInternalMetric
	// NestedChans tracks the nested-channel children of this subchannel as a map
	// from nested channel channelz id to the corresponding reference string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// nested channels as children, therefore this field is unused.
	NestedChans map[int64]string
	// SubChans tracks the subchannel children of this subchannel as a map from
	// subchannel channelz id to the corresponding reference string.
	// Note: the current grpc implementation doesn't allow a subchannel to have
	// subchannels as children, therefore this field is unused.
	SubChans map[int64]string
	// Sockets tracks the socket children of this subchannel as a map from socket
	// channelz id to the corresponding reference string.
	Sockets map[int64]string
	// Trace contains the most recent traced events.
	Trace *ChannelTrace
}
137
// ChannelInternalMetric defines the struct that the implementor of the Channel
// interface should return from ChannelzMetric().
type ChannelInternalMetric struct {
	// State is the current connectivity state of the channel.
	State connectivity.State
	// Target is the target this channel originally tried to connect to. May be
	// absent.
	Target string
	// CallsStarted is the number of calls started on the channel.
	CallsStarted int64
	// CallsSucceeded is the number of calls that have completed with an OK status.
	CallsSucceeded int64
	// CallsFailed is the number of calls that have completed with a non-OK status.
	CallsFailed int64
	// LastCallStartedTimestamp is the last time a call was started on the channel.
	LastCallStartedTimestamp time.Time
}
154
// ChannelTrace stores traced events on a channel/subchannel and related info.
type ChannelTrace struct {
	// EventNum is the number of events that ever got traced (i.e. including those
	// that have since been deleted).
	EventNum int64
	// CreationTime is the creation time of the trace.
	CreationTime time.Time
	// Events stores the most recent trace events (up to $maxTraceEntry; a newer
	// event overwrites the oldest one).
	Events []*TraceEvent
}
165
// TraceEvent represents a single trace event.
type TraceEvent struct {
	// Desc is a simple description of the trace event.
	Desc string
	// Severity states the severity of this trace event.
	Severity Severity
	// Timestamp is the event time.
	Timestamp time.Time
	// RefID is the id of the entity that gets referenced in the event. RefID is 0
	// if no other entity is involved in this event.
	// e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside [])
	RefID int64
	// RefName is the reference name for the entity that gets referenced in the event.
	RefName string
	// RefType indicates the referenced entity type, i.e. Channel or SubChannel.
	RefType RefChannelType
}
183
// Channel is the interface that should be satisfied in order to be tracked by
// channelz as Channel or SubChannel.
type Channel interface {
	// ChannelzMetric returns the implementor's internal metrics.
	ChannelzMetric() *ChannelInternalMetric
}
189
190type dummyChannel struct{}
191
192func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric {
193	return &ChannelInternalMetric{}
194}
195
// channel is the channelz database entry for a channel.
//
// Fields:
//   - refName: reference string returned by getRefName.
//   - c: the tracked Channel; replaced by a dummyChannel in deleteSelfFromMap
//     when the entry has to outlive the grpc object.
//   - closeCalled: set by triggerDelete; deletion only proceeds once it is true.
//   - nestedChans, subChans: children keyed by channelz id, valued by reference
//     string; populated by addChild.
//   - id: channelz id of this channel.
//   - pid: channelz id of the parent; 0 for a top-level channel.
//   - cm: the channelMap this entry looks its parent up in and is deleted from.
//   - trace: the trace events recorded for this channel.
type channel struct {
	refName     string
	c           Channel
	closeCalled bool
	nestedChans map[int64]string
	subChans    map[int64]string
	id          int64
	pid         int64
	cm          *channelMap
	trace       *channelTrace
	// traceRefCount is the number of trace events that reference this channel.
	// Non-zero traceRefCount means the trace of this channel cannot be deleted.
	// It is read and written through sync/atomic.
	traceRefCount int32
}
210
211func (c *channel) addChild(id int64, e entry) {
212	switch v := e.(type) {
213	case *subChannel:
214		c.subChans[id] = v.refName
215	case *channel:
216		c.nestedChans[id] = v.refName
217	default:
218		grpclog.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e)
219	}
220}
221
222func (c *channel) deleteChild(id int64) {
223	delete(c.subChans, id)
224	delete(c.nestedChans, id)
225	c.deleteSelfIfReady()
226}
227
228func (c *channel) triggerDelete() {
229	c.closeCalled = true
230	c.deleteSelfIfReady()
231}
232
233func (c *channel) getParentID() int64 {
234	return c.pid
235}
236
237// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means
238// deleting the channel reference from its parent's child list.
239//
240// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the
241// corresponding grpc object has been invoked, and the channel does not have any children left.
242//
243// The returned boolean value indicates whether the channel has been successfully deleted from tree.
244func (c *channel) deleteSelfFromTree() (deleted bool) {
245	if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 {
246		return false
247	}
248	// not top channel
249	if c.pid != 0 {
250		c.cm.findEntry(c.pid).deleteChild(c.id)
251	}
252	return true
253}
254
255// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means
256// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the
257// channel, and its memory will be garbage collected.
258//
259// The trace reference count of the channel must be 0 in order to be deleted from the map. This is
260// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
261// the trace of the referenced entity must not be deleted. In order to release the resource allocated
262// by grpc, the reference to the grpc object is reset to a dummy object.
263//
264// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
265//
266// It returns a bool to indicate whether the channel can be safely deleted from map.
267func (c *channel) deleteSelfFromMap() (delete bool) {
268	if c.getTraceRefCount() != 0 {
269		c.c = &dummyChannel{}
270		return false
271	}
272	return true
273}
274
275// deleteSelfIfReady tries to delete the channel itself from the channelz database.
276// The delete process includes two steps:
277// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its
278//    parent's child list.
279// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id
280//    will return entry not found error.
281func (c *channel) deleteSelfIfReady() {
282	if !c.deleteSelfFromTree() {
283		return
284	}
285	if !c.deleteSelfFromMap() {
286		return
287	}
288	c.cm.deleteEntry(c.id)
289	c.trace.clear()
290}
291
292func (c *channel) getChannelTrace() *channelTrace {
293	return c.trace
294}
295
296func (c *channel) incrTraceRefCount() {
297	atomic.AddInt32(&c.traceRefCount, 1)
298}
299
300func (c *channel) decrTraceRefCount() {
301	atomic.AddInt32(&c.traceRefCount, -1)
302}
303
304func (c *channel) getTraceRefCount() int {
305	i := atomic.LoadInt32(&c.traceRefCount)
306	return int(i)
307}
308
309func (c *channel) getRefName() string {
310	return c.refName
311}
312
// subChannel is the channelz database entry for a subchannel.
//
// Fields:
//   - refName: reference string returned by getRefName.
//   - c: the tracked Channel; replaced by a dummyChannel in deleteSelfFromMap
//     when the entry has to outlive the grpc object.
//   - closeCalled: set by triggerDelete; deletion only proceeds once it is true.
//   - sockets: socket children keyed by channelz id, valued by reference string;
//     populated by addChild.
//   - id: channelz id of this subchannel.
//   - pid: channelz id of the parent entry.
//   - cm: the channelMap this entry looks its parent up in and is deleted from.
//   - trace: the trace events recorded for this subchannel.
//   - traceRefCount: number of trace events referencing this subchannel; read
//     and written through sync/atomic.
type subChannel struct {
	refName       string
	c             Channel
	closeCalled   bool
	sockets       map[int64]string
	id            int64
	pid           int64
	cm            *channelMap
	trace         *channelTrace
	traceRefCount int32
}
324
325func (sc *subChannel) addChild(id int64, e entry) {
326	if v, ok := e.(*normalSocket); ok {
327		sc.sockets[id] = v.refName
328	} else {
329		grpclog.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e)
330	}
331}
332
333func (sc *subChannel) deleteChild(id int64) {
334	delete(sc.sockets, id)
335	sc.deleteSelfIfReady()
336}
337
338func (sc *subChannel) triggerDelete() {
339	sc.closeCalled = true
340	sc.deleteSelfIfReady()
341}
342
343func (sc *subChannel) getParentID() int64 {
344	return sc.pid
345}
346
347// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which
348// means deleting the subchannel reference from its parent's child list.
349//
350// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of
351// the corresponding grpc object has been invoked, and the subchannel does not have any children left.
352//
353// The returned boolean value indicates whether the channel has been successfully deleted from tree.
354func (sc *subChannel) deleteSelfFromTree() (deleted bool) {
355	if !sc.closeCalled || len(sc.sockets) != 0 {
356		return false
357	}
358	sc.cm.findEntry(sc.pid).deleteChild(sc.id)
359	return true
360}
361
362// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means
363// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query
364// the subchannel, and its memory will be garbage collected.
365//
366// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is
367// specified in the channel tracing gRFC that as long as some other trace has reference to an entity,
368// the trace of the referenced entity must not be deleted. In order to release the resource allocated
369// by grpc, the reference to the grpc object is reset to a dummy object.
370//
371// deleteSelfFromMap must be called after deleteSelfFromTree returns true.
372//
373// It returns a bool to indicate whether the channel can be safely deleted from map.
374func (sc *subChannel) deleteSelfFromMap() (delete bool) {
375	if sc.getTraceRefCount() != 0 {
376		// free the grpc struct (i.e. addrConn)
377		sc.c = &dummyChannel{}
378		return false
379	}
380	return true
381}
382
383// deleteSelfIfReady tries to delete the subchannel itself from the channelz database.
384// The delete process includes two steps:
385// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from
386//    its parent's child list.
387// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup
388//    by id will return entry not found error.
389func (sc *subChannel) deleteSelfIfReady() {
390	if !sc.deleteSelfFromTree() {
391		return
392	}
393	if !sc.deleteSelfFromMap() {
394		return
395	}
396	sc.cm.deleteEntry(sc.id)
397	sc.trace.clear()
398}
399
400func (sc *subChannel) getChannelTrace() *channelTrace {
401	return sc.trace
402}
403
404func (sc *subChannel) incrTraceRefCount() {
405	atomic.AddInt32(&sc.traceRefCount, 1)
406}
407
408func (sc *subChannel) decrTraceRefCount() {
409	atomic.AddInt32(&sc.traceRefCount, -1)
410}
411
412func (sc *subChannel) getTraceRefCount() int {
413	i := atomic.LoadInt32(&sc.traceRefCount)
414	return int(i)
415}
416
417func (sc *subChannel) getRefName() string {
418	return sc.refName
419}
420
// SocketMetric defines the info channelz provides for a specific Socket, which
// includes SocketInternalMetric and channelz-specific data, such as channelz id, etc.
type SocketMetric struct {
	// ID is the channelz id of this socket.
	ID int64
	// RefName is the human readable reference string of this socket.
	RefName string
	// SocketData contains the socket's internal metrics, as reported by the
	// socket through ChannelzMetric().
	SocketData *SocketInternalMetric
}
432
// SocketInternalMetric defines the struct that the implementor of the Socket
// interface should return from ChannelzMetric().
type SocketInternalMetric struct {
	// The number of streams that have been started.
	StreamsStarted int64
	// The number of streams that have ended successfully:
	// On client side, receiving frame with eos bit set.
	// On server side, sending frame with eos bit set.
	StreamsSucceeded int64
	// The number of streams that have ended unsuccessfully:
	// On client side, termination without receiving frame with eos bit set.
	// On server side, termination without sending frame with eos bit set.
	StreamsFailed int64
	// The number of messages successfully sent on this socket, followed by
	// (presumably, going by its name) the number of messages received on it.
	MessagesSent     int64
	MessagesReceived int64
	// The number of keep alives sent.  This is typically implemented with HTTP/2
	// ping messages.
	KeepAlivesSent int64
	// The last time a stream was created by this endpoint.  Usually unset for
	// servers.
	LastLocalStreamCreatedTimestamp time.Time
	// The last time a stream was created by the remote endpoint.  Usually unset
	// for clients.
	LastRemoteStreamCreatedTimestamp time.Time
	// The last time a message was sent by this endpoint.
	LastMessageSentTimestamp time.Time
	// The last time a message was received by this endpoint.
	LastMessageReceivedTimestamp time.Time
	// The amount of window, granted to the local endpoint by the remote endpoint.
	// This may be slightly out of date due to network latency.  This does NOT
	// include stream level or TCP level flow control info.
	LocalFlowControlWindow int64
	// The amount of window, granted to the remote endpoint by the local endpoint.
	// This may be slightly out of date due to network latency.  This does NOT
	// include stream level or TCP level flow control info.
	RemoteFlowControlWindow int64
	// The locally bound address.
	LocalAddr net.Addr
	// The remote bound address.  May be absent.
	RemoteAddr net.Addr
	// RemoteName is optional and represents the name of the remote endpoint, if
	// different than the original target name.
	// NOTE(review): SocketOptions and Security carry no description in this
	// file; presumably they hold socket option data and the connection's
	// security information respectively — confirm against their type definitions.
	RemoteName    string
	SocketOptions *SocketOptionData
	Security      credentials.ChannelzSecurityValue
}
480
// Socket is the interface that should be satisfied in order to be tracked by
// channelz as Socket.
type Socket interface {
	// ChannelzMetric returns the implementor's internal metrics.
	ChannelzMetric() *SocketInternalMetric
}
486
// listenSocket is the channelz database entry for a listen socket. It is a leaf:
// it accepts no children.
//
// Fields:
//   - refName: reference string of this socket.
//   - s: the tracked Socket.
//   - id: channelz id of this socket.
//   - pid: channelz id of the parent entry.
//   - cm: the channelMap this entry looks its parent up in and is deleted from.
type listenSocket struct {
	refName string
	s       Socket
	id      int64
	pid     int64
	cm      *channelMap
}
494
495func (ls *listenSocket) addChild(id int64, e entry) {
496	grpclog.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e)
497}
498
499func (ls *listenSocket) deleteChild(id int64) {
500	grpclog.Errorf("cannot delete a child (id = %d) from a listen socket", id)
501}
502
503func (ls *listenSocket) triggerDelete() {
504	ls.cm.deleteEntry(ls.id)
505	ls.cm.findEntry(ls.pid).deleteChild(ls.id)
506}
507
508func (ls *listenSocket) deleteSelfIfReady() {
509	grpclog.Errorf("cannot call deleteSelfIfReady on a listen socket")
510}
511
512func (ls *listenSocket) getParentID() int64 {
513	return ls.pid
514}
515
// normalSocket is the channelz database entry for a normal (non-listen) socket.
// It is a leaf: it accepts no children.
//
// Fields:
//   - refName: reference string of this socket.
//   - s: the tracked Socket.
//   - id: channelz id of this socket.
//   - pid: channelz id of the parent entry.
//   - cm: the channelMap this entry looks its parent up in and is deleted from.
type normalSocket struct {
	refName string
	s       Socket
	id      int64
	pid     int64
	cm      *channelMap
}
523
524func (ns *normalSocket) addChild(id int64, e entry) {
525	grpclog.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e)
526}
527
528func (ns *normalSocket) deleteChild(id int64) {
529	grpclog.Errorf("cannot delete a child (id = %d) from a normal socket", id)
530}
531
532func (ns *normalSocket) triggerDelete() {
533	ns.cm.deleteEntry(ns.id)
534	ns.cm.findEntry(ns.pid).deleteChild(ns.id)
535}
536
537func (ns *normalSocket) deleteSelfIfReady() {
538	grpclog.Errorf("cannot call deleteSelfIfReady on a normal socket")
539}
540
541func (ns *normalSocket) getParentID() int64 {
542	return ns.pid
543}
544
// ServerMetric defines the info channelz provides for a specific Server, which
// includes ServerInternalMetric and channelz-specific data, such as channelz id,
// child list, etc.
type ServerMetric struct {
	// ID is the channelz id of this server.
	ID int64
	// RefName is the human readable reference string of this server.
	RefName string
	// ServerData contains the server's internal metrics, as reported by the
	// server through ChannelzMetric().
	ServerData *ServerInternalMetric
	// ListenSockets tracks the listener socket children of this server as a map
	// from socket channelz id to the corresponding reference string.
	ListenSockets map[int64]string
}
560
// ServerInternalMetric defines the struct that the implementor of the Server
// interface should return from ChannelzMetric().
type ServerInternalMetric struct {
	// CallsStarted is the number of incoming calls started on the server.
	CallsStarted int64
	// CallsSucceeded is the number of incoming calls that have completed with an
	// OK status.
	CallsSucceeded int64
	// CallsFailed is the number of incoming calls that have completed with a
	// non-OK status.
	CallsFailed int64
	// LastCallStartedTimestamp is the last time a call was started on the server.
	LastCallStartedTimestamp time.Time
}
573
// Server is the interface to be satisfied in order to be tracked by channelz as
// Server.
type Server interface {
	// ChannelzMetric returns the implementor's internal metrics.
	ChannelzMetric() *ServerInternalMetric
}
579
// server is the channelz database entry for a server. It has no parent
// (getParentID always returns 0).
//
// Fields:
//   - refName: reference string of this server.
//   - s: the tracked Server.
//   - closeCalled: set by triggerDelete; deletion only proceeds once it is true.
//   - sockets, listenSockets: children keyed by channelz id, valued by reference
//     string; populated by addChild.
//   - id: channelz id of this server.
//   - cm: the channelMap this entry is deleted from.
type server struct {
	refName       string
	s             Server
	closeCalled   bool
	sockets       map[int64]string
	listenSockets map[int64]string
	id            int64
	cm            *channelMap
}
589
590func (s *server) addChild(id int64, e entry) {
591	switch v := e.(type) {
592	case *normalSocket:
593		s.sockets[id] = v.refName
594	case *listenSocket:
595		s.listenSockets[id] = v.refName
596	default:
597		grpclog.Errorf("cannot add a child (id = %d) of type %T to a server", id, e)
598	}
599}
600
601func (s *server) deleteChild(id int64) {
602	delete(s.sockets, id)
603	delete(s.listenSockets, id)
604	s.deleteSelfIfReady()
605}
606
607func (s *server) triggerDelete() {
608	s.closeCalled = true
609	s.deleteSelfIfReady()
610}
611
612func (s *server) deleteSelfIfReady() {
613	if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 {
614		return
615	}
616	s.cm.deleteEntry(s.id)
617}
618
619func (s *server) getParentID() int64 {
620	return 0
621}
622
// tracedChannel is the set of trace-related operations shared by the entries that
// carry a trace; in this file both channel and subChannel provide these methods.
type tracedChannel interface {
	// getChannelTrace returns the trace attached to the entity.
	getChannelTrace() *channelTrace
	// incrTraceRefCount adds one to the number of trace events referencing the entity.
	incrTraceRefCount()
	// decrTraceRefCount subtracts one from the number of trace events referencing
	// the entity.
	decrTraceRefCount()
	// getRefName returns the reference string of the entity.
	getRefName() string
}
629
// channelTrace is the internal, mutex-protected store of trace events for one
// channel or subchannel.
//
// Fields:
//   - cm: the channelMap used to decrement the trace reference count of entities
//     referenced by events that are dropped.
//   - createdTime: reported as CreationTime by dumpData.
//   - eventCount: incremented once per append; reported as EventNum by dumpData.
//   - mu: held by append, clear and dumpData while they access events and
//     eventCount.
//   - events: the retained events, oldest first.
type channelTrace struct {
	cm          *channelMap
	createdTime time.Time
	eventCount  int64
	mu          sync.Mutex
	events      []*TraceEvent
}
637
638func (c *channelTrace) append(e *TraceEvent) {
639	c.mu.Lock()
640	if len(c.events) == getMaxTraceEntry() {
641		del := c.events[0]
642		c.events = c.events[1:]
643		if del.RefID != 0 {
644			// start recursive cleanup in a goroutine to not block the call originated from grpc.
645			go func() {
646				// need to acquire c.cm.mu lock to call the unlocked attemptCleanup func.
647				c.cm.mu.Lock()
648				c.cm.decrTraceRefCount(del.RefID)
649				c.cm.mu.Unlock()
650			}()
651		}
652	}
653	e.Timestamp = time.Now()
654	c.events = append(c.events, e)
655	c.eventCount++
656	c.mu.Unlock()
657}
658
659func (c *channelTrace) clear() {
660	c.mu.Lock()
661	for _, e := range c.events {
662		if e.RefID != 0 {
663			// caller should have already held the c.cm.mu lock.
664			c.cm.decrTraceRefCount(e.RefID)
665		}
666	}
667	c.mu.Unlock()
668}
669
// Severity is the severity level of a trace event.
// The canonical enumeration of all valid values is here:
// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126.
type Severity int

// Severity levels, numbered from 0 in declaration order.
const (
	// CtUNKNOWN indicates unknown severity of a trace event.
	CtUNKNOWN Severity = iota
	// CtINFO indicates info level severity of a trace event.
	CtINFO
	// CtWarning indicates warning level severity of a trace event.
	CtWarning
	// CtError indicates error level severity of a trace event.
	CtError
)
685
// RefChannelType is the type of the entity being referenced in a trace event.
type RefChannelType int

// Referenced entity types, numbered from 0 in declaration order.
const (
	// RefChannel indicates the referenced entity is a Channel.
	RefChannel RefChannelType = iota
	// RefSubChannel indicates the referenced entity is a SubChannel.
	RefSubChannel
)
695
696func (c *channelTrace) dumpData() *ChannelTrace {
697	c.mu.Lock()
698	ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime}
699	ct.Events = c.events[:len(c.events)]
700	c.mu.Unlock()
701	return ct
702}
703