1/* 2 * 3 * Copyright 2018 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19package channelz 20 21import ( 22 "net" 23 "sync" 24 "sync/atomic" 25 "time" 26 27 "google.golang.org/grpc/connectivity" 28 "google.golang.org/grpc/credentials" 29 "google.golang.org/grpc/grpclog" 30) 31 32// entry represents a node in the channelz database. 33type entry interface { 34 // addChild adds a child e, whose channelz id is id to child list 35 addChild(id int64, e entry) 36 // deleteChild deletes a child with channelz id to be id from child list 37 deleteChild(id int64) 38 // triggerDelete tries to delete self from channelz database. However, if child 39 // list is not empty, then deletion from the database is on hold until the last 40 // child is deleted from database. 41 triggerDelete() 42 // deleteSelfIfReady check whether triggerDelete() has been called before, and whether child 43 // list is now empty. If both conditions are met, then delete self from database. 44 deleteSelfIfReady() 45 // getParentID returns parent ID of the entry. 0 value parent ID means no parent. 46 getParentID() int64 47} 48 49// dummyEntry is a fake entry to handle entry not found case. 50type dummyEntry struct { 51 idNotFound int64 52} 53 54func (d *dummyEntry) addChild(id int64, e entry) { 55 // Note: It is possible for a normal program to reach here under race condition. 56 // For example, there could be a race between ClientConn.Close() info being propagated 57 // to addrConn and http2Client. ClientConn.Close() cancel the context and result 58 // in http2Client to error. The error info is then caught by transport monitor 59 // and before addrConn.tearDown() is called in side ClientConn.Close(). Therefore, 60 // the addrConn will create a new transport. And when registering the new transport in 61 // channelz, its parent addrConn could have already been torn down and deleted 62 // from channelz tracking, and thus reach the code here. 63 grpclog.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound) 64} 65 66func (d *dummyEntry) deleteChild(id int64) { 67 // It is possible for a normal program to reach here under race condition. 68 // Refer to the example described in addChild(). 69 grpclog.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound) 70} 71 72func (d *dummyEntry) triggerDelete() { 73 grpclog.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound) 74} 75 76func (*dummyEntry) deleteSelfIfReady() { 77 // code should not reach here. deleteSelfIfReady is always called on an existing entry. 78} 79 80func (*dummyEntry) getParentID() int64 { 81 return 0 82} 83 84// ChannelMetric defines the info channelz provides for a specific Channel, which 85// includes ChannelInternalMetric and channelz-specific data, such as channelz id, 86// child list, etc. 87type ChannelMetric struct { 88 // ID is the channelz id of this channel. 89 ID int64 90 // RefName is the human readable reference string of this channel. 91 RefName string 92 // ChannelData contains channel internal metric reported by the channel through 93 // ChannelzMetric(). 94 ChannelData *ChannelInternalMetric 95 // NestedChans tracks the nested channel type children of this channel in the format of 96 // a map from nested channel channelz id to corresponding reference string. 97 NestedChans map[int64]string 98 // SubChans tracks the subchannel type children of this channel in the format of a 99 // map from subchannel channelz id to corresponding reference string. 100 SubChans map[int64]string 101 // Sockets tracks the socket type children of this channel in the format of a map 102 // from socket channelz id to corresponding reference string. 103 // Note current grpc implementation doesn't allow channel having sockets directly, 104 // therefore, this is field is unused. 105 Sockets map[int64]string 106 // Trace contains the most recent traced events. 107 Trace *ChannelTrace 108} 109 110// SubChannelMetric defines the info channelz provides for a specific SubChannel, 111// which includes ChannelInternalMetric and channelz-specific data, such as 112// channelz id, child list, etc. 113type SubChannelMetric struct { 114 // ID is the channelz id of this subchannel. 115 ID int64 116 // RefName is the human readable reference string of this subchannel. 117 RefName string 118 // ChannelData contains subchannel internal metric reported by the subchannel 119 // through ChannelzMetric(). 120 ChannelData *ChannelInternalMetric 121 // NestedChans tracks the nested channel type children of this subchannel in the format of 122 // a map from nested channel channelz id to corresponding reference string. 123 // Note current grpc implementation doesn't allow subchannel to have nested channels 124 // as children, therefore, this field is unused. 125 NestedChans map[int64]string 126 // SubChans tracks the subchannel type children of this subchannel in the format of a 127 // map from subchannel channelz id to corresponding reference string. 128 // Note current grpc implementation doesn't allow subchannel to have subchannels 129 // as children, therefore, this field is unused. 130 SubChans map[int64]string 131 // Sockets tracks the socket type children of this subchannel in the format of a map 132 // from socket channelz id to corresponding reference string. 133 Sockets map[int64]string 134 // Trace contains the most recent traced events. 135 Trace *ChannelTrace 136} 137 138// ChannelInternalMetric defines the struct that the implementor of Channel interface 139// should return from ChannelzMetric(). 140type ChannelInternalMetric struct { 141 // current connectivity state of the channel. 142 State connectivity.State 143 // The target this channel originally tried to connect to. May be absent 144 Target string 145 // The number of calls started on the channel. 146 CallsStarted int64 147 // The number of calls that have completed with an OK status. 148 CallsSucceeded int64 149 // The number of calls that have a completed with a non-OK status. 150 CallsFailed int64 151 // The last time a call was started on the channel. 152 LastCallStartedTimestamp time.Time 153} 154 155// ChannelTrace stores traced events on a channel/subchannel and related info. 156type ChannelTrace struct { 157 // EventNum is the number of events that ever got traced (i.e. including those that have been deleted) 158 EventNum int64 159 // CreationTime is the creation time of the trace. 160 CreationTime time.Time 161 // Events stores the most recent trace events (up to $maxTraceEntry, newer event will overwrite the 162 // oldest one) 163 Events []*TraceEvent 164} 165 166// TraceEvent represent a single trace event 167type TraceEvent struct { 168 // Desc is a simple description of the trace event. 169 Desc string 170 // Severity states the severity of this trace event. 171 Severity Severity 172 // Timestamp is the event time. 173 Timestamp time.Time 174 // RefID is the id of the entity that gets referenced in the event. RefID is 0 if no other entity is 175 // involved in this event. 176 // e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside []) 177 RefID int64 178 // RefName is the reference name for the entity that gets referenced in the event. 179 RefName string 180 // RefType indicates the referenced entity type, i.e Channel or SubChannel. 181 RefType RefChannelType 182} 183 184// Channel is the interface that should be satisfied in order to be tracked by 185// channelz as Channel or SubChannel. 186type Channel interface { 187 ChannelzMetric() *ChannelInternalMetric 188} 189 190type dummyChannel struct{} 191 192func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric { 193 return &ChannelInternalMetric{} 194} 195 196type channel struct { 197 refName string 198 c Channel 199 closeCalled bool 200 nestedChans map[int64]string 201 subChans map[int64]string 202 id int64 203 pid int64 204 cm *channelMap 205 trace *channelTrace 206 // traceRefCount is the number of trace events that reference this channel. 207 // Non-zero traceRefCount means the trace of this channel cannot be deleted. 208 traceRefCount int32 209} 210 211func (c *channel) addChild(id int64, e entry) { 212 switch v := e.(type) { 213 case *subChannel: 214 c.subChans[id] = v.refName 215 case *channel: 216 c.nestedChans[id] = v.refName 217 default: 218 grpclog.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e) 219 } 220} 221 222func (c *channel) deleteChild(id int64) { 223 delete(c.subChans, id) 224 delete(c.nestedChans, id) 225 c.deleteSelfIfReady() 226} 227 228func (c *channel) triggerDelete() { 229 c.closeCalled = true 230 c.deleteSelfIfReady() 231} 232 233func (c *channel) getParentID() int64 { 234 return c.pid 235} 236 237// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means 238// deleting the channel reference from its parent's child list. 239// 240// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the 241// corresponding grpc object has been invoked, and the channel does not have any children left. 242// 243// The returned boolean value indicates whether the channel has been successfully deleted from tree. 244func (c *channel) deleteSelfFromTree() (deleted bool) { 245 if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 { 246 return false 247 } 248 // not top channel 249 if c.pid != 0 { 250 c.cm.findEntry(c.pid).deleteChild(c.id) 251 } 252 return true 253} 254 255// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means 256// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the 257// channel, and its memory will be garbage collected. 258// 259// The trace reference count of the channel must be 0 in order to be deleted from the map. This is 260// specified in the channel tracing gRFC that as long as some other trace has reference to an entity, 261// the trace of the referenced entity must not be deleted. In order to release the resource allocated 262// by grpc, the reference to the grpc object is reset to a dummy object. 263// 264// deleteSelfFromMap must be called after deleteSelfFromTree returns true. 265// 266// It returns a bool to indicate whether the channel can be safely deleted from map. 267func (c *channel) deleteSelfFromMap() (delete bool) { 268 if c.getTraceRefCount() != 0 { 269 c.c = &dummyChannel{} 270 return false 271 } 272 return true 273} 274 275// deleteSelfIfReady tries to delete the channel itself from the channelz database. 276// The delete process includes two steps: 277// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its 278// parent's child list. 279// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id 280// will return entry not found error. 281func (c *channel) deleteSelfIfReady() { 282 if !c.deleteSelfFromTree() { 283 return 284 } 285 if !c.deleteSelfFromMap() { 286 return 287 } 288 c.cm.deleteEntry(c.id) 289 c.trace.clear() 290} 291 292func (c *channel) getChannelTrace() *channelTrace { 293 return c.trace 294} 295 296func (c *channel) incrTraceRefCount() { 297 atomic.AddInt32(&c.traceRefCount, 1) 298} 299 300func (c *channel) decrTraceRefCount() { 301 atomic.AddInt32(&c.traceRefCount, -1) 302} 303 304func (c *channel) getTraceRefCount() int { 305 i := atomic.LoadInt32(&c.traceRefCount) 306 return int(i) 307} 308 309func (c *channel) getRefName() string { 310 return c.refName 311} 312 313type subChannel struct { 314 refName string 315 c Channel 316 closeCalled bool 317 sockets map[int64]string 318 id int64 319 pid int64 320 cm *channelMap 321 trace *channelTrace 322 traceRefCount int32 323} 324 325func (sc *subChannel) addChild(id int64, e entry) { 326 if v, ok := e.(*normalSocket); ok { 327 sc.sockets[id] = v.refName 328 } else { 329 grpclog.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e) 330 } 331} 332 333func (sc *subChannel) deleteChild(id int64) { 334 delete(sc.sockets, id) 335 sc.deleteSelfIfReady() 336} 337 338func (sc *subChannel) triggerDelete() { 339 sc.closeCalled = true 340 sc.deleteSelfIfReady() 341} 342 343func (sc *subChannel) getParentID() int64 { 344 return sc.pid 345} 346 347// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which 348// means deleting the subchannel reference from its parent's child list. 349// 350// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of 351// the corresponding grpc object has been invoked, and the subchannel does not have any children left. 352// 353// The returned boolean value indicates whether the channel has been successfully deleted from tree. 354func (sc *subChannel) deleteSelfFromTree() (deleted bool) { 355 if !sc.closeCalled || len(sc.sockets) != 0 { 356 return false 357 } 358 sc.cm.findEntry(sc.pid).deleteChild(sc.id) 359 return true 360} 361 362// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means 363// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query 364// the subchannel, and its memory will be garbage collected. 365// 366// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is 367// specified in the channel tracing gRFC that as long as some other trace has reference to an entity, 368// the trace of the referenced entity must not be deleted. In order to release the resource allocated 369// by grpc, the reference to the grpc object is reset to a dummy object. 370// 371// deleteSelfFromMap must be called after deleteSelfFromTree returns true. 372// 373// It returns a bool to indicate whether the channel can be safely deleted from map. 374func (sc *subChannel) deleteSelfFromMap() (delete bool) { 375 if sc.getTraceRefCount() != 0 { 376 // free the grpc struct (i.e. addrConn) 377 sc.c = &dummyChannel{} 378 return false 379 } 380 return true 381} 382 383// deleteSelfIfReady tries to delete the subchannel itself from the channelz database. 384// The delete process includes two steps: 385// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from 386// its parent's child list. 387// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup 388// by id will return entry not found error. 389func (sc *subChannel) deleteSelfIfReady() { 390 if !sc.deleteSelfFromTree() { 391 return 392 } 393 if !sc.deleteSelfFromMap() { 394 return 395 } 396 sc.cm.deleteEntry(sc.id) 397 sc.trace.clear() 398} 399 400func (sc *subChannel) getChannelTrace() *channelTrace { 401 return sc.trace 402} 403 404func (sc *subChannel) incrTraceRefCount() { 405 atomic.AddInt32(&sc.traceRefCount, 1) 406} 407 408func (sc *subChannel) decrTraceRefCount() { 409 atomic.AddInt32(&sc.traceRefCount, -1) 410} 411 412func (sc *subChannel) getTraceRefCount() int { 413 i := atomic.LoadInt32(&sc.traceRefCount) 414 return int(i) 415} 416 417func (sc *subChannel) getRefName() string { 418 return sc.refName 419} 420 421// SocketMetric defines the info channelz provides for a specific Socket, which 422// includes SocketInternalMetric and channelz-specific data, such as channelz id, etc. 423type SocketMetric struct { 424 // ID is the channelz id of this socket. 425 ID int64 426 // RefName is the human readable reference string of this socket. 427 RefName string 428 // SocketData contains socket internal metric reported by the socket through 429 // ChannelzMetric(). 430 SocketData *SocketInternalMetric 431} 432 433// SocketInternalMetric defines the struct that the implementor of Socket interface 434// should return from ChannelzMetric(). 435type SocketInternalMetric struct { 436 // The number of streams that have been started. 437 StreamsStarted int64 438 // The number of streams that have ended successfully: 439 // On client side, receiving frame with eos bit set. 440 // On server side, sending frame with eos bit set. 441 StreamsSucceeded int64 442 // The number of streams that have ended unsuccessfully: 443 // On client side, termination without receiving frame with eos bit set. 444 // On server side, termination without sending frame with eos bit set. 445 StreamsFailed int64 446 // The number of messages successfully sent on this socket. 447 MessagesSent int64 448 MessagesReceived int64 449 // The number of keep alives sent. This is typically implemented with HTTP/2 450 // ping messages. 451 KeepAlivesSent int64 452 // The last time a stream was created by this endpoint. Usually unset for 453 // servers. 454 LastLocalStreamCreatedTimestamp time.Time 455 // The last time a stream was created by the remote endpoint. Usually unset 456 // for clients. 457 LastRemoteStreamCreatedTimestamp time.Time 458 // The last time a message was sent by this endpoint. 459 LastMessageSentTimestamp time.Time 460 // The last time a message was received by this endpoint. 461 LastMessageReceivedTimestamp time.Time 462 // The amount of window, granted to the local endpoint by the remote endpoint. 463 // This may be slightly out of date due to network latency. This does NOT 464 // include stream level or TCP level flow control info. 465 LocalFlowControlWindow int64 466 // The amount of window, granted to the remote endpoint by the local endpoint. 467 // This may be slightly out of date due to network latency. This does NOT 468 // include stream level or TCP level flow control info. 469 RemoteFlowControlWindow int64 470 // The locally bound address. 471 LocalAddr net.Addr 472 // The remote bound address. May be absent. 473 RemoteAddr net.Addr 474 // Optional, represents the name of the remote endpoint, if different than 475 // the original target name. 476 RemoteName string 477 SocketOptions *SocketOptionData 478 Security credentials.ChannelzSecurityValue 479} 480 481// Socket is the interface that should be satisfied in order to be tracked by 482// channelz as Socket. 483type Socket interface { 484 ChannelzMetric() *SocketInternalMetric 485} 486 487type listenSocket struct { 488 refName string 489 s Socket 490 id int64 491 pid int64 492 cm *channelMap 493} 494 495func (ls *listenSocket) addChild(id int64, e entry) { 496 grpclog.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e) 497} 498 499func (ls *listenSocket) deleteChild(id int64) { 500 grpclog.Errorf("cannot delete a child (id = %d) from a listen socket", id) 501} 502 503func (ls *listenSocket) triggerDelete() { 504 ls.cm.deleteEntry(ls.id) 505 ls.cm.findEntry(ls.pid).deleteChild(ls.id) 506} 507 508func (ls *listenSocket) deleteSelfIfReady() { 509 grpclog.Errorf("cannot call deleteSelfIfReady on a listen socket") 510} 511 512func (ls *listenSocket) getParentID() int64 { 513 return ls.pid 514} 515 516type normalSocket struct { 517 refName string 518 s Socket 519 id int64 520 pid int64 521 cm *channelMap 522} 523 524func (ns *normalSocket) addChild(id int64, e entry) { 525 grpclog.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e) 526} 527 528func (ns *normalSocket) deleteChild(id int64) { 529 grpclog.Errorf("cannot delete a child (id = %d) from a normal socket", id) 530} 531 532func (ns *normalSocket) triggerDelete() { 533 ns.cm.deleteEntry(ns.id) 534 ns.cm.findEntry(ns.pid).deleteChild(ns.id) 535} 536 537func (ns *normalSocket) deleteSelfIfReady() { 538 grpclog.Errorf("cannot call deleteSelfIfReady on a normal socket") 539} 540 541func (ns *normalSocket) getParentID() int64 { 542 return ns.pid 543} 544 545// ServerMetric defines the info channelz provides for a specific Server, which 546// includes ServerInternalMetric and channelz-specific data, such as channelz id, 547// child list, etc. 548type ServerMetric struct { 549 // ID is the channelz id of this server. 550 ID int64 551 // RefName is the human readable reference string of this server. 552 RefName string 553 // ServerData contains server internal metric reported by the server through 554 // ChannelzMetric(). 555 ServerData *ServerInternalMetric 556 // ListenSockets tracks the listener socket type children of this server in the 557 // format of a map from socket channelz id to corresponding reference string. 558 ListenSockets map[int64]string 559} 560 561// ServerInternalMetric defines the struct that the implementor of Server interface 562// should return from ChannelzMetric(). 563type ServerInternalMetric struct { 564 // The number of incoming calls started on the server. 565 CallsStarted int64 566 // The number of incoming calls that have completed with an OK status. 567 CallsSucceeded int64 568 // The number of incoming calls that have a completed with a non-OK status. 569 CallsFailed int64 570 // The last time a call was started on the server. 571 LastCallStartedTimestamp time.Time 572} 573 574// Server is the interface to be satisfied in order to be tracked by channelz as 575// Server. 576type Server interface { 577 ChannelzMetric() *ServerInternalMetric 578} 579 580type server struct { 581 refName string 582 s Server 583 closeCalled bool 584 sockets map[int64]string 585 listenSockets map[int64]string 586 id int64 587 cm *channelMap 588} 589 590func (s *server) addChild(id int64, e entry) { 591 switch v := e.(type) { 592 case *normalSocket: 593 s.sockets[id] = v.refName 594 case *listenSocket: 595 s.listenSockets[id] = v.refName 596 default: 597 grpclog.Errorf("cannot add a child (id = %d) of type %T to a server", id, e) 598 } 599} 600 601func (s *server) deleteChild(id int64) { 602 delete(s.sockets, id) 603 delete(s.listenSockets, id) 604 s.deleteSelfIfReady() 605} 606 607func (s *server) triggerDelete() { 608 s.closeCalled = true 609 s.deleteSelfIfReady() 610} 611 612func (s *server) deleteSelfIfReady() { 613 if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 { 614 return 615 } 616 s.cm.deleteEntry(s.id) 617} 618 619func (s *server) getParentID() int64 { 620 return 0 621} 622 623type tracedChannel interface { 624 getChannelTrace() *channelTrace 625 incrTraceRefCount() 626 decrTraceRefCount() 627 getRefName() string 628} 629 630type channelTrace struct { 631 cm *channelMap 632 createdTime time.Time 633 eventCount int64 634 mu sync.Mutex 635 events []*TraceEvent 636} 637 638func (c *channelTrace) append(e *TraceEvent) { 639 c.mu.Lock() 640 if len(c.events) == getMaxTraceEntry() { 641 del := c.events[0] 642 c.events = c.events[1:] 643 if del.RefID != 0 { 644 // start recursive cleanup in a goroutine to not block the call originated from grpc. 645 go func() { 646 // need to acquire c.cm.mu lock to call the unlocked attemptCleanup func. 647 c.cm.mu.Lock() 648 c.cm.decrTraceRefCount(del.RefID) 649 c.cm.mu.Unlock() 650 }() 651 } 652 } 653 e.Timestamp = time.Now() 654 c.events = append(c.events, e) 655 c.eventCount++ 656 c.mu.Unlock() 657} 658 659func (c *channelTrace) clear() { 660 c.mu.Lock() 661 for _, e := range c.events { 662 if e.RefID != 0 { 663 // caller should have already held the c.cm.mu lock. 664 c.cm.decrTraceRefCount(e.RefID) 665 } 666 } 667 c.mu.Unlock() 668} 669 670// Severity is the severity level of a trace event. 671// The canonical enumeration of all valid values is here: 672// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126. 673type Severity int 674 675const ( 676 // CtUNKNOWN indicates unknown severity of a trace event. 677 CtUNKNOWN Severity = iota 678 // CtINFO indicates info level severity of a trace event. 679 CtINFO 680 // CtWarning indicates warning level severity of a trace event. 681 CtWarning 682 // CtError indicates error level severity of a trace event. 683 CtError 684) 685 686// RefChannelType is the type of the entity being referenced in a trace event. 687type RefChannelType int 688 689const ( 690 // RefChannel indicates the referenced entity is a Channel. 691 RefChannel RefChannelType = iota 692 // RefSubChannel indicates the referenced entity is a SubChannel. 693 RefSubChannel 694) 695 696func (c *channelTrace) dumpData() *ChannelTrace { 697 c.mu.Lock() 698 ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime} 699 ct.Events = c.events[:len(c.events)] 700 c.mu.Unlock() 701 return ct 702} 703