1/* 2 * 3 * Copyright 2018 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19package channelz 20 21import ( 22 "net" 23 "sync" 24 "sync/atomic" 25 "time" 26 27 "google.golang.org/grpc/connectivity" 28 "google.golang.org/grpc/credentials" 29) 30 31// entry represents a node in the channelz database. 32type entry interface { 33 // addChild adds a child e, whose channelz id is id to child list 34 addChild(id int64, e entry) 35 // deleteChild deletes a child with channelz id to be id from child list 36 deleteChild(id int64) 37 // triggerDelete tries to delete self from channelz database. However, if child 38 // list is not empty, then deletion from the database is on hold until the last 39 // child is deleted from database. 40 triggerDelete() 41 // deleteSelfIfReady check whether triggerDelete() has been called before, and whether child 42 // list is now empty. If both conditions are met, then delete self from database. 43 deleteSelfIfReady() 44 // getParentID returns parent ID of the entry. 0 value parent ID means no parent. 45 getParentID() int64 46} 47 48// dummyEntry is a fake entry to handle entry not found case. 49type dummyEntry struct { 50 idNotFound int64 51} 52 53func (d *dummyEntry) addChild(id int64, e entry) { 54 // Note: It is possible for a normal program to reach here under race condition. 55 // For example, there could be a race between ClientConn.Close() info being propagated 56 // to addrConn and http2Client. ClientConn.Close() cancel the context and result 57 // in http2Client to error. The error info is then caught by transport monitor 58 // and before addrConn.tearDown() is called in side ClientConn.Close(). Therefore, 59 // the addrConn will create a new transport. And when registering the new transport in 60 // channelz, its parent addrConn could have already been torn down and deleted 61 // from channelz tracking, and thus reach the code here. 62 logger.Infof("attempt to add child of type %T with id %d to a parent (id=%d) that doesn't currently exist", e, id, d.idNotFound) 63} 64 65func (d *dummyEntry) deleteChild(id int64) { 66 // It is possible for a normal program to reach here under race condition. 67 // Refer to the example described in addChild(). 68 logger.Infof("attempt to delete child with id %d from a parent (id=%d) that doesn't currently exist", id, d.idNotFound) 69} 70 71func (d *dummyEntry) triggerDelete() { 72 logger.Warningf("attempt to delete an entry (id=%d) that doesn't currently exist", d.idNotFound) 73} 74 75func (*dummyEntry) deleteSelfIfReady() { 76 // code should not reach here. deleteSelfIfReady is always called on an existing entry. 77} 78 79func (*dummyEntry) getParentID() int64 { 80 return 0 81} 82 83// ChannelMetric defines the info channelz provides for a specific Channel, which 84// includes ChannelInternalMetric and channelz-specific data, such as channelz id, 85// child list, etc. 86type ChannelMetric struct { 87 // ID is the channelz id of this channel. 88 ID int64 89 // RefName is the human readable reference string of this channel. 90 RefName string 91 // ChannelData contains channel internal metric reported by the channel through 92 // ChannelzMetric(). 93 ChannelData *ChannelInternalMetric 94 // NestedChans tracks the nested channel type children of this channel in the format of 95 // a map from nested channel channelz id to corresponding reference string. 96 NestedChans map[int64]string 97 // SubChans tracks the subchannel type children of this channel in the format of a 98 // map from subchannel channelz id to corresponding reference string. 99 SubChans map[int64]string 100 // Sockets tracks the socket type children of this channel in the format of a map 101 // from socket channelz id to corresponding reference string. 102 // Note current grpc implementation doesn't allow channel having sockets directly, 103 // therefore, this is field is unused. 104 Sockets map[int64]string 105 // Trace contains the most recent traced events. 106 Trace *ChannelTrace 107} 108 109// SubChannelMetric defines the info channelz provides for a specific SubChannel, 110// which includes ChannelInternalMetric and channelz-specific data, such as 111// channelz id, child list, etc. 112type SubChannelMetric struct { 113 // ID is the channelz id of this subchannel. 114 ID int64 115 // RefName is the human readable reference string of this subchannel. 116 RefName string 117 // ChannelData contains subchannel internal metric reported by the subchannel 118 // through ChannelzMetric(). 119 ChannelData *ChannelInternalMetric 120 // NestedChans tracks the nested channel type children of this subchannel in the format of 121 // a map from nested channel channelz id to corresponding reference string. 122 // Note current grpc implementation doesn't allow subchannel to have nested channels 123 // as children, therefore, this field is unused. 124 NestedChans map[int64]string 125 // SubChans tracks the subchannel type children of this subchannel in the format of a 126 // map from subchannel channelz id to corresponding reference string. 127 // Note current grpc implementation doesn't allow subchannel to have subchannels 128 // as children, therefore, this field is unused. 129 SubChans map[int64]string 130 // Sockets tracks the socket type children of this subchannel in the format of a map 131 // from socket channelz id to corresponding reference string. 132 Sockets map[int64]string 133 // Trace contains the most recent traced events. 134 Trace *ChannelTrace 135} 136 137// ChannelInternalMetric defines the struct that the implementor of Channel interface 138// should return from ChannelzMetric(). 139type ChannelInternalMetric struct { 140 // current connectivity state of the channel. 141 State connectivity.State 142 // The target this channel originally tried to connect to. May be absent 143 Target string 144 // The number of calls started on the channel. 145 CallsStarted int64 146 // The number of calls that have completed with an OK status. 147 CallsSucceeded int64 148 // The number of calls that have a completed with a non-OK status. 149 CallsFailed int64 150 // The last time a call was started on the channel. 151 LastCallStartedTimestamp time.Time 152} 153 154// ChannelTrace stores traced events on a channel/subchannel and related info. 155type ChannelTrace struct { 156 // EventNum is the number of events that ever got traced (i.e. including those that have been deleted) 157 EventNum int64 158 // CreationTime is the creation time of the trace. 159 CreationTime time.Time 160 // Events stores the most recent trace events (up to $maxTraceEntry, newer event will overwrite the 161 // oldest one) 162 Events []*TraceEvent 163} 164 165// TraceEvent represent a single trace event 166type TraceEvent struct { 167 // Desc is a simple description of the trace event. 168 Desc string 169 // Severity states the severity of this trace event. 170 Severity Severity 171 // Timestamp is the event time. 172 Timestamp time.Time 173 // RefID is the id of the entity that gets referenced in the event. RefID is 0 if no other entity is 174 // involved in this event. 175 // e.g. SubChannel (id: 4[]) Created. --> RefID = 4, RefName = "" (inside []) 176 RefID int64 177 // RefName is the reference name for the entity that gets referenced in the event. 178 RefName string 179 // RefType indicates the referenced entity type, i.e Channel or SubChannel. 180 RefType RefChannelType 181} 182 183// Channel is the interface that should be satisfied in order to be tracked by 184// channelz as Channel or SubChannel. 185type Channel interface { 186 ChannelzMetric() *ChannelInternalMetric 187} 188 189type dummyChannel struct{} 190 191func (d *dummyChannel) ChannelzMetric() *ChannelInternalMetric { 192 return &ChannelInternalMetric{} 193} 194 195type channel struct { 196 refName string 197 c Channel 198 closeCalled bool 199 nestedChans map[int64]string 200 subChans map[int64]string 201 id int64 202 pid int64 203 cm *channelMap 204 trace *channelTrace 205 // traceRefCount is the number of trace events that reference this channel. 206 // Non-zero traceRefCount means the trace of this channel cannot be deleted. 207 traceRefCount int32 208} 209 210func (c *channel) addChild(id int64, e entry) { 211 switch v := e.(type) { 212 case *subChannel: 213 c.subChans[id] = v.refName 214 case *channel: 215 c.nestedChans[id] = v.refName 216 default: 217 logger.Errorf("cannot add a child (id = %d) of type %T to a channel", id, e) 218 } 219} 220 221func (c *channel) deleteChild(id int64) { 222 delete(c.subChans, id) 223 delete(c.nestedChans, id) 224 c.deleteSelfIfReady() 225} 226 227func (c *channel) triggerDelete() { 228 c.closeCalled = true 229 c.deleteSelfIfReady() 230} 231 232func (c *channel) getParentID() int64 { 233 return c.pid 234} 235 236// deleteSelfFromTree tries to delete the channel from the channelz entry relation tree, which means 237// deleting the channel reference from its parent's child list. 238// 239// In order for a channel to be deleted from the tree, it must meet the criteria that, removal of the 240// corresponding grpc object has been invoked, and the channel does not have any children left. 241// 242// The returned boolean value indicates whether the channel has been successfully deleted from tree. 243func (c *channel) deleteSelfFromTree() (deleted bool) { 244 if !c.closeCalled || len(c.subChans)+len(c.nestedChans) != 0 { 245 return false 246 } 247 // not top channel 248 if c.pid != 0 { 249 c.cm.findEntry(c.pid).deleteChild(c.id) 250 } 251 return true 252} 253 254// deleteSelfFromMap checks whether it is valid to delete the channel from the map, which means 255// deleting the channel from channelz's tracking entirely. Users can no longer use id to query the 256// channel, and its memory will be garbage collected. 257// 258// The trace reference count of the channel must be 0 in order to be deleted from the map. This is 259// specified in the channel tracing gRFC that as long as some other trace has reference to an entity, 260// the trace of the referenced entity must not be deleted. In order to release the resource allocated 261// by grpc, the reference to the grpc object is reset to a dummy object. 262// 263// deleteSelfFromMap must be called after deleteSelfFromTree returns true. 264// 265// It returns a bool to indicate whether the channel can be safely deleted from map. 266func (c *channel) deleteSelfFromMap() (delete bool) { 267 if c.getTraceRefCount() != 0 { 268 c.c = &dummyChannel{} 269 return false 270 } 271 return true 272} 273 274// deleteSelfIfReady tries to delete the channel itself from the channelz database. 275// The delete process includes two steps: 276// 1. delete the channel from the entry relation tree, i.e. delete the channel reference from its 277// parent's child list. 278// 2. delete the channel from the map, i.e. delete the channel entirely from channelz. Lookup by id 279// will return entry not found error. 280func (c *channel) deleteSelfIfReady() { 281 if !c.deleteSelfFromTree() { 282 return 283 } 284 if !c.deleteSelfFromMap() { 285 return 286 } 287 c.cm.deleteEntry(c.id) 288 c.trace.clear() 289} 290 291func (c *channel) getChannelTrace() *channelTrace { 292 return c.trace 293} 294 295func (c *channel) incrTraceRefCount() { 296 atomic.AddInt32(&c.traceRefCount, 1) 297} 298 299func (c *channel) decrTraceRefCount() { 300 atomic.AddInt32(&c.traceRefCount, -1) 301} 302 303func (c *channel) getTraceRefCount() int { 304 i := atomic.LoadInt32(&c.traceRefCount) 305 return int(i) 306} 307 308func (c *channel) getRefName() string { 309 return c.refName 310} 311 312type subChannel struct { 313 refName string 314 c Channel 315 closeCalled bool 316 sockets map[int64]string 317 id int64 318 pid int64 319 cm *channelMap 320 trace *channelTrace 321 traceRefCount int32 322} 323 324func (sc *subChannel) addChild(id int64, e entry) { 325 if v, ok := e.(*normalSocket); ok { 326 sc.sockets[id] = v.refName 327 } else { 328 logger.Errorf("cannot add a child (id = %d) of type %T to a subChannel", id, e) 329 } 330} 331 332func (sc *subChannel) deleteChild(id int64) { 333 delete(sc.sockets, id) 334 sc.deleteSelfIfReady() 335} 336 337func (sc *subChannel) triggerDelete() { 338 sc.closeCalled = true 339 sc.deleteSelfIfReady() 340} 341 342func (sc *subChannel) getParentID() int64 { 343 return sc.pid 344} 345 346// deleteSelfFromTree tries to delete the subchannel from the channelz entry relation tree, which 347// means deleting the subchannel reference from its parent's child list. 348// 349// In order for a subchannel to be deleted from the tree, it must meet the criteria that, removal of 350// the corresponding grpc object has been invoked, and the subchannel does not have any children left. 351// 352// The returned boolean value indicates whether the channel has been successfully deleted from tree. 353func (sc *subChannel) deleteSelfFromTree() (deleted bool) { 354 if !sc.closeCalled || len(sc.sockets) != 0 { 355 return false 356 } 357 sc.cm.findEntry(sc.pid).deleteChild(sc.id) 358 return true 359} 360 361// deleteSelfFromMap checks whether it is valid to delete the subchannel from the map, which means 362// deleting the subchannel from channelz's tracking entirely. Users can no longer use id to query 363// the subchannel, and its memory will be garbage collected. 364// 365// The trace reference count of the subchannel must be 0 in order to be deleted from the map. This is 366// specified in the channel tracing gRFC that as long as some other trace has reference to an entity, 367// the trace of the referenced entity must not be deleted. In order to release the resource allocated 368// by grpc, the reference to the grpc object is reset to a dummy object. 369// 370// deleteSelfFromMap must be called after deleteSelfFromTree returns true. 371// 372// It returns a bool to indicate whether the channel can be safely deleted from map. 373func (sc *subChannel) deleteSelfFromMap() (delete bool) { 374 if sc.getTraceRefCount() != 0 { 375 // free the grpc struct (i.e. addrConn) 376 sc.c = &dummyChannel{} 377 return false 378 } 379 return true 380} 381 382// deleteSelfIfReady tries to delete the subchannel itself from the channelz database. 383// The delete process includes two steps: 384// 1. delete the subchannel from the entry relation tree, i.e. delete the subchannel reference from 385// its parent's child list. 386// 2. delete the subchannel from the map, i.e. delete the subchannel entirely from channelz. Lookup 387// by id will return entry not found error. 388func (sc *subChannel) deleteSelfIfReady() { 389 if !sc.deleteSelfFromTree() { 390 return 391 } 392 if !sc.deleteSelfFromMap() { 393 return 394 } 395 sc.cm.deleteEntry(sc.id) 396 sc.trace.clear() 397} 398 399func (sc *subChannel) getChannelTrace() *channelTrace { 400 return sc.trace 401} 402 403func (sc *subChannel) incrTraceRefCount() { 404 atomic.AddInt32(&sc.traceRefCount, 1) 405} 406 407func (sc *subChannel) decrTraceRefCount() { 408 atomic.AddInt32(&sc.traceRefCount, -1) 409} 410 411func (sc *subChannel) getTraceRefCount() int { 412 i := atomic.LoadInt32(&sc.traceRefCount) 413 return int(i) 414} 415 416func (sc *subChannel) getRefName() string { 417 return sc.refName 418} 419 420// SocketMetric defines the info channelz provides for a specific Socket, which 421// includes SocketInternalMetric and channelz-specific data, such as channelz id, etc. 422type SocketMetric struct { 423 // ID is the channelz id of this socket. 424 ID int64 425 // RefName is the human readable reference string of this socket. 426 RefName string 427 // SocketData contains socket internal metric reported by the socket through 428 // ChannelzMetric(). 429 SocketData *SocketInternalMetric 430} 431 432// SocketInternalMetric defines the struct that the implementor of Socket interface 433// should return from ChannelzMetric(). 434type SocketInternalMetric struct { 435 // The number of streams that have been started. 436 StreamsStarted int64 437 // The number of streams that have ended successfully: 438 // On client side, receiving frame with eos bit set. 439 // On server side, sending frame with eos bit set. 440 StreamsSucceeded int64 441 // The number of streams that have ended unsuccessfully: 442 // On client side, termination without receiving frame with eos bit set. 443 // On server side, termination without sending frame with eos bit set. 444 StreamsFailed int64 445 // The number of messages successfully sent on this socket. 446 MessagesSent int64 447 MessagesReceived int64 448 // The number of keep alives sent. This is typically implemented with HTTP/2 449 // ping messages. 450 KeepAlivesSent int64 451 // The last time a stream was created by this endpoint. Usually unset for 452 // servers. 453 LastLocalStreamCreatedTimestamp time.Time 454 // The last time a stream was created by the remote endpoint. Usually unset 455 // for clients. 456 LastRemoteStreamCreatedTimestamp time.Time 457 // The last time a message was sent by this endpoint. 458 LastMessageSentTimestamp time.Time 459 // The last time a message was received by this endpoint. 460 LastMessageReceivedTimestamp time.Time 461 // The amount of window, granted to the local endpoint by the remote endpoint. 462 // This may be slightly out of date due to network latency. This does NOT 463 // include stream level or TCP level flow control info. 464 LocalFlowControlWindow int64 465 // The amount of window, granted to the remote endpoint by the local endpoint. 466 // This may be slightly out of date due to network latency. This does NOT 467 // include stream level or TCP level flow control info. 468 RemoteFlowControlWindow int64 469 // The locally bound address. 470 LocalAddr net.Addr 471 // The remote bound address. May be absent. 472 RemoteAddr net.Addr 473 // Optional, represents the name of the remote endpoint, if different than 474 // the original target name. 475 RemoteName string 476 SocketOptions *SocketOptionData 477 Security credentials.ChannelzSecurityValue 478} 479 480// Socket is the interface that should be satisfied in order to be tracked by 481// channelz as Socket. 482type Socket interface { 483 ChannelzMetric() *SocketInternalMetric 484} 485 486type listenSocket struct { 487 refName string 488 s Socket 489 id int64 490 pid int64 491 cm *channelMap 492} 493 494func (ls *listenSocket) addChild(id int64, e entry) { 495 logger.Errorf("cannot add a child (id = %d) of type %T to a listen socket", id, e) 496} 497 498func (ls *listenSocket) deleteChild(id int64) { 499 logger.Errorf("cannot delete a child (id = %d) from a listen socket", id) 500} 501 502func (ls *listenSocket) triggerDelete() { 503 ls.cm.deleteEntry(ls.id) 504 ls.cm.findEntry(ls.pid).deleteChild(ls.id) 505} 506 507func (ls *listenSocket) deleteSelfIfReady() { 508 logger.Errorf("cannot call deleteSelfIfReady on a listen socket") 509} 510 511func (ls *listenSocket) getParentID() int64 { 512 return ls.pid 513} 514 515type normalSocket struct { 516 refName string 517 s Socket 518 id int64 519 pid int64 520 cm *channelMap 521} 522 523func (ns *normalSocket) addChild(id int64, e entry) { 524 logger.Errorf("cannot add a child (id = %d) of type %T to a normal socket", id, e) 525} 526 527func (ns *normalSocket) deleteChild(id int64) { 528 logger.Errorf("cannot delete a child (id = %d) from a normal socket", id) 529} 530 531func (ns *normalSocket) triggerDelete() { 532 ns.cm.deleteEntry(ns.id) 533 ns.cm.findEntry(ns.pid).deleteChild(ns.id) 534} 535 536func (ns *normalSocket) deleteSelfIfReady() { 537 logger.Errorf("cannot call deleteSelfIfReady on a normal socket") 538} 539 540func (ns *normalSocket) getParentID() int64 { 541 return ns.pid 542} 543 544// ServerMetric defines the info channelz provides for a specific Server, which 545// includes ServerInternalMetric and channelz-specific data, such as channelz id, 546// child list, etc. 547type ServerMetric struct { 548 // ID is the channelz id of this server. 549 ID int64 550 // RefName is the human readable reference string of this server. 551 RefName string 552 // ServerData contains server internal metric reported by the server through 553 // ChannelzMetric(). 554 ServerData *ServerInternalMetric 555 // ListenSockets tracks the listener socket type children of this server in the 556 // format of a map from socket channelz id to corresponding reference string. 557 ListenSockets map[int64]string 558} 559 560// ServerInternalMetric defines the struct that the implementor of Server interface 561// should return from ChannelzMetric(). 562type ServerInternalMetric struct { 563 // The number of incoming calls started on the server. 564 CallsStarted int64 565 // The number of incoming calls that have completed with an OK status. 566 CallsSucceeded int64 567 // The number of incoming calls that have a completed with a non-OK status. 568 CallsFailed int64 569 // The last time a call was started on the server. 570 LastCallStartedTimestamp time.Time 571} 572 573// Server is the interface to be satisfied in order to be tracked by channelz as 574// Server. 575type Server interface { 576 ChannelzMetric() *ServerInternalMetric 577} 578 579type server struct { 580 refName string 581 s Server 582 closeCalled bool 583 sockets map[int64]string 584 listenSockets map[int64]string 585 id int64 586 cm *channelMap 587} 588 589func (s *server) addChild(id int64, e entry) { 590 switch v := e.(type) { 591 case *normalSocket: 592 s.sockets[id] = v.refName 593 case *listenSocket: 594 s.listenSockets[id] = v.refName 595 default: 596 logger.Errorf("cannot add a child (id = %d) of type %T to a server", id, e) 597 } 598} 599 600func (s *server) deleteChild(id int64) { 601 delete(s.sockets, id) 602 delete(s.listenSockets, id) 603 s.deleteSelfIfReady() 604} 605 606func (s *server) triggerDelete() { 607 s.closeCalled = true 608 s.deleteSelfIfReady() 609} 610 611func (s *server) deleteSelfIfReady() { 612 if !s.closeCalled || len(s.sockets)+len(s.listenSockets) != 0 { 613 return 614 } 615 s.cm.deleteEntry(s.id) 616} 617 618func (s *server) getParentID() int64 { 619 return 0 620} 621 622type tracedChannel interface { 623 getChannelTrace() *channelTrace 624 incrTraceRefCount() 625 decrTraceRefCount() 626 getRefName() string 627} 628 629type channelTrace struct { 630 cm *channelMap 631 createdTime time.Time 632 eventCount int64 633 mu sync.Mutex 634 events []*TraceEvent 635} 636 637func (c *channelTrace) append(e *TraceEvent) { 638 c.mu.Lock() 639 if len(c.events) == getMaxTraceEntry() { 640 del := c.events[0] 641 c.events = c.events[1:] 642 if del.RefID != 0 { 643 // start recursive cleanup in a goroutine to not block the call originated from grpc. 644 go func() { 645 // need to acquire c.cm.mu lock to call the unlocked attemptCleanup func. 646 c.cm.mu.Lock() 647 c.cm.decrTraceRefCount(del.RefID) 648 c.cm.mu.Unlock() 649 }() 650 } 651 } 652 e.Timestamp = time.Now() 653 c.events = append(c.events, e) 654 c.eventCount++ 655 c.mu.Unlock() 656} 657 658func (c *channelTrace) clear() { 659 c.mu.Lock() 660 for _, e := range c.events { 661 if e.RefID != 0 { 662 // caller should have already held the c.cm.mu lock. 663 c.cm.decrTraceRefCount(e.RefID) 664 } 665 } 666 c.mu.Unlock() 667} 668 669// Severity is the severity level of a trace event. 670// The canonical enumeration of all valid values is here: 671// https://github.com/grpc/grpc-proto/blob/9b13d199cc0d4703c7ea26c9c330ba695866eb23/grpc/channelz/v1/channelz.proto#L126. 672type Severity int 673 674const ( 675 // CtUnknown indicates unknown severity of a trace event. 676 CtUnknown Severity = iota 677 // CtInfo indicates info level severity of a trace event. 678 CtInfo 679 // CtWarning indicates warning level severity of a trace event. 680 CtWarning 681 // CtError indicates error level severity of a trace event. 682 CtError 683) 684 685// RefChannelType is the type of the entity being referenced in a trace event. 686type RefChannelType int 687 688const ( 689 // RefChannel indicates the referenced entity is a Channel. 690 RefChannel RefChannelType = iota 691 // RefSubChannel indicates the referenced entity is a SubChannel. 692 RefSubChannel 693) 694 695func (c *channelTrace) dumpData() *ChannelTrace { 696 c.mu.Lock() 697 ct := &ChannelTrace{EventNum: c.eventCount, CreationTime: c.createdTime} 698 ct.Events = c.events[:len(c.events)] 699 c.mu.Unlock() 700 return ct 701} 702