1// +build !providerless 2 3/* 4Copyright 2016 The Kubernetes Authors. 5 6Licensed under the Apache License, Version 2.0 (the "License"); 7you may not use this file except in compliance with the License. 8You may obtain a copy of the License at 9 10 http://www.apache.org/licenses/LICENSE-2.0 11 12Unless required by applicable law or agreed to in writing, software 13distributed under the License is distributed on an "AS IS" BASIS, 14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15See the License for the specific language governing permissions and 16limitations under the License. 17*/ 18 19package vsphere 20 21import ( 22 "context" 23 "fmt" 24 "strings" 25 "sync" 26 27 "github.com/vmware/govmomi/object" 28 "github.com/vmware/govmomi/vim25/mo" 29 "k8s.io/klog/v2" 30 31 v1 "k8s.io/api/core/v1" 32 k8stypes "k8s.io/apimachinery/pkg/types" 33 cloudprovider "k8s.io/cloud-provider" 34 "k8s.io/legacy-cloud-providers/vsphere/vclib" 35) 36 37// Stores info about the kubernetes node 38type NodeInfo struct { 39 dataCenter *vclib.Datacenter 40 vm *vclib.VirtualMachine 41 vcServer string 42 vmUUID string 43 zone *cloudprovider.Zone 44} 45 46func (n NodeInfo) String() string { 47 return fmt.Sprintf("{datacenter: %v, vm: %v, vcServer: %s, vmUUID: %s, zone: %v}", 48 *n.dataCenter, n.vm.Reference(), n.vcServer, n.vmUUID, *n.zone) 49} 50 51type NodeManager struct { 52 // TODO: replace map with concurrent map when k8s supports go v1.9 53 54 // Maps the VC server to VSphereInstance 55 vsphereInstanceMap map[string]*VSphereInstance 56 // Maps node name to node info. 57 nodeInfoMap map[string]*NodeInfo 58 // Maps node name to node structure 59 registeredNodes map[string]*v1.Node 60 //CredentialsManager 61 credentialManager *SecretCredentialManager 62 63 // Mutexes 64 registeredNodesLock sync.RWMutex 65 nodeInfoLock sync.RWMutex 66 credentialManagerLock sync.Mutex 67} 68 69type NodeDetails struct { 70 NodeName string 71 vm *vclib.VirtualMachine 72 VMUUID string 73 Zone *cloudprovider.Zone 74} 75 76// TODO: Make it configurable in vsphere.conf 77const ( 78 POOL_SIZE = 8 79 QUEUE_SIZE = POOL_SIZE * 10 80) 81 82func (nm *NodeManager) DiscoverNode(node *v1.Node) error { 83 type VmSearch struct { 84 vc string 85 datacenter *vclib.Datacenter 86 } 87 88 var mutex = &sync.Mutex{} 89 var globalErrMutex = &sync.Mutex{} 90 var queueChannel chan *VmSearch 91 var wg sync.WaitGroup 92 var globalErr *error 93 94 queueChannel = make(chan *VmSearch, QUEUE_SIZE) 95 nodeUUID, err := GetNodeUUID(node) 96 if err != nil { 97 klog.Errorf("Node Discovery failed to get node uuid for node %s with error: %v", node.Name, err) 98 return err 99 } 100 101 klog.V(4).Infof("Discovering node %s with uuid %s", node.ObjectMeta.Name, nodeUUID) 102 103 vmFound := false 104 globalErr = nil 105 106 setGlobalErr := func(err error) { 107 globalErrMutex.Lock() 108 globalErr = &err 109 globalErrMutex.Unlock() 110 } 111 112 setVMFound := func(found bool) { 113 mutex.Lock() 114 vmFound = found 115 mutex.Unlock() 116 } 117 118 getVMFound := func() bool { 119 mutex.Lock() 120 found := vmFound 121 mutex.Unlock() 122 return found 123 } 124 125 go func() { 126 var datacenterObjs []*vclib.Datacenter 127 for vc, vsi := range nm.vsphereInstanceMap { 128 129 found := getVMFound() 130 if found == true { 131 break 132 } 133 134 // Create context 135 ctx, cancel := context.WithCancel(context.Background()) 136 defer cancel() 137 138 err := nm.vcConnect(ctx, vsi) 139 if err != nil { 140 klog.V(4).Info("Discovering node error vc:", err) 141 setGlobalErr(err) 142 continue 143 } 144 145 if vsi.cfg.Datacenters == "" { 146 datacenterObjs, err = vclib.GetAllDatacenter(ctx, vsi.conn) 147 if err != nil { 148 klog.V(4).Info("Discovering node error dc:", err) 149 setGlobalErr(err) 150 continue 151 } 152 } else { 153 datacenters := strings.Split(vsi.cfg.Datacenters, ",") 154 for _, dc := range datacenters { 155 dc = strings.TrimSpace(dc) 156 if dc == "" { 157 continue 158 } 159 datacenterObj, err := vclib.GetDatacenter(ctx, vsi.conn, dc) 160 if err != nil { 161 klog.V(4).Info("Discovering node error dc:", err) 162 setGlobalErr(err) 163 continue 164 } 165 datacenterObjs = append(datacenterObjs, datacenterObj) 166 } 167 } 168 169 for _, datacenterObj := range datacenterObjs { 170 found := getVMFound() 171 if found == true { 172 break 173 } 174 175 klog.V(4).Infof("Finding node %s in vc=%s and datacenter=%s", node.Name, vc, datacenterObj.Name()) 176 queueChannel <- &VmSearch{ 177 vc: vc, 178 datacenter: datacenterObj, 179 } 180 } 181 } 182 close(queueChannel) 183 }() 184 185 for i := 0; i < POOL_SIZE; i++ { 186 wg.Add(1) 187 go func() { 188 for res := range queueChannel { 189 ctx, cancel := context.WithCancel(context.Background()) 190 vm, err := res.datacenter.GetVMByUUID(ctx, nodeUUID) 191 if err != nil { 192 klog.V(4).Infof("Error while looking for vm=%+v in vc=%s and datacenter=%s: %v", 193 vm, res.vc, res.datacenter.Name(), err) 194 if err != vclib.ErrNoVMFound { 195 setGlobalErr(err) 196 } else { 197 klog.V(4).Infof("Did not find node %s in vc=%s and datacenter=%s", 198 node.Name, res.vc, res.datacenter.Name()) 199 } 200 cancel() 201 continue 202 } 203 if vm != nil { 204 klog.V(4).Infof("Found node %s as vm=%+v in vc=%s and datacenter=%s", 205 node.Name, vm, res.vc, res.datacenter.Name()) 206 var vmObj mo.VirtualMachine 207 err := vm.Properties(ctx, vm.Reference(), []string{"config"}, &vmObj) 208 if err != nil || vmObj.Config == nil { 209 klog.Errorf("failed to retrieve guest vmconfig for node: %s Err: %v", node.Name, err) 210 } else { 211 klog.V(4).Infof("vm hardware version for node:%s is %s", node.Name, vmObj.Config.Version) 212 // vmconfig.Version returns vm hardware version as vmx-11, vmx-13, vmx-14, vmx-15 etc. 213 vmhardwaredeprecated, err := isGuestHardwareVersionDeprecated(vmObj.Config.Version) 214 if err != nil { 215 klog.Errorf("failed to check if vm hardware version is deprecated. VM Hardware Version: %s Err: %v", vmObj.Config.Version, err) 216 } 217 if vmhardwaredeprecated { 218 klog.Warningf("VM Hardware version: %s from node: %s is deprecated. Please consider upgrading virtual machine hardware version to vmx-15 or higher", vmObj.Config.Version, node.Name) 219 } 220 } 221 // Get the node zone information 222 nodeFd := node.ObjectMeta.Labels[v1.LabelTopologyZone] 223 nodeRegion := node.ObjectMeta.Labels[v1.LabelTopologyRegion] 224 nodeZone := &cloudprovider.Zone{FailureDomain: nodeFd, Region: nodeRegion} 225 nodeInfo := &NodeInfo{dataCenter: res.datacenter, vm: vm, vcServer: res.vc, vmUUID: nodeUUID, zone: nodeZone} 226 nm.addNodeInfo(node.ObjectMeta.Name, nodeInfo) 227 for range queueChannel { 228 } 229 setVMFound(true) 230 cancel() 231 break 232 } 233 } 234 wg.Done() 235 }() 236 } 237 wg.Wait() 238 if vmFound { 239 return nil 240 } 241 if globalErr != nil { 242 return *globalErr 243 } 244 245 klog.V(4).Infof("Discovery Node: %q vm not found", node.Name) 246 return vclib.ErrNoVMFound 247} 248 249func (nm *NodeManager) RegisterNode(node *v1.Node) error { 250 nm.addNode(node) 251 return nm.DiscoverNode(node) 252} 253 254func (nm *NodeManager) UnRegisterNode(node *v1.Node) error { 255 nm.removeNode(node) 256 return nil 257} 258 259func (nm *NodeManager) RediscoverNode(nodeName k8stypes.NodeName) error { 260 node, err := nm.GetNode(nodeName) 261 262 if err != nil { 263 return err 264 } 265 return nm.DiscoverNode(&node) 266} 267 268func (nm *NodeManager) GetNode(nodeName k8stypes.NodeName) (v1.Node, error) { 269 nm.registeredNodesLock.RLock() 270 node := nm.registeredNodes[convertToString(nodeName)] 271 nm.registeredNodesLock.RUnlock() 272 if node == nil { 273 return v1.Node{}, vclib.ErrNoVMFound 274 } 275 return *node, nil 276} 277 278func (nm *NodeManager) getNodes() map[string]*v1.Node { 279 nm.registeredNodesLock.RLock() 280 defer nm.registeredNodesLock.RUnlock() 281 registeredNodes := make(map[string]*v1.Node, len(nm.registeredNodes)) 282 for nodeName, node := range nm.registeredNodes { 283 registeredNodes[nodeName] = node 284 } 285 return registeredNodes 286} 287 288func (nm *NodeManager) addNode(node *v1.Node) { 289 nm.registeredNodesLock.Lock() 290 nm.registeredNodes[node.ObjectMeta.Name] = node 291 nm.registeredNodesLock.Unlock() 292} 293 294func (nm *NodeManager) removeNode(node *v1.Node) { 295 nm.registeredNodesLock.Lock() 296 delete(nm.registeredNodes, node.ObjectMeta.Name) 297 nm.registeredNodesLock.Unlock() 298 299 nm.nodeInfoLock.Lock() 300 delete(nm.nodeInfoMap, node.ObjectMeta.Name) 301 nm.nodeInfoLock.Unlock() 302} 303 304// GetNodeInfo returns a NodeInfo which datacenter, vm and vc server ip address. 305// This method returns an error if it is unable find node VCs and DCs listed in vSphere.conf 306// NodeInfo returned may not be updated to reflect current VM location. 307// 308// This method is a getter but it can cause side-effect of updating NodeInfo object. 309func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) { 310 return nm.getRefreshedNodeInfo(nodeName) 311} 312 313// GetNodeDetails returns NodeDetails for all the discovered nodes. 314// 315// This method is a getter but it can cause side-effect of updating NodeInfo objects. 316func (nm *NodeManager) GetNodeDetails() ([]NodeDetails, error) { 317 var nodeDetails []NodeDetails 318 319 for nodeName, nodeObj := range nm.getNodes() { 320 nodeInfo, err := nm.GetNodeInfoWithNodeObject(nodeObj) 321 if err != nil { 322 return nil, err 323 } 324 klog.V(4).Infof("Updated NodeInfo %v for node %q.", nodeInfo, nodeName) 325 nodeDetails = append(nodeDetails, NodeDetails{nodeName, nodeInfo.vm, nodeInfo.vmUUID, nodeInfo.zone}) 326 } 327 return nodeDetails, nil 328} 329 330// GetNodeNames returns list of nodes that are known to vsphere cloudprovider. 331// These are typically nodes that make up k8s cluster. 332func (nm *NodeManager) GetNodeNames() []k8stypes.NodeName { 333 nodes := nm.getNodes() 334 var nodeNameList []k8stypes.NodeName 335 for _, node := range nodes { 336 nodeNameList = append(nodeNameList, k8stypes.NodeName(node.Name)) 337 } 338 return nodeNameList 339} 340 341func (nm *NodeManager) refreshNodes() (errList []error) { 342 for nodeName := range nm.getNodes() { 343 nodeInfo, err := nm.getRefreshedNodeInfo(convertToK8sType(nodeName)) 344 if err != nil { 345 errList = append(errList, err) 346 continue 347 } 348 klog.V(4).Infof("Updated NodeInfo %v for node %q.", nodeInfo, nodeName) 349 } 350 return errList 351} 352 353func (nm *NodeManager) getRefreshedNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) { 354 nodeInfo := nm.getNodeInfo(nodeName) 355 var err error 356 if nodeInfo == nil { 357 // Rediscover node if no NodeInfo found. 358 klog.V(4).Infof("No VM found for node %q. Initiating rediscovery.", convertToString(nodeName)) 359 err = nm.RediscoverNode(nodeName) 360 if err != nil { 361 klog.Errorf("Error %q node info for node %q not found", err, convertToString(nodeName)) 362 return NodeInfo{}, err 363 } 364 nodeInfo = nm.getNodeInfo(nodeName) 365 } else { 366 // Renew the found NodeInfo to avoid stale vSphere connection. 367 klog.V(4).Infof("Renewing NodeInfo %+v for node %q", nodeInfo, convertToString(nodeName)) 368 nodeInfo, err = nm.renewNodeInfo(nodeInfo, true) 369 if err != nil { 370 klog.Errorf("Error %q occurred while renewing NodeInfo for %q", err, convertToString(nodeName)) 371 return NodeInfo{}, err 372 } 373 nm.addNodeInfo(convertToString(nodeName), nodeInfo) 374 } 375 return *nodeInfo, nil 376} 377 378func (nm *NodeManager) addNodeInfo(nodeName string, nodeInfo *NodeInfo) { 379 nm.nodeInfoLock.Lock() 380 nm.nodeInfoMap[nodeName] = nodeInfo 381 nm.nodeInfoLock.Unlock() 382} 383 384func (nm *NodeManager) getNodeInfo(nodeName k8stypes.NodeName) *NodeInfo { 385 nm.nodeInfoLock.RLock() 386 nodeInfo := nm.nodeInfoMap[convertToString(nodeName)] 387 nm.nodeInfoLock.RUnlock() 388 return nodeInfo 389} 390 391func (nm *NodeManager) GetVSphereInstance(nodeName k8stypes.NodeName) (VSphereInstance, error) { 392 nodeInfo, err := nm.GetNodeInfo(nodeName) 393 if err != nil { 394 klog.V(4).Infof("node info for node %q not found", convertToString(nodeName)) 395 return VSphereInstance{}, err 396 } 397 vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer] 398 if vsphereInstance == nil { 399 return VSphereInstance{}, fmt.Errorf("vSphereInstance for vc server %q not found while looking for node %q", nodeInfo.vcServer, convertToString(nodeName)) 400 } 401 return *vsphereInstance, nil 402} 403 404// renewNodeInfo renews vSphere connection, VirtualMachine and Datacenter for NodeInfo instance. 405func (nm *NodeManager) renewNodeInfo(nodeInfo *NodeInfo, reconnect bool) (*NodeInfo, error) { 406 ctx, cancel := context.WithCancel(context.Background()) 407 defer cancel() 408 409 vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer] 410 if vsphereInstance == nil { 411 err := fmt.Errorf("vSphereInstance for vSphere %q not found while refershing NodeInfo for VM %q", nodeInfo.vcServer, nodeInfo.vm) 412 return nil, err 413 } 414 if reconnect { 415 err := nm.vcConnect(ctx, vsphereInstance) 416 if err != nil { 417 return nil, err 418 } 419 } 420 vm := nodeInfo.vm.RenewVM(vsphereInstance.conn.Client) 421 return &NodeInfo{ 422 vm: &vm, 423 dataCenter: vm.Datacenter, 424 vcServer: nodeInfo.vcServer, 425 vmUUID: nodeInfo.vmUUID, 426 zone: nodeInfo.zone, 427 }, nil 428} 429 430func (nodeInfo *NodeInfo) VM() *vclib.VirtualMachine { 431 if nodeInfo == nil { 432 return nil 433 } 434 return nodeInfo.vm 435} 436 437// vcConnect connects to vCenter with existing credentials 438// If credentials are invalid: 439// 1. It will fetch credentials from credentialManager 440// 2. Update the credentials 441// 3. Connects again to vCenter with fetched credentials 442func (nm *NodeManager) vcConnect(ctx context.Context, vsphereInstance *VSphereInstance) error { 443 err := vsphereInstance.conn.Connect(ctx) 444 if err == nil { 445 return nil 446 } 447 448 credentialManager := nm.CredentialManager() 449 if !vclib.IsInvalidCredentialsError(err) || credentialManager == nil { 450 klog.Errorf("Cannot connect to vCenter with err: %v", err) 451 return err 452 } 453 454 klog.V(4).Infof("Invalid credentials. Cannot connect to server %q. Fetching credentials from secrets.", vsphereInstance.conn.Hostname) 455 456 // Get latest credentials from SecretCredentialManager 457 credentials, err := credentialManager.GetCredential(vsphereInstance.conn.Hostname) 458 if err != nil { 459 klog.Errorf("Failed to get credentials from Secret Credential Manager with err: %v", err) 460 return err 461 } 462 vsphereInstance.conn.UpdateCredentials(credentials.User, credentials.Password) 463 return vsphereInstance.conn.Connect(ctx) 464} 465 466// GetNodeInfoWithNodeObject returns a NodeInfo which datacenter, vm and vc server ip address. 467// This method returns an error if it is unable find node VCs and DCs listed in vSphere.conf 468// NodeInfo returned may not be updated to reflect current VM location. 469// 470// This method is a getter but it can cause side-effect of updating NodeInfo object. 471func (nm *NodeManager) GetNodeInfoWithNodeObject(node *v1.Node) (NodeInfo, error) { 472 return nm.getRefreshedNodeInfo(convertToK8sType(node.Name)) 473} 474 475func (nm *NodeManager) CredentialManager() *SecretCredentialManager { 476 nm.credentialManagerLock.Lock() 477 defer nm.credentialManagerLock.Unlock() 478 return nm.credentialManager 479} 480 481func (nm *NodeManager) UpdateCredentialManager(credentialManager *SecretCredentialManager) { 482 nm.credentialManagerLock.Lock() 483 defer nm.credentialManagerLock.Unlock() 484 nm.credentialManager = credentialManager 485} 486 487func (nm *NodeManager) GetHostsInZone(ctx context.Context, zoneFailureDomain string) ([]*object.HostSystem, error) { 488 klog.V(9).Infof("GetHostsInZone called with registeredNodes: %v", nm.registeredNodes) 489 nodeDetails, err := nm.GetNodeDetails() 490 if err != nil { 491 return nil, err 492 } 493 klog.V(4).Infof("Node Details: %v", nodeDetails) 494 // Build a map of Host moRef to HostSystem 495 hostMap := make(map[string]*object.HostSystem) 496 for _, n := range nodeDetails { 497 // Match the provided zone failure domain with the node. 498 klog.V(9).Infof("Matching provided zone %s with node %s zone %s", zoneFailureDomain, n.NodeName, n.Zone.FailureDomain) 499 if zoneFailureDomain == n.Zone.FailureDomain { 500 host, err := n.vm.HostSystem(ctx) 501 if err != nil { 502 klog.Errorf("Failed to get host system for VM %s. err: %+v", n.vm, err) 503 continue 504 } 505 hostMap[host.Reference().Value] = host 506 } 507 } 508 // Build the unique list of hosts. 509 hosts := make([]*object.HostSystem, 0) 510 for _, value := range hostMap { 511 hosts = append(hosts, value) 512 } 513 klog.V(4).Infof("GetHostsInZone %v returning: %v", zoneFailureDomain, hosts) 514 return hosts, nil 515} 516