1// +build !providerless
2
3/*
4Copyright 2016 The Kubernetes Authors.
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10    http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17*/
18
19package vsphere
20
21import (
22	"context"
23	"fmt"
24	"strings"
25	"sync"
26
27	"github.com/vmware/govmomi/object"
28	"github.com/vmware/govmomi/vim25/mo"
29	"k8s.io/klog/v2"
30
31	v1 "k8s.io/api/core/v1"
32	k8stypes "k8s.io/apimachinery/pkg/types"
33	cloudprovider "k8s.io/cloud-provider"
34	"k8s.io/legacy-cloud-providers/vsphere/vclib"
35)
36
// NodeInfo stores info about the kubernetes node: the vSphere objects it was
// discovered as, the vCenter server it was found on, its VM UUID and its zone.
type NodeInfo struct {
	// Datacenter in which the node's VM was found.
	dataCenter *vclib.Datacenter
	// Virtual machine backing the node.
	vm *vclib.VirtualMachine
	// Key into NodeManager.vsphereInstanceMap for the vCenter hosting the VM.
	vcServer string
	// UUID used to look the VM up in vSphere.
	vmUUID string
	// Zone built from the node's topology zone/region labels at discovery time.
	zone *cloudprovider.Zone
}
45
46func (n NodeInfo) String() string {
47	return fmt.Sprintf("{datacenter: %v, vm: %v, vcServer: %s, vmUUID: %s, zone: %v}",
48		*n.dataCenter, n.vm.Reference(), n.vcServer, n.vmUUID, *n.zone)
49}
50
// NodeManager keeps track of the nodes registered with the vSphere cloud
// provider, the vSphere objects each node was discovered as, and the vCenter
// connections and credentials used to find them.
type NodeManager struct {
	// TODO: replace map with concurrent map when k8s supports go v1.9

	// Maps the VC server to VSphereInstance.
	// No lock in this file guards this map; it is only read here.
	vsphereInstanceMap map[string]*VSphereInstance
	// Maps node name to node info. Guarded by nodeInfoLock.
	nodeInfoMap map[string]*NodeInfo
	// Maps node name to node structure. Guarded by registeredNodesLock.
	registeredNodes map[string]*v1.Node
	// Source of refreshed credentials when a vCenter login is rejected.
	// Guarded by credentialManagerLock.
	credentialManager *SecretCredentialManager

	// Mutexes
	registeredNodesLock   sync.RWMutex
	nodeInfoLock          sync.RWMutex
	credentialManagerLock sync.Mutex
}
68
// NodeDetails is the per-node summary returned by GetNodeDetails.
//
// Field order matters: GetNodeDetails builds this struct with a positional
// composite literal.
type NodeDetails struct {
	// Name under which the node is registered.
	NodeName string
	// Virtual machine backing the node.
	vm *vclib.VirtualMachine
	// UUID of the backing virtual machine.
	VMUUID string
	// Zone recorded for the node at discovery time.
	Zone *cloudprovider.Zone
}
75
// Sizing of the worker pool used by DiscoverNode.
// TODO: Make it configurable in vsphere.conf
const (
	// POOL_SIZE is the number of worker goroutines DiscoverNode starts to
	// search datacenters in parallel.
	POOL_SIZE = 8
	// QUEUE_SIZE is the buffer capacity of the channel feeding those workers.
	QUEUE_SIZE = POOL_SIZE * 10
)
81
82func (nm *NodeManager) DiscoverNode(node *v1.Node) error {
83	type VmSearch struct {
84		vc         string
85		datacenter *vclib.Datacenter
86	}
87
88	var mutex = &sync.Mutex{}
89	var globalErrMutex = &sync.Mutex{}
90	var queueChannel chan *VmSearch
91	var wg sync.WaitGroup
92	var globalErr *error
93
94	queueChannel = make(chan *VmSearch, QUEUE_SIZE)
95	nodeUUID, err := GetNodeUUID(node)
96	if err != nil {
97		klog.Errorf("Node Discovery failed to get node uuid for node %s with error: %v", node.Name, err)
98		return err
99	}
100
101	klog.V(4).Infof("Discovering node %s with uuid %s", node.ObjectMeta.Name, nodeUUID)
102
103	vmFound := false
104	globalErr = nil
105
106	setGlobalErr := func(err error) {
107		globalErrMutex.Lock()
108		globalErr = &err
109		globalErrMutex.Unlock()
110	}
111
112	setVMFound := func(found bool) {
113		mutex.Lock()
114		vmFound = found
115		mutex.Unlock()
116	}
117
118	getVMFound := func() bool {
119		mutex.Lock()
120		found := vmFound
121		mutex.Unlock()
122		return found
123	}
124
125	go func() {
126		var datacenterObjs []*vclib.Datacenter
127		for vc, vsi := range nm.vsphereInstanceMap {
128
129			found := getVMFound()
130			if found == true {
131				break
132			}
133
134			// Create context
135			ctx, cancel := context.WithCancel(context.Background())
136			defer cancel()
137
138			err := nm.vcConnect(ctx, vsi)
139			if err != nil {
140				klog.V(4).Info("Discovering node error vc:", err)
141				setGlobalErr(err)
142				continue
143			}
144
145			if vsi.cfg.Datacenters == "" {
146				datacenterObjs, err = vclib.GetAllDatacenter(ctx, vsi.conn)
147				if err != nil {
148					klog.V(4).Info("Discovering node error dc:", err)
149					setGlobalErr(err)
150					continue
151				}
152			} else {
153				datacenters := strings.Split(vsi.cfg.Datacenters, ",")
154				for _, dc := range datacenters {
155					dc = strings.TrimSpace(dc)
156					if dc == "" {
157						continue
158					}
159					datacenterObj, err := vclib.GetDatacenter(ctx, vsi.conn, dc)
160					if err != nil {
161						klog.V(4).Info("Discovering node error dc:", err)
162						setGlobalErr(err)
163						continue
164					}
165					datacenterObjs = append(datacenterObjs, datacenterObj)
166				}
167			}
168
169			for _, datacenterObj := range datacenterObjs {
170				found := getVMFound()
171				if found == true {
172					break
173				}
174
175				klog.V(4).Infof("Finding node %s in vc=%s and datacenter=%s", node.Name, vc, datacenterObj.Name())
176				queueChannel <- &VmSearch{
177					vc:         vc,
178					datacenter: datacenterObj,
179				}
180			}
181		}
182		close(queueChannel)
183	}()
184
185	for i := 0; i < POOL_SIZE; i++ {
186		wg.Add(1)
187		go func() {
188			for res := range queueChannel {
189				ctx, cancel := context.WithCancel(context.Background())
190				vm, err := res.datacenter.GetVMByUUID(ctx, nodeUUID)
191				if err != nil {
192					klog.V(4).Infof("Error while looking for vm=%+v in vc=%s and datacenter=%s: %v",
193						vm, res.vc, res.datacenter.Name(), err)
194					if err != vclib.ErrNoVMFound {
195						setGlobalErr(err)
196					} else {
197						klog.V(4).Infof("Did not find node %s in vc=%s and datacenter=%s",
198							node.Name, res.vc, res.datacenter.Name())
199					}
200					cancel()
201					continue
202				}
203				if vm != nil {
204					klog.V(4).Infof("Found node %s as vm=%+v in vc=%s and datacenter=%s",
205						node.Name, vm, res.vc, res.datacenter.Name())
206					var vmObj mo.VirtualMachine
207					err := vm.Properties(ctx, vm.Reference(), []string{"config"}, &vmObj)
208					if err != nil || vmObj.Config == nil {
209						klog.Errorf("failed to retrieve guest vmconfig for node: %s Err: %v", node.Name, err)
210					} else {
211						klog.V(4).Infof("vm hardware version for node:%s is %s", node.Name, vmObj.Config.Version)
212						// vmconfig.Version returns vm hardware version as vmx-11, vmx-13, vmx-14, vmx-15 etc.
213						vmhardwaredeprecated, err := isGuestHardwareVersionDeprecated(vmObj.Config.Version)
214						if err != nil {
215							klog.Errorf("failed to check if vm hardware version is deprecated. VM Hardware Version: %s Err: %v", vmObj.Config.Version, err)
216						}
217						if vmhardwaredeprecated {
218							klog.Warningf("VM Hardware version: %s from node: %s is deprecated. Please consider upgrading virtual machine hardware version to vmx-15 or higher", vmObj.Config.Version, node.Name)
219						}
220					}
221					// Get the node zone information
222					nodeFd := node.ObjectMeta.Labels[v1.LabelTopologyZone]
223					nodeRegion := node.ObjectMeta.Labels[v1.LabelTopologyRegion]
224					nodeZone := &cloudprovider.Zone{FailureDomain: nodeFd, Region: nodeRegion}
225					nodeInfo := &NodeInfo{dataCenter: res.datacenter, vm: vm, vcServer: res.vc, vmUUID: nodeUUID, zone: nodeZone}
226					nm.addNodeInfo(node.ObjectMeta.Name, nodeInfo)
227					for range queueChannel {
228					}
229					setVMFound(true)
230					cancel()
231					break
232				}
233			}
234			wg.Done()
235		}()
236	}
237	wg.Wait()
238	if vmFound {
239		return nil
240	}
241	if globalErr != nil {
242		return *globalErr
243	}
244
245	klog.V(4).Infof("Discovery Node: %q vm not found", node.Name)
246	return vclib.ErrNoVMFound
247}
248
// RegisterNode adds the node to the set of registered nodes and then runs
// DiscoverNode for it, returning the discovery result. The node is added
// before discovery runs, so it remains registered even when discovery fails.
func (nm *NodeManager) RegisterNode(node *v1.Node) error {
	nm.addNode(node)
	return nm.DiscoverNode(node)
}
253
// UnRegisterNode removes the node from both the registered-node map and the
// node-info map. It always returns nil.
func (nm *NodeManager) UnRegisterNode(node *v1.Node) error {
	nm.removeNode(node)
	return nil
}
258
259func (nm *NodeManager) RediscoverNode(nodeName k8stypes.NodeName) error {
260	node, err := nm.GetNode(nodeName)
261
262	if err != nil {
263		return err
264	}
265	return nm.DiscoverNode(&node)
266}
267
268func (nm *NodeManager) GetNode(nodeName k8stypes.NodeName) (v1.Node, error) {
269	nm.registeredNodesLock.RLock()
270	node := nm.registeredNodes[convertToString(nodeName)]
271	nm.registeredNodesLock.RUnlock()
272	if node == nil {
273		return v1.Node{}, vclib.ErrNoVMFound
274	}
275	return *node, nil
276}
277
278func (nm *NodeManager) getNodes() map[string]*v1.Node {
279	nm.registeredNodesLock.RLock()
280	defer nm.registeredNodesLock.RUnlock()
281	registeredNodes := make(map[string]*v1.Node, len(nm.registeredNodes))
282	for nodeName, node := range nm.registeredNodes {
283		registeredNodes[nodeName] = node
284	}
285	return registeredNodes
286}
287
288func (nm *NodeManager) addNode(node *v1.Node) {
289	nm.registeredNodesLock.Lock()
290	nm.registeredNodes[node.ObjectMeta.Name] = node
291	nm.registeredNodesLock.Unlock()
292}
293
294func (nm *NodeManager) removeNode(node *v1.Node) {
295	nm.registeredNodesLock.Lock()
296	delete(nm.registeredNodes, node.ObjectMeta.Name)
297	nm.registeredNodesLock.Unlock()
298
299	nm.nodeInfoLock.Lock()
300	delete(nm.nodeInfoMap, node.ObjectMeta.Name)
301	nm.nodeInfoLock.Unlock()
302}
303
// GetNodeInfo returns the NodeInfo (datacenter, vm, vc server, VM UUID and
// zone) recorded for the named node.
// It returns an error when the node has no NodeInfo and rediscovery fails, or
// when renewing the existing NodeInfo fails.
// NodeInfo returned may not be updated to reflect current VM location.
//
// This method is a getter but it can cause side-effect of updating NodeInfo object.
func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) {
	return nm.getRefreshedNodeInfo(nodeName)
}
312
313// GetNodeDetails returns NodeDetails for all the discovered nodes.
314//
315// This method is a getter but it can cause side-effect of updating NodeInfo objects.
316func (nm *NodeManager) GetNodeDetails() ([]NodeDetails, error) {
317	var nodeDetails []NodeDetails
318
319	for nodeName, nodeObj := range nm.getNodes() {
320		nodeInfo, err := nm.GetNodeInfoWithNodeObject(nodeObj)
321		if err != nil {
322			return nil, err
323		}
324		klog.V(4).Infof("Updated NodeInfo %v for node %q.", nodeInfo, nodeName)
325		nodeDetails = append(nodeDetails, NodeDetails{nodeName, nodeInfo.vm, nodeInfo.vmUUID, nodeInfo.zone})
326	}
327	return nodeDetails, nil
328}
329
330// GetNodeNames returns list of nodes that are known to vsphere cloudprovider.
331// These are typically nodes that make up k8s cluster.
332func (nm *NodeManager) GetNodeNames() []k8stypes.NodeName {
333	nodes := nm.getNodes()
334	var nodeNameList []k8stypes.NodeName
335	for _, node := range nodes {
336		nodeNameList = append(nodeNameList, k8stypes.NodeName(node.Name))
337	}
338	return nodeNameList
339}
340
341func (nm *NodeManager) refreshNodes() (errList []error) {
342	for nodeName := range nm.getNodes() {
343		nodeInfo, err := nm.getRefreshedNodeInfo(convertToK8sType(nodeName))
344		if err != nil {
345			errList = append(errList, err)
346			continue
347		}
348		klog.V(4).Infof("Updated NodeInfo %v for node %q.", nodeInfo, nodeName)
349	}
350	return errList
351}
352
353func (nm *NodeManager) getRefreshedNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) {
354	nodeInfo := nm.getNodeInfo(nodeName)
355	var err error
356	if nodeInfo == nil {
357		// Rediscover node if no NodeInfo found.
358		klog.V(4).Infof("No VM found for node %q. Initiating rediscovery.", convertToString(nodeName))
359		err = nm.RediscoverNode(nodeName)
360		if err != nil {
361			klog.Errorf("Error %q node info for node %q not found", err, convertToString(nodeName))
362			return NodeInfo{}, err
363		}
364		nodeInfo = nm.getNodeInfo(nodeName)
365	} else {
366		// Renew the found NodeInfo to avoid stale vSphere connection.
367		klog.V(4).Infof("Renewing NodeInfo %+v for node %q", nodeInfo, convertToString(nodeName))
368		nodeInfo, err = nm.renewNodeInfo(nodeInfo, true)
369		if err != nil {
370			klog.Errorf("Error %q occurred while renewing NodeInfo for %q", err, convertToString(nodeName))
371			return NodeInfo{}, err
372		}
373		nm.addNodeInfo(convertToString(nodeName), nodeInfo)
374	}
375	return *nodeInfo, nil
376}
377
378func (nm *NodeManager) addNodeInfo(nodeName string, nodeInfo *NodeInfo) {
379	nm.nodeInfoLock.Lock()
380	nm.nodeInfoMap[nodeName] = nodeInfo
381	nm.nodeInfoLock.Unlock()
382}
383
384func (nm *NodeManager) getNodeInfo(nodeName k8stypes.NodeName) *NodeInfo {
385	nm.nodeInfoLock.RLock()
386	nodeInfo := nm.nodeInfoMap[convertToString(nodeName)]
387	nm.nodeInfoLock.RUnlock()
388	return nodeInfo
389}
390
391func (nm *NodeManager) GetVSphereInstance(nodeName k8stypes.NodeName) (VSphereInstance, error) {
392	nodeInfo, err := nm.GetNodeInfo(nodeName)
393	if err != nil {
394		klog.V(4).Infof("node info for node %q not found", convertToString(nodeName))
395		return VSphereInstance{}, err
396	}
397	vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer]
398	if vsphereInstance == nil {
399		return VSphereInstance{}, fmt.Errorf("vSphereInstance for vc server %q not found while looking for node %q", nodeInfo.vcServer, convertToString(nodeName))
400	}
401	return *vsphereInstance, nil
402}
403
404// renewNodeInfo renews vSphere connection, VirtualMachine and Datacenter for NodeInfo instance.
405func (nm *NodeManager) renewNodeInfo(nodeInfo *NodeInfo, reconnect bool) (*NodeInfo, error) {
406	ctx, cancel := context.WithCancel(context.Background())
407	defer cancel()
408
409	vsphereInstance := nm.vsphereInstanceMap[nodeInfo.vcServer]
410	if vsphereInstance == nil {
411		err := fmt.Errorf("vSphereInstance for vSphere %q not found while refershing NodeInfo for VM %q", nodeInfo.vcServer, nodeInfo.vm)
412		return nil, err
413	}
414	if reconnect {
415		err := nm.vcConnect(ctx, vsphereInstance)
416		if err != nil {
417			return nil, err
418		}
419	}
420	vm := nodeInfo.vm.RenewVM(vsphereInstance.conn.Client)
421	return &NodeInfo{
422		vm:         &vm,
423		dataCenter: vm.Datacenter,
424		vcServer:   nodeInfo.vcServer,
425		vmUUID:     nodeInfo.vmUUID,
426		zone:       nodeInfo.zone,
427	}, nil
428}
429
430func (nodeInfo *NodeInfo) VM() *vclib.VirtualMachine {
431	if nodeInfo == nil {
432		return nil
433	}
434	return nodeInfo.vm
435}
436
437// vcConnect connects to vCenter with existing credentials
438// If credentials are invalid:
439// 		1. It will fetch credentials from credentialManager
440//      2. Update the credentials
441//		3. Connects again to vCenter with fetched credentials
442func (nm *NodeManager) vcConnect(ctx context.Context, vsphereInstance *VSphereInstance) error {
443	err := vsphereInstance.conn.Connect(ctx)
444	if err == nil {
445		return nil
446	}
447
448	credentialManager := nm.CredentialManager()
449	if !vclib.IsInvalidCredentialsError(err) || credentialManager == nil {
450		klog.Errorf("Cannot connect to vCenter with err: %v", err)
451		return err
452	}
453
454	klog.V(4).Infof("Invalid credentials. Cannot connect to server %q. Fetching credentials from secrets.", vsphereInstance.conn.Hostname)
455
456	// Get latest credentials from SecretCredentialManager
457	credentials, err := credentialManager.GetCredential(vsphereInstance.conn.Hostname)
458	if err != nil {
459		klog.Errorf("Failed to get credentials from Secret Credential Manager with err: %v", err)
460		return err
461	}
462	vsphereInstance.conn.UpdateCredentials(credentials.User, credentials.Password)
463	return vsphereInstance.conn.Connect(ctx)
464}
465
// GetNodeInfoWithNodeObject returns the NodeInfo (datacenter, vm, vc server,
// VM UUID and zone) recorded for the given node object. Only the node's name
// is used for the lookup.
// It returns an error when the node has no NodeInfo and rediscovery fails, or
// when renewing the existing NodeInfo fails.
// NodeInfo returned may not be updated to reflect current VM location.
//
// This method is a getter but it can cause side-effect of updating NodeInfo object.
func (nm *NodeManager) GetNodeInfoWithNodeObject(node *v1.Node) (NodeInfo, error) {
	return nm.getRefreshedNodeInfo(convertToK8sType(node.Name))
}
474
475func (nm *NodeManager) CredentialManager() *SecretCredentialManager {
476	nm.credentialManagerLock.Lock()
477	defer nm.credentialManagerLock.Unlock()
478	return nm.credentialManager
479}
480
481func (nm *NodeManager) UpdateCredentialManager(credentialManager *SecretCredentialManager) {
482	nm.credentialManagerLock.Lock()
483	defer nm.credentialManagerLock.Unlock()
484	nm.credentialManager = credentialManager
485}
486
487func (nm *NodeManager) GetHostsInZone(ctx context.Context, zoneFailureDomain string) ([]*object.HostSystem, error) {
488	klog.V(9).Infof("GetHostsInZone called with registeredNodes: %v", nm.registeredNodes)
489	nodeDetails, err := nm.GetNodeDetails()
490	if err != nil {
491		return nil, err
492	}
493	klog.V(4).Infof("Node Details: %v", nodeDetails)
494	// Build a map of Host moRef to HostSystem
495	hostMap := make(map[string]*object.HostSystem)
496	for _, n := range nodeDetails {
497		// Match the provided zone failure domain with the node.
498		klog.V(9).Infof("Matching provided zone %s with node %s zone %s", zoneFailureDomain, n.NodeName, n.Zone.FailureDomain)
499		if zoneFailureDomain == n.Zone.FailureDomain {
500			host, err := n.vm.HostSystem(ctx)
501			if err != nil {
502				klog.Errorf("Failed to get host system for VM %s. err: %+v", n.vm, err)
503				continue
504			}
505			hostMap[host.Reference().Value] = host
506		}
507	}
508	// Build the unique list of hosts.
509	hosts := make([]*object.HostSystem, 0)
510	for _, value := range hostMap {
511		hosts = append(hosts, value)
512	}
513	klog.V(4).Infof("GetHostsInZone %v returning: %v", zoneFailureDomain, hosts)
514	return hosts, nil
515}
516