Merge "hypervisors: Optimize uptime retrieval for better performance"
@@ -3902,8 +3902,11 @@ hypervisor_type_body:
  type: string
hypervisor_uptime:
  description: |
    The total uptime of the hypervisor and information about average load. Only
    reported for active hosts where the virt driver supports this feature.
    The response format of this API depends on the virt driver in use on a
    given host. The libvirt driver returns the output of the ``uptime`` command
    directly, the z/VM driver returns the IPL time. All other drivers
    always return ``null``. Note this value is cached and updated periodically.
  in: body
  required: true
  type: string
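For illustration, at microversion 2.88 or later a single hypervisor entry in the detail response might carry an uptime value like the following (hypothetical host name and timestamps; the exact string is whatever the driver reports):

    {
        "hypervisor_hostname": "compute1.example.com",
        "state": "up",
        "status": "enabled",
        "uptime": " 08:32:11 up 93 days, 18:25,  2 users,  load average: 0.20, 0.12, 0.14"
    }

A z/VM host instead reports an IPL timestamp such as "IPL at 11/14/17 10:47:44 EST", and drivers without uptime support report null.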
@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):
 
         # The 2.88 microversion also *added* the 'uptime' field to the response
         if detail and api_version_request.is_supported(req, '2.88'):
-            try:
-                hyp_dict['uptime'] = self.host_api.get_host_uptime(
-                    req.environ['nova.context'], hypervisor.host)
-            except (
-                NotImplementedError,
-                exception.ComputeServiceUnavailable,
-                exception.HostMappingNotFound,
-                exception.HostNotFound,
-            ):
-                # Not all virt drivers support this, and it's not generally
-                # possible to get uptime for a down host
-                hyp_dict['uptime'] = None
+            uptime = None
+            if "stats" in hypervisor and "uptime" in hypervisor.stats:
+                uptime = hypervisor.stats.get("uptime")
+            else:
+                try:
+                    uptime = self.host_api.get_host_uptime(
+                        req.environ['nova.context'], hypervisor.host)
+                except (
+                    NotImplementedError,  # only raised in tests
+                    exception.ComputeServiceUnavailable,
+                    exception.HostMappingNotFound,
+                    exception.HostNotFound,
+                ):
+                    # Only libvirt and ZVM drivers support this, and it's
+                    # not generally possible to get uptime for a down host
+                    pass
+
+            hyp_dict['uptime'] = uptime
 
         if servers:
             hyp_dict['servers'] = [
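The lookup order above reads as: prefer the uptime cached in the compute node's stats, fall back to a synchronous RPC only when the node has not reported uptime at all, and leave the value null when the RPC path is unavailable. A minimal standalone sketch of that decision (the function name and callables are illustrative, not part of the change):

    def resolve_uptime(stats, rpc_get_uptime):
        """Illustrative mirror of the controller's lookup order."""
        if stats and "uptime" in stats:
            # Covers both a real cached value and the None sentinel written
            # by the Stats helper for drivers without uptime support; in
            # either case no RPC call is made.
            return stats.get("uptime")
        try:
            return rpc_get_uptime()
        except NotImplementedError:
            return None

    # Older compute node that has not yet reported uptime: falls back to RPC.
    assert resolve_uptime({}, lambda: "up 1 day") == "up 1 day"
    # Upgraded node with cached uptime: the RPC callable is never invoked.
    assert resolve_uptime({"uptime": "up 2 days"}, lambda: 1 / 0) == "up 2 days"
    # Driver without uptime support: the None sentinel also skips the RPC.
    assert resolve_uptime({"uptime": None}, lambda: 1 / 0) is None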
@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
                   "used_disk=%(used_disk)sGB "
                   "total_vcpus=%(total_vcpus)s "
                   "used_vcpus=%(used_vcpus)s "
-                  "pci_stats=%(pci_stats)s",
+                  "pci_stats=%(pci_stats)s "
+                  "stats=%(stats)s",
                   {'node': nodename,
                    'phys_ram': cn.memory_mb,
                    'used_ram': cn.memory_mb_used,
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
                    'used_disk': cn.local_gb_used,
                    'total_vcpus': tcpu,
                    'used_vcpus': ucpu,
-                   'pci_stats': pci_stats})
+                   'pci_stats': pci_stats,
+                   'stats': cn.stats or {}
+                   })
 
     def _resource_change(self, compute_node):
         """Check to see if any resources have changed."""
@@ -37,6 +37,12 @@ class Stats(dict):
         if stats is None:
             return
         if isinstance(stats, dict):
+            # Use None as a sentinel to tell the API that the driver
+            # does not support uptime. setdefault updates the dict if
+            # and only if 'uptime' is not already set and then returns
+            # the value; since we don't need it here, we simply discard
+            # the result.
+            stats.setdefault('uptime', None)
             self.update(stats)
             return
         raise ValueError(_('Unexpected type adding stats'))
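The setdefault call relies on standard dict behaviour, shown here with plain dicts (values are made up):

    stats = {'uptime': ' 10:01:37 up 3 days'}
    stats.setdefault('uptime', None)   # existing value is left untouched
    assert stats['uptime'].startswith(' 10:01:37')

    stats = {'failed_builds': 0}       # driver did not report uptime
    stats.setdefault('uptime', None)   # key is created with the None sentinel
    assert stats['uptime'] is None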
@@ -47,7 +47,7 @@ TEST_HYPERS = [
          vcpus_used=2,
          memory_mb_used=5 * 1024,
          local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
          hypervisor_version=3,
          hypervisor_hostname="hyper1",
          free_ram_mb=5 * 1024,
@@ -67,7 +67,7 @@ TEST_HYPERS = [
          vcpus_used=2,
          memory_mb_used=5 * 1024,
          local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
          hypervisor_version=3,
          hypervisor_hostname="hyper2",
          free_ram_mb=5 * 1024,
@@ -76,7 +76,8 @@ TEST_HYPERS = [
          running_vms=2,
          cpu_info=CPU_INFO,
          disk_available_least=100,
-         host_ip=netaddr.IPAddress('2.2.2.2'))]
+         host_ip=netaddr.IPAddress('2.2.2.2'),
+         stats={'uptime': 'fake uptime'})]
 
 
 TEST_SERVICES = [
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
     del DETAIL_HYPERS_DICTS[1]['host']
     del DETAIL_HYPERS_DICTS[0]['uuid']
     del DETAIL_HYPERS_DICTS[1]['uuid']
+    # Remove stats since it's not exposed in the API response, but preserve
+    # uptime for v2.88+ tests which expect it
+    for hyper_dict in DETAIL_HYPERS_DICTS:
+        if 'stats' in hyper_dict:
+            del hyper_dict['stats']
     DETAIL_HYPERS_DICTS[0].update({'state': 'up',
                                    'status': 'enabled',
                                    'service': dict(id=1, host='compute1',
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
         # cpu_info is no longer included in the response, so skip this test
         pass
 
+    def test_show_with_uptime_provided_by_compute_node(self):
+        req = self._get_request(use_admin_context=True)
+        result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.controller.host_api.get_host_uptime.assert_not_called()
+
+    def test_detail_list_uptime(self):
+        _ = self._test_servers_with_no_servers(self.controller.detail)
+        # We have simulated that compute2 is upgraded to store the uptime
+        # in its stats, so we expect a single RPC call to fetch the result
+        # for compute1.
+        self.controller.host_api.get_host_uptime.assert_called_with(
+            mock.ANY, "compute1")
+
     def test_uptime(self):
         req = self._get_request(True)
         self.assertRaises(
@@ -23043,18 +23043,18 @@ class HostStateTestCase(test.NoDBTestCase):
 
         drvr = HostStateTestCase.FakeConnection()
 
-        stats = drvr.get_available_resource("compute1")
-        self.assertEqual(stats["vcpus"], 1)
-        self.assertEqual(stats["memory_mb"], 497)
-        self.assertEqual(stats["local_gb"], 100)
-        self.assertEqual(stats["vcpus_used"], 0)
-        self.assertEqual(stats["memory_mb_used"], 88)
-        self.assertEqual(stats["local_gb_used"], 20)
-        self.assertEqual(stats["hypervisor_type"], 'QEMU')
-        self.assertEqual(stats["hypervisor_version"],
+        res = drvr.get_available_resource("compute1")
+        self.assertEqual(res["vcpus"], 1)
+        self.assertEqual(res["memory_mb"], 497)
+        self.assertEqual(res["local_gb"], 100)
+        self.assertEqual(res["vcpus_used"], 0)
+        self.assertEqual(res["memory_mb_used"], 88)
+        self.assertEqual(res["local_gb_used"], 20)
+        self.assertEqual(res["hypervisor_type"], 'QEMU')
+        self.assertEqual(res["hypervisor_version"],
                          fakelibvirt.FAKE_QEMU_VERSION)
-        self.assertEqual(stats["hypervisor_hostname"], 'compute1')
-        cpu_info = jsonutils.loads(stats["cpu_info"])
+        self.assertEqual(res["hypervisor_hostname"], 'compute1')
+        cpu_info = jsonutils.loads(res["cpu_info"])
         self.assertEqual(cpu_info,
                          {"vendor": "Intel", "model": "pentium",
                           "arch": fields.Architecture.I686,
@@ -23064,12 +23064,13 @@ class HostStateTestCase(test.NoDBTestCase):
                           "topology": {"cores": "1", "threads": "1", "sockets": "1"},
                           "maxphysaddr": {"mode": "emulate", "bits": "42"}
                           })
-        self.assertEqual(stats["disk_available_least"], 80)
-        self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
+        self.assertEqual(res["disk_available_least"], 80)
+        self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
                          HostStateTestCase.pci_devices)
         self.assertEqual(objects.NUMATopology.obj_from_db_obj(
-            stats['numa_topology']),
+            res['numa_topology']),
             HostStateTestCase.numa_topology)
+        self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())
 
 
 class TestUpdateProviderTree(test.NoDBTestCase):
@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertRaises(exception.ZVMDriverException,
                           zvmdriver.ZVMDriver, 'virtapi')
 
+    @mock.patch(
+        'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
+        return_value='IPL at 11/14/17 10:47:44 EST')
     @mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
-    def test_get_available_resource_err_case(self, call):
+    def test_get_available_resource_err_case(self, call, uptime_mock):
         res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
         call.side_effect = exception.ZVMConnectorError(results=res)
         results = self._driver.get_available_resource()
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertEqual(0, results['disk_available_least'])
         self.assertEqual(0, results['hypervisor_version'])
         self.assertEqual('TESTHOST', results['hypervisor_hostname'])
+        self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
+        uptime_mock.assert_called_once()
 
     def test_driver_template_validation(self):
         self.flags(instance_name_template='abc%6d')
@@ -10379,6 +10379,7 @@ class LibvirtDriver(driver.ComputeDriver):
         else:
             data['numa_topology'] = None
 
+        data['stats'] = {'uptime': self.get_host_uptime()}
         return data
 
     def check_instance_shared_storage_local(self, context, instance):
@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
                                      obj_fields.HVType.ZVM,
                                      obj_fields.VMMode.HVM)],
             'numa_topology': None,
+            'stats': {'uptime': self.get_host_uptime()}
         }
 
         LOG.debug("Getting available resource for %(host)s:%(nodename)s",
@@ -0,0 +1,23 @@
---
fixes:
  - |
    Fixed a performance issue with the ``/os-hypervisors/detail`` API endpoint
    when using microversion 2.88 or higher. The API was making sequential RPC
    calls to each compute node to gather uptime information, causing
    significant delays in environments with many compute nodes (LP#2122036).

    The fix optimizes uptime retrieval by:

    * Adding uptime information to the periodic resource updates sent by
      nova-compute to the database, eliminating the need for synchronous RPC
      calls during API requests.
    * Only attempting RPC-based uptime retrieval for hypervisor types that
      actually support it (libvirt and z/VM), avoiding unnecessary calls to
      other hypervisor types that would always raise ``NotImplementedError``.
    * Preferring cached uptime data from the database over RPC calls when
      available. The cached value is refreshed at the cadence set by
      ``[DEFAULT] update_resources_interval``, which is the same interval at
      which the other hypervisor stats are updated.

    This change significantly reduces response times for the hypervisor detail
    API in large deployments while maintaining backward compatibility.
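As an operational note, the refresh cadence referenced above is the existing nova-compute option; a hypothetical override in nova.conf (example value only, the default is not changed by this fix) would look like:

    [DEFAULT]
    # Refresh the compute node resource view, including the cached uptime,
    # every 60 seconds on each nova-compute host.
    update_resources_interval = 60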