Merge "hypervisors: Optimize uptime retrieval for better performance"
@@ -3902,8 +3902,11 @@ hypervisor_type_body:
  type: string
hypervisor_uptime:
  description: |
    The total uptime of the hypervisor and information about average load. Only
    reported for active hosts where the virt driver supports this feature.
    The response format of this API depends on the virt driver in use on a
    given host. The libvirt driver returns the output of the ``uptime`` command
    directly, the z/VM driver returns the IPL time. All other drivers
    always return ``null``. Note this value is cached and updated periodically.
  in: body
  required: true
  type: string
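For illustration, at microversion 2.88 or later a single hypervisor entry in the detail response might carry an uptime value like the following (hypothetical host name and timestamps; the exact string is whatever the driver reports):

    {
        "hypervisor_hostname": "compute1.example.com",
        "state": "up",
        "status": "enabled",
        "uptime": " 08:32:11 up 93 days, 18:25,  2 users,  load average: 0.20, 0.12, 0.14"
    }

A z/VM host instead reports an IPL timestamp such as "IPL at 11/14/17 10:47:44 EST", and drivers without uptime support report null.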
@@ -96,18 +96,24 @@ class HypervisorsController(wsgi.Controller):
 
         # The 2.88 microversion also *added* the 'uptime' field to the response
         if detail and api_version_request.is_supported(req, '2.88'):
-            try:
-                hyp_dict['uptime'] = self.host_api.get_host_uptime(
-                    req.environ['nova.context'], hypervisor.host)
-            except (
-                NotImplementedError,
-                exception.ComputeServiceUnavailable,
-                exception.HostMappingNotFound,
-                exception.HostNotFound,
-            ):
-                # Not all virt drivers support this, and it's not generally
-                # possible to get uptime for a down host
-                hyp_dict['uptime'] = None
+            uptime = None
+            if "stats" in hypervisor and "uptime" in hypervisor.stats:
+                uptime = hypervisor.stats.get("uptime")
+            else:
+                try:
+                    uptime = self.host_api.get_host_uptime(
+                        req.environ['nova.context'], hypervisor.host)
+                except (
+                    NotImplementedError,  # only raised in tests
+                    exception.ComputeServiceUnavailable,
+                    exception.HostMappingNotFound,
+                    exception.HostNotFound,
+                ):
+                    # Only libvirt and ZVM drivers support this, and it's
+                    # not generally possible to get uptime for a down host
+                    pass
+
+            hyp_dict['uptime'] = uptime
 
         if servers:
             hyp_dict['servers'] = [
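The lookup order above reads as: prefer the uptime cached in the compute node's stats, fall back to a synchronous RPC only when the node has not reported uptime at all, and leave the value null when the RPC path is unavailable. A minimal standalone sketch of that decision (the function name and callables are illustrative, not part of the change):

    def resolve_uptime(stats, rpc_get_uptime):
        """Illustrative mirror of the controller's lookup order."""
        if stats and "uptime" in stats:
            # Covers both a real cached value and the None sentinel written
            # by the Stats helper for drivers without uptime support; in
            # either case no RPC call is made.
            return stats.get("uptime")
        try:
            return rpc_get_uptime()
        except NotImplementedError:
            return None

    # Older compute node that has not yet reported uptime: falls back to RPC.
    assert resolve_uptime({}, lambda: "up 1 day") == "up 1 day"
    # Upgraded node with cached uptime: the RPC callable is never invoked.
    assert resolve_uptime({"uptime": "up 2 days"}, lambda: 1 / 0) == "up 2 days"
    # Driver without uptime support: the None sentinel also skips the RPC.
    assert resolve_uptime({"uptime": None}, lambda: 1 / 0) is None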
@@ -1173,7 +1173,8 @@ class ResourceTracker(object):
                   "used_disk=%(used_disk)sGB "
                   "total_vcpus=%(total_vcpus)s "
                   "used_vcpus=%(used_vcpus)s "
-                  "pci_stats=%(pci_stats)s",
+                  "pci_stats=%(pci_stats)s "
+                  "stats=%(stats)s",
                   {'node': nodename,
                    'phys_ram': cn.memory_mb,
                    'used_ram': cn.memory_mb_used,
@@ -1181,7 +1182,9 @@ class ResourceTracker(object):
                    'used_disk': cn.local_gb_used,
                    'total_vcpus': tcpu,
                    'used_vcpus': ucpu,
-                   'pci_stats': pci_stats})
+                   'pci_stats': pci_stats,
+                   'stats': cn.stats or {}
+                   })
 
     def _resource_change(self, compute_node):
         """Check to see if any resources have changed."""
@@ -37,6 +37,12 @@ class Stats(dict):
         if stats is None:
             return
         if isinstance(stats, dict):
+            # Use None as a sentinel to tell the API that the driver
+            # does not support uptime. setdefault updates the dict if
+            # and only if 'uptime' is not already set and then returns
+            # the value; since we don't need it here, we simply discard
+            # the result.
+            stats.setdefault('uptime', None)
             self.update(stats)
             return
         raise ValueError(_('Unexpected type adding stats'))
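The setdefault call relies on standard dict behaviour, shown here with plain dicts (values are made up):

    stats = {'uptime': ' 10:01:37 up 3 days'}
    stats.setdefault('uptime', None)   # existing value is left untouched
    assert stats['uptime'].startswith(' 10:01:37')

    stats = {'failed_builds': 0}       # driver did not report uptime
    stats.setdefault('uptime', None)   # key is created with the None sentinel
    assert stats['uptime'] is None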
@@ -47,7 +47,7 @@ TEST_HYPERS = [
          vcpus_used=2,
          memory_mb_used=5 * 1024,
          local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
          hypervisor_version=3,
          hypervisor_hostname="hyper1",
          free_ram_mb=5 * 1024,
@@ -67,7 +67,7 @@ TEST_HYPERS = [
          vcpus_used=2,
          memory_mb_used=5 * 1024,
          local_gb_used=125,
-         hypervisor_type="xen",
+         hypervisor_type="qemu",
          hypervisor_version=3,
          hypervisor_hostname="hyper2",
          free_ram_mb=5 * 1024,
@@ -76,7 +76,8 @@ TEST_HYPERS = [
          running_vms=2,
          cpu_info=CPU_INFO,
          disk_available_least=100,
-         host_ip=netaddr.IPAddress('2.2.2.2'))]
+         host_ip=netaddr.IPAddress('2.2.2.2'),
+         stats={'uptime': 'fake uptime'})]
 
 
 TEST_SERVICES = [
@@ -203,6 +204,11 @@ class HypervisorsTestV21(test.NoDBTestCase):
     del DETAIL_HYPERS_DICTS[1]['host']
     del DETAIL_HYPERS_DICTS[0]['uuid']
     del DETAIL_HYPERS_DICTS[1]['uuid']
+    # Remove stats since it's not exposed in the API response, but preserve
+    # uptime for v2.88+ tests which expect it
+    for hyper_dict in DETAIL_HYPERS_DICTS:
+        if 'stats' in hyper_dict:
+            del hyper_dict['stats']
     DETAIL_HYPERS_DICTS[0].update({'state': 'up',
                                    'status': 'enabled',
                                    'service': dict(id=1, host='compute1',
@@ -850,7 +856,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -904,7 +910,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -951,7 +957,7 @@ class HypervisorsTestV233(HypervisorsTestV228):
             'free_ram_mb': 5120,
             'host_ip': netaddr.IPAddress('2.2.2.2'),
             'hypervisor_hostname': 'hyper2',
-            'hypervisor_type': 'xen',
+            'hypervisor_type': 'qemu',
             'hypervisor_version': 3,
             'id': 2,
             'local_gb': 250,
@@ -1448,6 +1454,21 @@ class HypervisorsTestV288(HypervisorsTestV275):
         # cpu_info is no longer included in the response, so skip this test
         pass
 
+    def test_show_with_uptime_provided_by_compute_node(self):
+        req = self._get_request(use_admin_context=True)
+        result = self.controller.show(req, self.TEST_HYPERS_OBJ[1].uuid)
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[1])
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.controller.host_api.get_host_uptime.assert_not_called()
+
+    def test_detail_list_uptime(self):
+        _ = self._test_servers_with_no_servers(self.controller.detail)
+        # We have simulated that compute2 is upgraded to store the uptime
+        # in its stats, so we expect a single RPC call to fetch the result
+        # for compute1.
+        self.controller.host_api.get_host_uptime.assert_called_with(
+            mock.ANY, "compute1")
+
     def test_uptime(self):
         req = self._get_request(True)
         self.assertRaises(
@@ -23043,18 +23043,18 @@ class HostStateTestCase(test.NoDBTestCase):
 
         drvr = HostStateTestCase.FakeConnection()
 
-        stats = drvr.get_available_resource("compute1")
-        self.assertEqual(stats["vcpus"], 1)
-        self.assertEqual(stats["memory_mb"], 497)
-        self.assertEqual(stats["local_gb"], 100)
-        self.assertEqual(stats["vcpus_used"], 0)
-        self.assertEqual(stats["memory_mb_used"], 88)
-        self.assertEqual(stats["local_gb_used"], 20)
-        self.assertEqual(stats["hypervisor_type"], 'QEMU')
-        self.assertEqual(stats["hypervisor_version"],
+        res = drvr.get_available_resource("compute1")
+        self.assertEqual(res["vcpus"], 1)
+        self.assertEqual(res["memory_mb"], 497)
+        self.assertEqual(res["local_gb"], 100)
+        self.assertEqual(res["vcpus_used"], 0)
+        self.assertEqual(res["memory_mb_used"], 88)
+        self.assertEqual(res["local_gb_used"], 20)
+        self.assertEqual(res["hypervisor_type"], 'QEMU')
+        self.assertEqual(res["hypervisor_version"],
                          fakelibvirt.FAKE_QEMU_VERSION)
-        self.assertEqual(stats["hypervisor_hostname"], 'compute1')
-        cpu_info = jsonutils.loads(stats["cpu_info"])
+        self.assertEqual(res["hypervisor_hostname"], 'compute1')
+        cpu_info = jsonutils.loads(res["cpu_info"])
         self.assertEqual(cpu_info,
                          {"vendor": "Intel", "model": "pentium",
                           "arch": fields.Architecture.I686,
@@ -23064,12 +23064,13 @@ class HostStateTestCase(test.NoDBTestCase):
                           "topology": {"cores": "1", "threads": "1", "sockets": "1"},
                           "maxphysaddr": {"mode": "emulate", "bits": "42"}
                           })
-        self.assertEqual(stats["disk_available_least"], 80)
-        self.assertEqual(jsonutils.loads(stats["pci_passthrough_devices"]),
+        self.assertEqual(res["disk_available_least"], 80)
+        self.assertEqual(jsonutils.loads(res["pci_passthrough_devices"]),
                          HostStateTestCase.pci_devices)
         self.assertEqual(objects.NUMATopology.obj_from_db_obj(
-            stats['numa_topology']),
+            res['numa_topology']),
             HostStateTestCase.numa_topology)
+        self.assertEqual(res['stats']['uptime'], drvr.get_host_uptime())
 
 
 class TestUpdateProviderTree(test.NoDBTestCase):
@@ -128,8 +128,11 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertRaises(exception.ZVMDriverException,
                           zvmdriver.ZVMDriver, 'virtapi')
 
+    @mock.patch(
+        'nova.virt.zvm.driver.ZVMDriver.get_host_uptime',
+        return_value='IPL at 11/14/17 10:47:44 EST')
     @mock.patch('nova.virt.zvm.utils.ConnectorClient.call')
-    def test_get_available_resource_err_case(self, call):
+    def test_get_available_resource_err_case(self, call, uptime_mock):
         res = {'overallRC': 1, 'errmsg': 'err', 'rc': 0, 'rs': 0}
         call.side_effect = exception.ZVMConnectorError(results=res)
         results = self._driver.get_available_resource()
@@ -138,6 +141,8 @@ class TestZVMDriver(test.NoDBTestCase):
         self.assertEqual(0, results['disk_available_least'])
         self.assertEqual(0, results['hypervisor_version'])
         self.assertEqual('TESTHOST', results['hypervisor_hostname'])
+        self.assertEqual(uptime_mock.return_value, results['stats']['uptime'])
+        uptime_mock.assert_called_once()
 
     def test_driver_template_validation(self):
         self.flags(instance_name_template='abc%6d')
@@ -10379,6 +10379,7 @@ class LibvirtDriver(driver.ComputeDriver):
         else:
             data['numa_topology'] = None
 
+        data['stats'] = {'uptime': self.get_host_uptime()}
         return data
 
     def check_instance_shared_storage_local(self, context, instance):
@@ -132,6 +132,7 @@ class ZVMDriver(driver.ComputeDriver):
                                      obj_fields.HVType.ZVM,
                                      obj_fields.VMMode.HVM)],
             'numa_topology': None,
+            'stats': {'uptime': self.get_host_uptime()}
         }
 
         LOG.debug("Getting available resource for %(host)s:%(nodename)s",
@@ -0,0 +1,23 @@
---
fixes:
  - |
    Fixed a performance issue with the ``/os-hypervisors/detail`` API endpoint
    when using microversion 2.88 or higher. The API was making sequential RPC
    calls to each compute node to gather uptime information, causing
    significant delays in environments with many compute nodes (LP#2122036).

    The fix optimizes uptime retrieval by:

    * Adding uptime information to the periodic resource updates sent by
      nova-compute to the database, eliminating the need for synchronous RPC
      calls during API requests.
    * Only attempting RPC-based uptime retrieval for hypervisor types that
      actually support it (libvirt and z/VM), avoiding unnecessary calls to
      other hypervisor types that would always raise ``NotImplementedError``.
    * Preferring cached uptime data from the database over RPC calls when
      available. The cached value is refreshed at the cadence set by
      ``[DEFAULT] update_resources_interval``, which is the same interval at
      which the other hypervisor stats are updated.

    This change significantly reduces response times for the hypervisor detail
    API in large deployments while maintaining backward compatibility.
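As an operational note, the refresh cadence referenced above is the existing nova-compute option; a hypothetical override in nova.conf (example value only, the default is not changed by this fix) would look like:

    [DEFAULT]
    # Refresh the compute node resource view, including the cached uptime,
    # every 60 seconds on each nova-compute host.
    update_resources_interval = 60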