From 32676a9f45807ea8770dc7bdff1e859673af1b61 Mon Sep 17 00:00:00 2001 From: Stephen Finucane Date: Wed, 28 Apr 2021 13:53:39 +0100 Subject: [PATCH] Clear rebalanced compute nodes from resource tracker There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the compute host that actually owns the node might not recreate it if the node is already in its resource tracker cache. This change fixes the issue by clearing nodes from the resource tracker cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the compute node object is not found in the cache and gets recreated. Change-Id: I39241223b447fcc671161c370dbf16e1773b684a Partial-Bug: #1853009 --- nova/compute/manager.py | 2 + nova/compute/resource_tracker.py | 17 ++++++++ .../regressions/test_bug_1853009.py | 42 ++++++++++++------- nova/tests/unit/compute/test_compute_mgr.py | 6 ++- .../unit/compute/test_resource_tracker.py | 17 ++++++++ 5 files changed, 66 insertions(+), 18 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 52cf260fe9..fa6e093048 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -9996,6 +9996,8 @@ class ComputeManager(manager.Manager): use_slave=True, startup=startup) + self.rt.clean_compute_node_cache(compute_nodes_in_db) + # Delete orphan compute node not reported by driver but still in db for cn in compute_nodes_in_db: if cn.hypervisor_hostname not in nodenames: diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index c3b74546c7..bf1dd580b0 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1945,3 +1945,20 @@ class ResourceTracker(object): if migration: migration.status = 'done' migration.save() + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def clean_compute_node_cache(self, compute_nodes_in_db): + """Clean the compute node cache of any nodes that no longer exist. + + :param compute_nodes_in_db: list of ComputeNode objects from the DB. + """ + compute_nodes_in_db_nodenames = {cn.hypervisor_hostname + for cn in compute_nodes_in_db} + stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames + + for stale_cn in stale_cns: + # NOTE(mgoddard): we have found a node in the cache that has no + # compute node in the DB. This could be due to a node rebalance + # where another compute service took ownership of the node. Clean + # up the cache. + self.remove_node(stale_cn) diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py index ff6369b710..16b8209a85 100644 --- a/nova/tests/functional/regressions/test_bug_1853009.py +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -82,9 +82,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # Now run the update_available_resource periodic to register fake-node # and have it managed by host_b. This will also detect the "host_b" # node as orphaned and delete it along with its resource provider. - cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename( - self.ctxt, 'host_b', 'host_b', - ) # host_b[1]: Finds no compute record in RT. Tries to create one # (_init_compute_node). @@ -93,10 +90,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # update for this node. See # https://bugs.launchpad.net/nova/+bug/1853159. host_b.manager.update_available_resource(self.ctxt) - self.assertIn( - 'Deleting orphan compute node %s hypervisor host ' - 'is host_b, nodes are' % cn_host_b_node.id, - self.stdlog.logger.output) self._assert_hypervisor_api(self.nodename, expected_host='host_b') # There should only be one resource provider (fake-node). original_rps = self._get_all_providers() @@ -160,21 +153,17 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertEqual(0, len(rps), rps) # host_b[3]: Should recreate compute node and resource provider. - # FIXME(mgoddard): Compute node not recreated here, because it is - # already in RT.compute_nodes. See - # https://bugs.launchpad.net/nova/+bug/1853009. # FIXME(mgoddard): Resource provider not recreated here, because it # exists in the provider tree. See # https://bugs.launchpad.net/nova/+bug/1841481. host_b.manager.update_available_resource(self.ctxt) - # Verify that the node was not recreated. - hypervisors = self.api.api_get( - '/os-hypervisors/detail').body['hypervisors'] - self.assertEqual(0, len(hypervisors), hypervisors) + # Verify that the node was recreated. + self._assert_hypervisor_api(self.nodename, 'host_b') - # But the compute node exists in the RT. - self.assertIn(self.nodename, host_b.manager.rt.compute_nodes) + # But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute + # node is not cached in the RT. + self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes) # There is no RP. rps = self._get_all_providers() @@ -184,6 +173,27 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( self.nodename)) + # host_b[1]: Should add compute node to RT cache and recreate resource + # provider. + # FIXME(mgoddard): Resource provider not recreated here, because it + # exists in the provider tree. See + # https://bugs.launchpad.net/nova/+bug/1841481. + host_b.manager.update_available_resource(self.ctxt) + + # Verify that the node still exists. + self._assert_hypervisor_api(self.nodename, 'host_b') + + # And it is now in the RT cache. + self.assertIn(self.nodename, host_b.manager.rt.compute_nodes) + + # There is still no RP. + rps = self._get_all_providers() + self.assertEqual(0, len(rps), rps) + + # But the RP it exists in the provider tree. + self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( + self.nodename)) + # This fails due to the lack of a resource provider. self.assertIn( 'Skipping removal of allocations for deleted instances', diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 3a8fa207db..282f2ee506 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -373,18 +373,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, ) # First node in set should have been removed from DB + # Last node in set should have been added to DB. for db_node in db_nodes: if db_node.hypervisor_hostname == 'node1': db_node.destroy.assert_called_once_with() rc_mock.delete_resource_provider.assert_called_once_with( self.context, db_node, cascade=True) - mock_rt.remove_node.assert_called_once_with( - 'node1') + mock_rt.remove_node.assert_called_once_with('node1') mock_log.error.assert_called_once_with( "Failed to delete compute node resource provider for " "compute node %s: %s", db_node.uuid, mock.ANY) else: self.assertFalse(db_node.destroy.called) + self.assertEqual(1, mock_rt.remove_node.call_count) + mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'delete_resource_provider') diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 947e281b98..147a02bc90 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -4177,3 +4177,20 @@ class ProviderConfigTestCases(BaseTestCase): mock_log.warning.assert_called_once_with(*expected_log_call) self.assertIn(uuids.unknown, self.rt.absent_providers) self.assertEqual(result, []) + + +class TestCleanComputeNodeCache(BaseTestCase): + + def setUp(self): + super(TestCleanComputeNodeCache, self).setUp() + self._setup_rt() + self.context = context.RequestContext( + mock.sentinel.user_id, mock.sentinel.project_id) + + @mock.patch.object(resource_tracker.ResourceTracker, "remove_node") + def test_clean_compute_node_cache(self, mock_remove): + invalid_nodename = "invalid-node" + self.rt.compute_nodes[_NODENAME] = self.compute + self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute + self.rt.clean_compute_node_cache([self.compute]) + mock_remove.assert_called_once_with(invalid_nodename)