From 2bb4527228c8e6fa4a1fa6cfbe80e8790e4e0789 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Tue, 19 Nov 2019 16:51:01 +0000 Subject: [PATCH] Invalidate provider tree when compute node disappears There is a race condition in nova-compute with the ironic virt driver as nodes get rebalanced. It can lead to compute nodes being removed in the DB and not repopulated. Ultimately this prevents these nodes from being scheduled to. The issue being addressed here is that if a compute node is deleted by a host which thinks it is an orphan, then the resource provider for that node might also be deleted. The compute host that owns the node might not recreate the resource provider if it exists in the provider tree cache. This change fixes the issue by clearing resource providers from the provider tree cache for which a compute node entry does not exist. Then, when the available resource for the node is updated, the resource providers are not found in the cache and get recreated in placement. Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e Related-Bug: #1853009 Related-Bug: #1841481 --- nova/compute/resource_tracker.py | 1 + nova/scheduler/client/report.py | 17 ++++++++++++----- .../regressions/test_bug_1853009.py | 19 ++++++------------- .../unit/compute/test_resource_tracker.py | 8 ++++++-- 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index bf1dd580b0..fba1526967 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1962,3 +1962,4 @@ class ResourceTracker(object): # where another compute service took ownership of the node. Clean # up the cache. self.remove_node(stale_cn) + self.reportclient.invalidate_resource_provider(stale_cn) diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py index d8f70dc498..257aaf07ca 100644 --- a/nova/scheduler/client/report.py +++ b/nova/scheduler/client/report.py @@ -677,11 +677,7 @@ class SchedulerReportClient(object): if resp: LOG.info("Deleted resource provider %s", rp_uuid) # clean the caches - try: - self._provider_tree.remove(rp_uuid) - except ValueError: - pass - self._association_refresh_time.pop(rp_uuid, None) + self.invalidate_resource_provider(rp_uuid) return msg = ("[%(placement_req_id)s] Failed to delete resource provider " @@ -2266,6 +2262,17 @@ class SchedulerReportClient(object): # left a no-op for backward compatibility. pass + def invalidate_resource_provider(self, name_or_uuid): + """Invalidate the cache for a resource provider. + + :param name_or_uuid: Name or UUID of the resource provider to look up. + """ + try: + self._provider_tree.remove(name_or_uuid) + except ValueError: + pass + self._association_refresh_time.pop(name_or_uuid, None) + def get_provider_by_name(self, context, name): """Queries the placement API for resource provider information matching a supplied name. diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py index 16b8209a85..dddc79606f 100644 --- a/nova/tests/functional/regressions/test_bug_1853009.py +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -153,9 +153,8 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertEqual(0, len(rps), rps) # host_b[3]: Should recreate compute node and resource provider. - # FIXME(mgoddard): Resource provider not recreated here, because it - # exists in the provider tree. See - # https://bugs.launchpad.net/nova/+bug/1841481. + # FIXME(mgoddard): Resource provider not recreated here, due to + # https://bugs.launchpad.net/nova/+bug/1853159. host_b.manager.update_available_resource(self.ctxt) # Verify that the node was recreated. @@ -170,14 +169,11 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertEqual(0, len(rps), rps) # But the RP exists in the provider tree. - self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( + self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists( self.nodename)) # host_b[1]: Should add compute node to RT cache and recreate resource # provider. - # FIXME(mgoddard): Resource provider not recreated here, because it - # exists in the provider tree. See - # https://bugs.launchpad.net/nova/+bug/1841481. host_b.manager.update_available_resource(self.ctxt) # Verify that the node still exists. @@ -186,13 +182,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # And it is now in the RT cache. self.assertIn(self.nodename, host_b.manager.rt.compute_nodes) - # There is still no RP. + # The resource provider has now been created. rps = self._get_all_providers() - self.assertEqual(0, len(rps), rps) - - # But the RP it exists in the provider tree. - self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( - self.nodename)) + self.assertEqual(1, len(rps), rps) + self.assertEqual(self.nodename, rps[0]['name']) # This fails due to the lack of a resource provider. self.assertIn( diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 147a02bc90..09d3bc7d18 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -4192,5 +4192,9 @@ class TestCleanComputeNodeCache(BaseTestCase): invalid_nodename = "invalid-node" self.rt.compute_nodes[_NODENAME] = self.compute self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute - self.rt.clean_compute_node_cache([self.compute]) - mock_remove.assert_called_once_with(invalid_nodename) + with mock.patch.object( + self.rt.reportclient, "invalidate_resource_provider", + ) as mock_invalidate: + self.rt.clean_compute_node_cache([self.compute]) + mock_remove.assert_called_once_with(invalid_nodename) + mock_invalidate.assert_called_once_with(invalid_nodename)