diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 944ee2c123..24417258dc 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -10036,6 +10036,8 @@ class ComputeManager(manager.Manager): use_slave=True, startup=startup) + self.rt.clean_compute_node_cache(compute_nodes_in_db) + # Delete orphan compute node not reported by driver but still in db for cn in compute_nodes_in_db: if cn.hypervisor_hostname not in nodenames: diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index c3b74546c7..bf1dd580b0 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -1945,3 +1945,20 @@ class ResourceTracker(object): if migration: migration.status = 'done' migration.save() + + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) + def clean_compute_node_cache(self, compute_nodes_in_db): + """Clean the compute node cache of any nodes that no longer exist. + + :param compute_nodes_in_db: list of ComputeNode objects from the DB. + """ + compute_nodes_in_db_nodenames = {cn.hypervisor_hostname + for cn in compute_nodes_in_db} + stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames + + for stale_cn in stale_cns: + # NOTE(mgoddard): we have found a node in the cache that has no + # compute node in the DB. This could be due to a node rebalance + # where another compute service took ownership of the node. Clean + # up the cache. + self.remove_node(stale_cn) diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py index ff6369b710..16b8209a85 100644 --- a/nova/tests/functional/regressions/test_bug_1853009.py +++ b/nova/tests/functional/regressions/test_bug_1853009.py @@ -82,9 +82,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # Now run the update_available_resource periodic to register fake-node # and have it managed by host_b. This will also detect the "host_b" # node as orphaned and delete it along with its resource provider. - cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename( - self.ctxt, 'host_b', 'host_b', - ) # host_b[1]: Finds no compute record in RT. Tries to create one # (_init_compute_node). @@ -93,10 +90,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( # update for this node. See # https://bugs.launchpad.net/nova/+bug/1853159. host_b.manager.update_available_resource(self.ctxt) - self.assertIn( - 'Deleting orphan compute node %s hypervisor host ' - 'is host_b, nodes are' % cn_host_b_node.id, - self.stdlog.logger.output) self._assert_hypervisor_api(self.nodename, expected_host='host_b') # There should only be one resource provider (fake-node). original_rps = self._get_all_providers() @@ -160,21 +153,17 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertEqual(0, len(rps), rps) # host_b[3]: Should recreate compute node and resource provider. - # FIXME(mgoddard): Compute node not recreated here, because it is - # already in RT.compute_nodes. See - # https://bugs.launchpad.net/nova/+bug/1853009. # FIXME(mgoddard): Resource provider not recreated here, because it # exists in the provider tree. See # https://bugs.launchpad.net/nova/+bug/1841481. host_b.manager.update_available_resource(self.ctxt) - # Verify that the node was not recreated. - hypervisors = self.api.api_get( - '/os-hypervisors/detail').body['hypervisors'] - self.assertEqual(0, len(hypervisors), hypervisors) + # Verify that the node was recreated. + self._assert_hypervisor_api(self.nodename, 'host_b') - # But the compute node exists in the RT. - self.assertIn(self.nodename, host_b.manager.rt.compute_nodes) + # But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute + # node is not cached in the RT. + self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes) # There is no RP. rps = self._get_all_providers() @@ -184,6 +173,27 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase( self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( self.nodename)) + # host_b[1]: Should add compute node to RT cache and recreate resource + # provider. + # FIXME(mgoddard): Resource provider not recreated here, because it + # exists in the provider tree. See + # https://bugs.launchpad.net/nova/+bug/1841481. + host_b.manager.update_available_resource(self.ctxt) + + # Verify that the node still exists. + self._assert_hypervisor_api(self.nodename, 'host_b') + + # And it is now in the RT cache. + self.assertIn(self.nodename, host_b.manager.rt.compute_nodes) + + # There is still no RP. + rps = self._get_all_providers() + self.assertEqual(0, len(rps), rps) + + # But the RP it exists in the provider tree. + self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists( + self.nodename)) + # This fails due to the lack of a resource provider. self.assertIn( 'Skipping removal of allocations for deleted instances', diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 255c8664d0..f0a69a4fde 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -373,18 +373,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, ) # First node in set should have been removed from DB + # Last node in set should have been added to DB. for db_node in db_nodes: if db_node.hypervisor_hostname == 'node1': db_node.destroy.assert_called_once_with() rc_mock.delete_resource_provider.assert_called_once_with( self.context, db_node, cascade=True) - mock_rt.remove_node.assert_called_once_with( - 'node1') + mock_rt.remove_node.assert_called_once_with('node1') mock_log.error.assert_called_once_with( "Failed to delete compute node resource provider for " "compute node %s: %s", db_node.uuid, mock.ANY) else: self.assertFalse(db_node.destroy.called) + self.assertEqual(1, mock_rt.remove_node.call_count) + mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'delete_resource_provider') diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 947e281b98..147a02bc90 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -4177,3 +4177,20 @@ class ProviderConfigTestCases(BaseTestCase): mock_log.warning.assert_called_once_with(*expected_log_call) self.assertIn(uuids.unknown, self.rt.absent_providers) self.assertEqual(result, []) + + +class TestCleanComputeNodeCache(BaseTestCase): + + def setUp(self): + super(TestCleanComputeNodeCache, self).setUp() + self._setup_rt() + self.context = context.RequestContext( + mock.sentinel.user_id, mock.sentinel.project_id) + + @mock.patch.object(resource_tracker.ResourceTracker, "remove_node") + def test_clean_compute_node_cache(self, mock_remove): + invalid_nodename = "invalid-node" + self.rt.compute_nodes[_NODENAME] = self.compute + self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute + self.rt.clean_compute_node_cache([self.compute]) + mock_remove.assert_called_once_with(invalid_nodename)