Merge "Clear rebalanced compute nodes from resource tracker"

This commit is contained in:
Zuul
2021-08-20 12:05:16 +00:00
committed by Gerrit Code Review
5 changed files with 66 additions and 18 deletions
+2
View File
@@ -10036,6 +10036,8 @@ class ComputeManager(manager.Manager):
use_slave=True,
startup=startup)
self.rt.clean_compute_node_cache(compute_nodes_in_db)
# Delete orphan compute node not reported by driver but still in db
for cn in compute_nodes_in_db:
if cn.hypervisor_hostname not in nodenames:
+17
View File
@@ -1945,3 +1945,20 @@ class ResourceTracker(object):
if migration:
migration.status = 'done'
migration.save()
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
def clean_compute_node_cache(self, compute_nodes_in_db):
    """Drop cached compute nodes that have no matching record in the DB.

    Every key of ``self.compute_nodes`` that is not the
    ``hypervisor_hostname`` of one of the supplied nodes is passed to
    ``self.remove_node``. The call runs under the
    ``COMPUTE_RESOURCE_SEMAPHORE`` lock (requested with ``fair=True``).

    :param compute_nodes_in_db: iterable of ComputeNode objects loaded
        from the DB; only ``hypervisor_hostname`` is read from each.
    """
    db_nodenames = set()
    for db_node in compute_nodes_in_db:
        db_nodenames.add(db_node.hypervisor_hostname)

    # Snapshot the cached names into a set first, so that remove_node is
    # free to mutate self.compute_nodes while we walk the stale entries.
    cached_nodenames = set(self.compute_nodes)
    orphaned_nodenames = cached_nodenames.difference(db_nodenames)

    for nodename in orphaned_nodenames:
        # NOTE(mgoddard): a cached node without a compute node record in
        # the DB may be the result of a node rebalance in which another
        # compute service took ownership of the node, so evict it from
        # the cache.
        self.remove_node(nodename)
@@ -82,9 +82,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
# Now run the update_available_resource periodic to register fake-node
# and have it managed by host_b. This will also detect the "host_b"
# node as orphaned and delete it along with its resource provider.
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_b', 'host_b',
)
# host_b[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
@@ -93,10 +90,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
# update for this node. See
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
self.assertIn(
'Deleting orphan compute node %s hypervisor host '
'is host_b, nodes are' % cn_host_b_node.id,
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# There should only be one resource provider (fake-node).
original_rps = self._get_all_providers()
@@ -160,21 +153,17 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.assertEqual(0, len(rps), rps)
# host_b[3]: Should recreate compute node and resource provider.
# FIXME(mgoddard): Compute node not recreated here, because it is
# already in RT.compute_nodes. See
# https://bugs.launchpad.net/nova/+bug/1853009.
# FIXME(mgoddard): Resource provider not recreated here, because it
# exists in the provider tree. See
# https://bugs.launchpad.net/nova/+bug/1841481.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node was not recreated.
hypervisors = self.api.api_get(
'/os-hypervisors/detail').body['hypervisors']
self.assertEqual(0, len(hypervisors), hypervisors)
# Verify that the node was recreated.
self._assert_hypervisor_api(self.nodename, 'host_b')
# But the compute node exists in the RT.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
# node is not cached in the RT.
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is no RP.
rps = self._get_all_providers()
@@ -184,6 +173,27 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
# host_b[1]: Should add compute node to RT cache and recreate resource
# provider.
# FIXME(mgoddard): Resource provider not recreated here, because it
# exists in the provider tree. See
# https://bugs.launchpad.net/nova/+bug/1841481.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node still exists.
self._assert_hypervisor_api(self.nodename, 'host_b')
# And it is now in the RT cache.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is still no RP.
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
# But the RP exists in the provider tree.
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
# This fails due to the lack of a resource provider.
self.assertIn(
'Skipping removal of allocations for deleted instances',
+4 -2
View File
@@ -373,18 +373,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
)
# First node in set should have been removed from DB
# Last node in set should have been added to DB.
for db_node in db_nodes:
if db_node.hypervisor_hostname == 'node1':
db_node.destroy.assert_called_once_with()
rc_mock.delete_resource_provider.assert_called_once_with(
self.context, db_node, cascade=True)
mock_rt.remove_node.assert_called_once_with(
'node1')
mock_rt.remove_node.assert_called_once_with('node1')
mock_log.error.assert_called_once_with(
"Failed to delete compute node resource provider for "
"compute node %s: %s", db_node.uuid, mock.ANY)
else:
self.assertFalse(db_node.destroy.called)
self.assertEqual(1, mock_rt.remove_node.call_count)
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
'delete_resource_provider')
@@ -4177,3 +4177,20 @@ class ProviderConfigTestCases(BaseTestCase):
mock_log.warning.assert_called_once_with(*expected_log_call)
self.assertIn(uuids.unknown, self.rt.absent_providers)
self.assertEqual(result, [])
class TestCleanComputeNodeCache(BaseTestCase):
    """Unit test for ResourceTracker.clean_compute_node_cache."""

    def setUp(self):
        """Set up the resource tracker and a request context."""
        super(TestCleanComputeNodeCache, self).setUp()
        self._setup_rt()
        user_id = mock.sentinel.user_id
        project_id = mock.sentinel.project_id
        self.context = context.RequestContext(user_id, project_id)

    @mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
    def test_clean_compute_node_cache(self, mock_remove):
        """Only the cached node missing from the DB list is removed."""
        stale_nodename = "invalid-node"
        # Seed the cache with one node that is passed in as a DB node
        # below and one extra entry that is not.
        cache_entries = (
            (_NODENAME, self.compute),
            (stale_nodename, mock.sentinel.compute),
        )
        for nodename, cached_node in cache_entries:
            self.rt.compute_nodes[nodename] = cached_node

        nodes_in_db = [self.compute]
        self.rt.clean_compute_node_cache(nodes_in_db)

        # remove_node is mocked, so just check it was asked to evict the
        # extra entry and nothing else.
        mock_remove.assert_called_once_with(stale_nodename)