Merge "Clear rebalanced compute nodes from resource tracker"
This commit is contained in:
@@ -10036,6 +10036,8 @@ class ComputeManager(manager.Manager):
|
||||
use_slave=True,
|
||||
startup=startup)
|
||||
|
||||
self.rt.clean_compute_node_cache(compute_nodes_in_db)
|
||||
|
||||
# Delete orphan compute node not reported by driver but still in db
|
||||
for cn in compute_nodes_in_db:
|
||||
if cn.hypervisor_hostname not in nodenames:
|
||||
|
||||
@@ -1945,3 +1945,20 @@ class ResourceTracker(object):
|
||||
if migration:
|
||||
migration.status = 'done'
|
||||
migration.save()
|
||||
|
||||
@utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True)
|
||||
def clean_compute_node_cache(self, compute_nodes_in_db):
|
||||
"""Clean the compute node cache of any nodes that no longer exist.
|
||||
|
||||
:param compute_nodes_in_db: list of ComputeNode objects from the DB.
|
||||
"""
|
||||
compute_nodes_in_db_nodenames = {cn.hypervisor_hostname
|
||||
for cn in compute_nodes_in_db}
|
||||
stale_cns = set(self.compute_nodes) - compute_nodes_in_db_nodenames
|
||||
|
||||
for stale_cn in stale_cns:
|
||||
# NOTE(mgoddard): we have found a node in the cache that has no
|
||||
# compute node in the DB. This could be due to a node rebalance
|
||||
# where another compute service took ownership of the node. Clean
|
||||
# up the cache.
|
||||
self.remove_node(stale_cn)
|
||||
|
||||
@@ -82,9 +82,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
# Now run the update_available_resource periodic to register fake-node
|
||||
# and have it managed by host_b. This will also detect the "host_b"
|
||||
# node as orphaned and delete it along with its resource provider.
|
||||
cn_host_b_node = objects.ComputeNode.get_by_host_and_nodename(
|
||||
self.ctxt, 'host_b', 'host_b',
|
||||
)
|
||||
|
||||
# host_b[1]: Finds no compute record in RT. Tries to create one
|
||||
# (_init_compute_node).
|
||||
@@ -93,10 +90,6 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
# update for this node. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1853159.
|
||||
host_b.manager.update_available_resource(self.ctxt)
|
||||
self.assertIn(
|
||||
'Deleting orphan compute node %s hypervisor host '
|
||||
'is host_b, nodes are' % cn_host_b_node.id,
|
||||
self.stdlog.logger.output)
|
||||
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
|
||||
# There should only be one resource provider (fake-node).
|
||||
original_rps = self._get_all_providers()
|
||||
@@ -160,21 +153,17 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
|
||||
# host_b[3]: Should recreate compute node and resource provider.
|
||||
# FIXME(mgoddard): Compute node not recreated here, because it is
|
||||
# already in RT.compute_nodes. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1853009.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, because it
|
||||
# exists in the provider tree. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1841481.
|
||||
host_b.manager.update_available_resource(self.ctxt)
|
||||
|
||||
# Verify that the node was not recreated.
|
||||
hypervisors = self.api.api_get(
|
||||
'/os-hypervisors/detail').body['hypervisors']
|
||||
self.assertEqual(0, len(hypervisors), hypervisors)
|
||||
# Verify that the node was recreated.
|
||||
self._assert_hypervisor_api(self.nodename, 'host_b')
|
||||
|
||||
# But the compute node exists in the RT.
|
||||
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
|
||||
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
|
||||
# node is not cached in the RT.
|
||||
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
|
||||
|
||||
# There is no RP.
|
||||
rps = self._get_all_providers()
|
||||
@@ -184,6 +173,27 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
|
||||
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
|
||||
self.nodename))
|
||||
|
||||
# host_b[1]: Should add compute node to RT cache and recreate resource
|
||||
# provider.
|
||||
# FIXME(mgoddard): Resource provider not recreated here, because it
|
||||
# exists in the provider tree. See
|
||||
# https://bugs.launchpad.net/nova/+bug/1841481.
|
||||
host_b.manager.update_available_resource(self.ctxt)
|
||||
|
||||
# Verify that the node still exists.
|
||||
self._assert_hypervisor_api(self.nodename, 'host_b')
|
||||
|
||||
# And it is now in the RT cache.
|
||||
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
|
||||
|
||||
# There is still no RP.
|
||||
rps = self._get_all_providers()
|
||||
self.assertEqual(0, len(rps), rps)
|
||||
|
||||
# But the RP it exists in the provider tree.
|
||||
self.assertTrue(host_b.manager.rt.reportclient._provider_tree.exists(
|
||||
self.nodename))
|
||||
|
||||
# This fails due to the lack of a resource provider.
|
||||
self.assertIn(
|
||||
'Skipping removal of allocations for deleted instances',
|
||||
|
||||
@@ -373,18 +373,20 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||
)
|
||||
|
||||
# First node in set should have been removed from DB
|
||||
# Last node in set should have been added to DB.
|
||||
for db_node in db_nodes:
|
||||
if db_node.hypervisor_hostname == 'node1':
|
||||
db_node.destroy.assert_called_once_with()
|
||||
rc_mock.delete_resource_provider.assert_called_once_with(
|
||||
self.context, db_node, cascade=True)
|
||||
mock_rt.remove_node.assert_called_once_with(
|
||||
'node1')
|
||||
mock_rt.remove_node.assert_called_once_with('node1')
|
||||
mock_log.error.assert_called_once_with(
|
||||
"Failed to delete compute node resource provider for "
|
||||
"compute node %s: %s", db_node.uuid, mock.ANY)
|
||||
else:
|
||||
self.assertFalse(db_node.destroy.called)
|
||||
self.assertEqual(1, mock_rt.remove_node.call_count)
|
||||
mock_rt.clean_compute_node_cache.assert_called_once_with(db_nodes)
|
||||
|
||||
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
|
||||
'delete_resource_provider')
|
||||
|
||||
@@ -4177,3 +4177,20 @@ class ProviderConfigTestCases(BaseTestCase):
|
||||
mock_log.warning.assert_called_once_with(*expected_log_call)
|
||||
self.assertIn(uuids.unknown, self.rt.absent_providers)
|
||||
self.assertEqual(result, [])
|
||||
|
||||
|
||||
class TestCleanComputeNodeCache(BaseTestCase):
|
||||
|
||||
def setUp(self):
|
||||
super(TestCleanComputeNodeCache, self).setUp()
|
||||
self._setup_rt()
|
||||
self.context = context.RequestContext(
|
||||
mock.sentinel.user_id, mock.sentinel.project_id)
|
||||
|
||||
@mock.patch.object(resource_tracker.ResourceTracker, "remove_node")
|
||||
def test_clean_compute_node_cache(self, mock_remove):
|
||||
invalid_nodename = "invalid-node"
|
||||
self.rt.compute_nodes[_NODENAME] = self.compute
|
||||
self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
|
||||
self.rt.clean_compute_node_cache([self.compute])
|
||||
mock_remove.assert_called_once_with(invalid_nodename)
|
||||
|
||||
Reference in New Issue
Block a user