From 772f5a1ae4fa9f9a584f42f774894d540b4b168f Mon Sep 17 00:00:00 2001
From: John Garbutt
Date: Wed, 28 Jun 2023 10:45:30 +0100
Subject: [PATCH] Make compute node rebalance safer

Many bugs around nova-compute rebalancing are focused around problems
when the compute node and placement resources are deleted, and
sometimes they never get re-created.

To limit this class of bugs, we add a check to ensure a compute node is
only ever deleted when it is known to have been deleted in Ironic.

There is a risk this might leave orphaned compute nodes and resource
providers that need manual clean up because users do not want to delete
the node in Ironic, but are removing it from nova management. But on
balance, it seems safer to leave these cases up to the operator to
resolve manually, and collect feedback on how to better help those
users.

blueprint ironic-shards

Change-Id: I7cd9e5ab878cea05462cac24de581dca6d50b3c3
---
 nova/compute/manager.py                       | 13 +++++++
 .../regressions/test_bug_1839560.py           |  7 +++-
 .../regressions/test_bug_1853009.py           |  6 ++-
 nova/tests/unit/compute/test_compute_mgr.py   | 38 ++++++++++++++++++-
 nova/virt/driver.py                           | 20 ++++++++++
 nova/virt/ironic/driver.py                    |  8 ++++
 6 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index e6ad4ac154..c9339b4b88 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -10575,6 +10575,19 @@ class ComputeManager(manager.Manager):
         # Delete orphan compute node not reported by driver but still in db
         for cn in compute_nodes_in_db:
             if cn.hypervisor_hostname not in nodenames:
+                # if the node could be migrated, we don't delete
+                # the compute node database records
+                if not self.driver.is_node_deleted(cn.hypervisor_hostname):
+                    LOG.warning(
+                        "Found orphan compute node %(id)s "
+                        "hypervisor host is %(hh)s, "
+                        "nodes are %(nodes)s. "
+                        "We are not deleting this as the driver "
+                        "says this node has not been deleted.",
+                        {'id': cn.id, 'hh': cn.hypervisor_hostname,
+                         'nodes': nodenames})
+                    continue
+
                 LOG.info("Deleting orphan compute node %(id)s "
                          "hypervisor host is %(hh)s, "
                          "nodes are %(nodes)s",
diff --git a/nova/tests/functional/regressions/test_bug_1839560.py b/nova/tests/functional/regressions/test_bug_1839560.py
index 54ae3af191..005f6537f3 100644
--- a/nova/tests/functional/regressions/test_bug_1839560.py
+++ b/nova/tests/functional/regressions/test_bug_1839560.py
@@ -11,6 +11,7 @@
 # under the License.
 
 from oslo_log import log as logging
+from unittest import mock
 
 from nova import context
 from nova.db.main import api as db_api
@@ -20,6 +21,7 @@
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova import utils
+from nova.virt import fake as fake_driver
 
 LOG = logging.getLogger(__name__)
@@ -56,7 +58,10 @@ class PeriodicNodeRecreateTestCase(test.TestCase,
         # for each node.
         self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
 
-    def test_update_available_resource_node_recreate(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_update_available_resource_node_recreate(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # First we create a compute service to manage a couple of fake nodes.
         compute = self.start_service('compute', 'node1')
         # When start_service runs, it will create the node1 ComputeNode.
diff --git a/nova/tests/functional/regressions/test_bug_1853009.py b/nova/tests/functional/regressions/test_bug_1853009.py
index 5266e6166b..c3a220ae58 100644
--- a/nova/tests/functional/regressions/test_bug_1853009.py
+++ b/nova/tests/functional/regressions/test_bug_1853009.py
@@ -15,6 +15,7 @@
 from unittest import mock
 
 from nova import context
 from nova import objects
 from nova.tests.functional import integrated_helpers
+from nova.virt import fake as fake_driver
 
@@ -54,7 +55,10 @@ class NodeRebalanceDeletedComputeNodeRaceTestCase(
         self.nodename = 'fake-node'
         self.ctxt = context.get_admin_context()
 
-    def test_node_rebalance_deleted_compute_node_race(self):
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    def test_node_rebalance_deleted_compute_node_race(self, mock_delete):
+        # make the fake driver allow nodes to be deleted, like ironic driver
+        mock_delete.return_value = True
         # Simulate a service running and then stopping. host_a runs, creates
         # fake-node, then is stopped. The fake-node compute node is destroyed.
         # This leaves a soft-deleted node in the DB.
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 18dbfd3ea2..02098b0d0b 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -396,13 +396,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             {'node': mock.sentinel.node}
         )
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager, 'LOG')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource(self, get_db_nodes, get_avail_nodes,
-                                       update_mock, mock_log):
+                                       update_mock, mock_log, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -415,6 +416,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
 
         get_db_nodes.return_value = db_nodes
         get_avail_nodes.return_value = avail_nodes
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context, startup=True)
         get_db_nodes.assert_called_once_with(self.context, avail_nodes,
                                              use_slave=True, startup=True)
@@ -460,12 +462,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         update_mock.assert_not_called()
         del_rp_mock.assert_not_called()
 
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
     @mock.patch.object(manager.ComputeManager,
                        '_update_available_resource_for_node')
     @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
     @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
     def test_update_available_resource_destroy_rebalance(
-            self, get_db_nodes, get_avail_nodes, update_mock):
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
         mock_rt = self._mock_rt()
         rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
             self.compute, 'reportclient')).mock
@@ -476,6 +479,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         db_nodes[0].destroy.side_effect = exception.ObjectActionError(
             action='destroy', reason='host changed')
         get_avail_nodes.return_value = set()
+        mock_deleted.return_value = True
         self.compute.update_available_resource(self.context)
         get_db_nodes.assert_called_once_with(self.context, set(),
                                              use_slave=True, startup=False)
@@ -486,6 +490,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         mock_rt.remove_node.assert_called_once_with('node1')
         rc_mock.invalidate_resource_provider.assert_called_once_with(
             db_nodes[0].uuid)
+        mock_deleted.assert_called_once_with('node1')
+
+    @mock.patch.object(fake_driver.FakeDriver, 'is_node_deleted')
+    @mock.patch.object(manager.ComputeManager,
+                       '_update_available_resource_for_node')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    @mock.patch.object(manager.ComputeManager, '_get_compute_nodes_in_db')
+    def test_update_available_resource_no_destroy_rebalance(
+            self, get_db_nodes, get_avail_nodes, update_mock, mock_deleted):
+        mock_rt = self._mock_rt()
+        rc_mock = self.useFixture(fixtures.fixtures.MockPatchObject(
+            self.compute, 'reportclient')).mock
+        db_nodes = [self._make_compute_node('node1', 1)]
+        get_db_nodes.return_value = db_nodes
+        # Destroy can fail if nodes were rebalanced between getting the node
+        # list and calling destroy.
+        db_nodes[0].destroy.side_effect = exception.ObjectActionError(
+            action='destroy', reason='host changed')
+        get_avail_nodes.return_value = set()
+        mock_deleted.return_value = False
+        self.compute.update_available_resource(self.context)
+        get_db_nodes.assert_called_once_with(self.context, set(),
+                                             use_slave=True, startup=False)
+        self.assertEqual(0, update_mock.call_count)
+
+        db_nodes[0].destroy.assert_not_called()
+        self.assertEqual(0, rc_mock.delete_resource_provider.call_count)
+        mock_rt.remove_node.assert_not_called()
+        rc_mock.invalidate_resource_provider.assert_not_called()
+        mock_deleted.assert_called_once_with('node1')
 
     @mock.patch('nova.context.get_admin_context')
     def test_pre_start_hook(self, get_admin_context):
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 8cd56bdd21..0f95c57408 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -1160,6 +1160,26 @@ class ComputeDriver(object):
         """
         raise NotImplementedError()
 
+    def is_node_deleted(self, nodename):
+        """Check this compute node has been deleted.
+
+        This method is called when the compute manager notices that a
+        node that was previously reported as available is no longer
+        available.
+        In this case, we need to know if the node has actually been
+        deleted, or if it is simply no longer managed by this
+        nova-compute service.
+        If True is returned, the database and placement records will
+        be removed.
+
+        :param nodename:
+            node which the caller wants to check if it is deleted
+        :returns: True if the node is safe to delete
+        """
+        # For most drivers, compute nodes should never get deleted
+        # during a periodic task, they are only deleted via the API
+        return False
+
     def pre_live_migration(self, context, instance, block_device_info,
                            network_info, disk_info, migrate_data):
         """Prepare an instance for live migration
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 7bd4bf6e73..1aff16b7c5 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -700,6 +700,14 @@ class IronicDriver(virt_driver.ComputeDriver):
         except sdk_exc.ResourceNotFound:
             return False
 
+    def is_node_deleted(self, nodename):
+        # check if the node is missing in Ironic
+        try:
+            self._get_node(nodename)
+            return False
+        except sdk_exc.ResourceNotFound:
+            return True
+
     def _refresh_hash_ring(self, ctxt):
         peer_list = None
         if CONF.ironic.shard is not None: