From dc51a4271b6b328fdd655d48559f7220efcf794b Mon Sep 17 00:00:00 2001 From: Julien Le Jeune Date: Fri, 12 Sep 2025 10:15:32 +0200 Subject: [PATCH] nova-conductor puts instance in error state Nova-conductor puts the instance into the error state if an unknown exception is raised in _build_live_migrate_task during a live migration. [1] The exception comes from _call_livem_checks_on_host, which raises exception.MigrationPreCheckError when it encounters, for example, a messaging.MessagingTimeout exception. [2] The function check_can_live_migrate_destination also runs a check on the source host through check_can_live_migrate_source [3]. That check can raise exceptions such as MessagingTimeout as well, but in this case the exception is not caught properly because it arrives as a remote error ("Remote error: MessagingTimeout"): the destination host tries to contact the source host and the source host does not reply. [1] https://github.com/openstack/nova/blob/master/nova/conductor/manager.py#L523 [2] https://github.com/openstack/nova/blob/master/nova/conductor/tasks/live_migrate.py#L381 [3] https://github.com/openstack/nova/blob/master/nova/compute/manager.py#L9090 Closes-Bug: #2044235 Change-Id: Ie1f96fee743c235ab35113a9ad1549a67b975839 Signed-off-by: Julien Le Jeune --- nova/compute/manager.py | 16 ++++++++++++++-- .../functional/regressions/test_bug_2044235.py | 3 +-- nova/tests/unit/compute/test_compute_mgr.py | 4 ++-- ...ce-after-rpc-issue-during-live-migration.yaml | 9 +++++++++ 4 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/bug-2044235-reset-instance-after-rpc-issue-during-live-migration.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 7b7da4fd56..022128612e 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -9087,8 +9087,20 @@ class ComputeManager(manager.Manager): try: allocs = self.reportclient.get_allocations_for_consumer( ctxt, instance.uuid) - migrate_data = self.compute_rpcapi.check_can_live_migrate_source( - ctxt, instance, dest_check_data) + 
try: + migrate_data = ( + self.compute_rpcapi.check_can_live_migrate_source( + ctxt, instance, dest_check_data) + ) + except Exception as ex: + msg = _("Error while check_can_live_migrate_source from " + "%(src)s to host %(dest)s: %(ex_type)s %(ex)s") % { + 'src': instance.host, + 'dest': CONF.host, + 'ex_type': type(ex).__name__, + 'ex': ex + } + raise exception.MigrationPreCheckError(msg) if ('src_supports_numa_live_migration' in migrate_data and migrate_data.src_supports_numa_live_migration): migrate_data = self._live_migration_claim( diff --git a/nova/tests/functional/regressions/test_bug_2044235.py b/nova/tests/functional/regressions/test_bug_2044235.py index c81847f58f..71f99c8f67 100644 --- a/nova/tests/functional/regressions/test_bug_2044235.py +++ b/nova/tests/functional/regressions/test_bug_2044235.py @@ -57,5 +57,4 @@ class TestMessagingTimeoutDuringLiveMigrationCheck( self._live_migrate, server, "failed" ) - # bug lp-2044235 - instance is in ERROR but it should not - self._wait_for_state_change(server, "ERROR") + self._wait_for_state_change(server, "ACTIVE") diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index e0c492fa15..fe7d236d5e 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -5233,7 +5233,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, 'nova.network.neutron.API.has_port_binding_extension', lambda *args: True)) self.assertRaises( - test.TestingException, + exception.MigrationPreCheckError, self._test_check_can_live_migrate_destination, do_raise=True) @@ -5414,7 +5414,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, 'check_can_live_migrate_source', side_effect=messaging.MessagingTimeout): self.assertRaises( - messaging.MessagingTimeout, + exception.MigrationPreCheckError, self.compute.check_can_live_migrate_destination, self.context, instance, None, None, None, None) _do_test() diff --git 
a/releasenotes/notes/bug-2044235-reset-instance-after-rpc-issue-during-live-migration.yaml b/releasenotes/notes/bug-2044235-reset-instance-after-rpc-issue-during-live-migration.yaml new file mode 100644 index 0000000000..7f8743afaf --- /dev/null +++ b/releasenotes/notes/bug-2044235-reset-instance-after-rpc-issue-during-live-migration.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + Fixed the issue + `bug 2044235 <https://bugs.launchpad.net/nova/+bug/2044235>`__ where Nova + Conductor puts an instance into an error state if any errors occur during + execution of the 'check_can_live_migrate_source()' method in an RPC call. + Now, any error is caught and a MigrationPreCheckError exception is raised + instead to reset the instance state.