From 20a9f9d7406f00a0f68da00449642710347d999d Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Wed, 3 Dec 2025 09:11:00 -0800 Subject: [PATCH] Collect result of _live_migration_operation Without this, we won't notice errors raised in the operation thread. Before 1cd1c472bd8a2dafe4219c7b61b8d75a67f1bf16 the unit test actually forced such errors to be raised even though in the real code they would never be raised. But that patch fixed the unit test fixture to be more realistic without realizing that such a fixture error also means that we might have wrong assumptions about the code under test. Now we know that an exception from the live migration thread was never raised back to the monitor thread. To improve logging we added a future.result() call after the main monitoring code finished. Also the code had a complex way to signal the monitoring thread that the migration thread returned early, by registering a callback on the migration thread and setting an event. This can be simplified to just checking the status of the future of the migration thread. So the event and the callback are removed. All this was found because commit 25fbf32f22c7f1a8082269782034bf9477e96e48 failed to add the new parallel arg to the mock of guest.migrate() on master, but the exception was never propagated to the unit test on master. Backporting that change showed that in the old unit test env there is a valid exception. 
Co-authored-by: Dan Smith Change-Id: I22683ad5118796c6406f80d8726053afa84fff56 Signed-off-by: Dan Smith Signed-off-by: Balazs Gibizer --- nova/tests/unit/virt/libvirt/test_driver.py | 34 ++++++++++----------- nova/virt/libvirt/driver.py | 17 ++++------- 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index debd2c4b24..2a96fb47a5 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -38,6 +38,7 @@ import ddt import eventlet from eventlet import greenthread import fixtures +import futurist from lxml import etree from os_brick import encryptors from os_brick import exception as brick_exception @@ -14607,7 +14608,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, drvr.active_migrations[instance.uuid] = collections.deque() dom = fakelibvirt.Domain(drvr._get_connection(), "", True) guest = libvirt_guest.Guest(dom) - finish_event = threading.Event() + migration_thread_future = futurist.Future() def fake_job_info(): while True: @@ -14616,7 +14617,8 @@ class LibvirtConnTestCase(test.NoDBTestCase, if type(rec) is str: if rec == "thread-finish": - finish_event.set() + migration_thread_future.set_result( + mock.sentinel.result) elif rec == "domain-stop": dom.destroy() elif rec == "force_complete": @@ -14658,7 +14660,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, fake_recover_method, False, migrate_data, - finish_event, + migration_thread_future, []) if scheduled_action_executed: if scheduled_action == 'pause': @@ -14746,8 +14748,8 @@ class LibvirtConnTestCase(test.NoDBTestCase, def test_live_migration_handle_pause_on_start(self): # A normal sequence where see all the normal job states, and pause - # scheduled in case of job type VIR_DOMAIN_JOB_NONE and finish_event is - # not ready yet + # scheduled in case of job type VIR_DOMAIN_JOB_NONE and the migration + # thread is not finished yet. 
domain_info_records = [ "force_complete", libvirt_guest.JobInfo( type=fakelibvirt.VIR_DOMAIN_JOB_NONE), @@ -14772,8 +14774,8 @@ class LibvirtConnTestCase(test.NoDBTestCase, def test_live_migration_handle_pause_on_finish(self): # A normal sequence where see all the normal job states, and pause - # scheduled in case of job type VIR_DOMAIN_JOB_NONE and finish_event is - # ready + # scheduled in case of job type VIR_DOMAIN_JOB_NONE and the migration + # thread is finished already. domain_info_records = [ libvirt_guest.JobInfo( type=fakelibvirt.VIR_DOMAIN_JOB_NONE), @@ -14881,7 +14883,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_postcopy_enabled): # A normal sequence where see all the normal job states, and postcopy # switch scheduled in case of job type VIR_DOMAIN_JOB_NONE and - # finish_event is not ready yet + # the migration thread is not finished yet. mock_postcopy_enabled.return_value = True domain_info_records = [ "force_complete", @@ -15254,7 +15256,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, @mock.patch.object(libvirt_driver.LibvirtDriver, "live_migration_abort") def _test_live_migration_main(self, mock_abort, mock_copy_disk_path, mock_running, mock_guest, mock_monitor, - mock_thread, mock_conn, + mock_spawn, mock_conn, mon_side_effect=None): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) instance = objects.Instance(**self.test_instance) @@ -15288,18 +15290,14 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_copy_disk_path.assert_called_once_with(self.context, instance, guest) - class AnyEvent(object): - def __eq__(self, other): - return type(other) is threading.Event - - mock_thread.assert_called_once_with( + mock_spawn.assert_called_once_with( drvr._live_migration_operation, self.context, instance, "fakehost", True, migrate_data, guest, disks_to_copy[1]) mock_monitor.assert_called_once_with( self.context, instance, guest, "fakehost", fake_post, fake_recover, True, - migrate_data, AnyEvent(), disks_to_copy[0]) + migrate_data, mock_spawn.return_value, 
disks_to_copy[0]) def test_live_migration_main(self): self._test_live_migration_main() @@ -15308,13 +15306,15 @@ class LibvirtConnTestCase(test.NoDBTestCase, self._test_live_migration_main(mon_side_effect=Exception) @mock.patch.object(host.Host, "get_connection", new=mock.Mock()) - @mock.patch.object(utils, "spawn", new=mock.Mock()) + @mock.patch.object(utils, "spawn") @mock.patch.object(host.Host, "get_guest") @mock.patch.object( libvirt_driver.LibvirtDriver, "_live_migration_copy_disk_paths") def _test_live_migration_monitor_job_stats_exception( - self, exc, mock_copy_disk_paths, mock_get_guest, expect_success=True + self, exc, mock_copy_disk_paths, mock_get_guest, mock_spawn, + expect_success=True ): + mock_spawn.return_value.done.return_value = False # Verify behavior when various exceptions are raised inside of # Guest.get_job_info() during live migration monitoring. mock_domain = mock.Mock(fakelibvirt.virDomain) diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 4f51be8c1b..9789e82623 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -11379,7 +11379,7 @@ class LibvirtDriver(driver.ComputeDriver): def _live_migration_monitor(self, context, instance, guest, dest, post_method, recover_method, block_migration, - migrate_data, finish_event, + migrate_data, future, disk_paths): on_migration_failure: ty.Deque[str] = deque() @@ -11402,7 +11402,7 @@ class LibvirtDriver(driver.ComputeDriver): if info.type == libvirt.VIR_DOMAIN_JOB_NONE: # Either still running, or failed or completed, # lets untangle the mess - if not finish_event.is_set(): + if not future.done(): LOG.debug("Operation thread is still running", instance=instance) else: @@ -11589,16 +11589,8 @@ class LibvirtDriver(driver.ComputeDriver): migrate_data, guest, device_names) - finish_event = threading.Event() self.active_migrations[instance.uuid] = deque() - def thread_finished(_): - LOG.debug("Migration operation thread notification", - instance=instance) 
- - finish_event.set() - future.add_done_callback(thread_finished) - # Let eventlet schedule the new thread right away utils.cooperative_yield() @@ -11608,7 +11600,10 @@ class LibvirtDriver(driver.ComputeDriver): self._live_migration_monitor(context, instance, guest, dest, post_method, recover_method, block_migration, migrate_data, - finish_event, disk_paths) + future, disk_paths) + # Collect the result of the future to surface any exceptions + # that might have been raised in _live_migration_operation() + future.result() except Exception as ex: LOG.warning("Error monitoring migration: %(ex)s", {"ex": ex}, instance=instance, exc_info=True)