Collect result of _live_migration_operation

Without this, we won't notice errors raised in the operation thread.

Before 1cd1c472bd the unit test actually
forced such errors to be raised even though in the real code they would
never be raised. But that patch fixed the unit test fixture to be more
realistic without realizing that such a fixture error also means that we
might have wrong assumptions about the code under test.

Now we know that an exception from the live migration thread was
never raised back to the monitor thread. To improve logging we added a
future.result() call after the main monitoring code finished.

Also the code had a complex way to signal to the monitoring thread that
the migration thread returned early: registering a callback on the
migration thread and setting an event. This can be simplified to just
checking the status of the future of the migration thread, so the event
and the callback are removed.

All this was found because commit 25fbf32f22
failed to add the new parallel arg to the mock of guest.migrate()
on master, but the exception was never propagated to the unit test on
master. Backporting that change showed that in the old unit test
environment there is a valid exception.

Co-authored-by: Dan Smith <dms@danplanet.com>
Change-Id: I22683ad5118796c6406f80d8726053afa84fff56
Signed-off-by: Dan Smith <dansmith@redhat.com>
Signed-off-by: Balazs Gibizer <gibi@redhat.com>
This commit is contained in:
Dan Smith
2025-12-03 09:11:00 -08:00
committed by Balazs Gibizer
parent 6c940f4c3c
commit 20a9f9d740
2 changed files with 23 additions and 28 deletions
+17 -17
View File
@@ -38,6 +38,7 @@ import ddt
import eventlet
from eventlet import greenthread
import fixtures
import futurist
from lxml import etree
from os_brick import encryptors
from os_brick import exception as brick_exception
@@ -14607,7 +14608,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
drvr.active_migrations[instance.uuid] = collections.deque()
dom = fakelibvirt.Domain(drvr._get_connection(), "<domain/>", True)
guest = libvirt_guest.Guest(dom)
finish_event = threading.Event()
migration_thread_future = futurist.Future()
def fake_job_info():
while True:
@@ -14616,7 +14617,8 @@ class LibvirtConnTestCase(test.NoDBTestCase,
if type(rec) is str:
if rec == "thread-finish":
finish_event.set()
migration_thread_future.set_result(
mock.sentinel.result)
elif rec == "domain-stop":
dom.destroy()
elif rec == "force_complete":
@@ -14658,7 +14660,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
fake_recover_method,
False,
migrate_data,
finish_event,
migration_thread_future,
[])
if scheduled_action_executed:
if scheduled_action == 'pause':
@@ -14746,8 +14748,8 @@ class LibvirtConnTestCase(test.NoDBTestCase,
def test_live_migration_handle_pause_on_start(self):
# A normal sequence where see all the normal job states, and pause
# scheduled in case of job type VIR_DOMAIN_JOB_NONE and finish_event is
# not ready yet
# scheduled in case of job type VIR_DOMAIN_JOB_NONE and the migration
# thread is not finished yet.
domain_info_records = [
"force_complete",
libvirt_guest.JobInfo(
@@ -14772,8 +14774,8 @@ class LibvirtConnTestCase(test.NoDBTestCase,
def test_live_migration_handle_pause_on_finish(self):
# A normal sequence where see all the normal job states, and pause
# scheduled in case of job type VIR_DOMAIN_JOB_NONE and finish_event is
# ready
# scheduled in case of job type VIR_DOMAIN_JOB_NONE and the migration
# thread is finished.
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
@@ -14881,7 +14883,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
mock_postcopy_enabled):
# A normal sequence where see all the normal job states, and postcopy
# switch scheduled in case of job type VIR_DOMAIN_JOB_NONE and
# finish_event is not ready yet
# the migration thread is not finished yet.
mock_postcopy_enabled.return_value = True
domain_info_records = [
"force_complete",
@@ -15254,7 +15256,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
@mock.patch.object(libvirt_driver.LibvirtDriver, "live_migration_abort")
def _test_live_migration_main(self, mock_abort, mock_copy_disk_path,
mock_running, mock_guest, mock_monitor,
mock_thread, mock_conn,
mock_spawn, mock_conn,
mon_side_effect=None):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = objects.Instance(**self.test_instance)
@@ -15288,18 +15290,14 @@ class LibvirtConnTestCase(test.NoDBTestCase,
mock_copy_disk_path.assert_called_once_with(self.context, instance,
guest)
class AnyEvent(object):
def __eq__(self, other):
return type(other) is threading.Event
mock_thread.assert_called_once_with(
mock_spawn.assert_called_once_with(
drvr._live_migration_operation,
self.context, instance, "fakehost", True,
migrate_data, guest, disks_to_copy[1])
mock_monitor.assert_called_once_with(
self.context, instance, guest, "fakehost",
fake_post, fake_recover, True,
migrate_data, AnyEvent(), disks_to_copy[0])
migrate_data, mock_spawn.return_value, disks_to_copy[0])
def test_live_migration_main(self):
self._test_live_migration_main()
@@ -15308,13 +15306,15 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self._test_live_migration_main(mon_side_effect=Exception)
@mock.patch.object(host.Host, "get_connection", new=mock.Mock())
@mock.patch.object(utils, "spawn", new=mock.Mock())
@mock.patch.object(utils, "spawn")
@mock.patch.object(host.Host, "get_guest")
@mock.patch.object(
libvirt_driver.LibvirtDriver, "_live_migration_copy_disk_paths")
def _test_live_migration_monitor_job_stats_exception(
self, exc, mock_copy_disk_paths, mock_get_guest, expect_success=True
self, exc, mock_copy_disk_paths, mock_get_guest, mock_spawn,
expect_success=True
):
mock_spawn.return_value.done.return_value = False
# Verify behavior when various exceptions are raised inside of
# Guest.get_job_info() during live migration monitoring.
mock_domain = mock.Mock(fakelibvirt.virDomain)
+6 -11
View File
@@ -11379,7 +11379,7 @@ class LibvirtDriver(driver.ComputeDriver):
def _live_migration_monitor(self, context, instance, guest,
dest, post_method,
recover_method, block_migration,
migrate_data, finish_event,
migrate_data, future,
disk_paths):
on_migration_failure: ty.Deque[str] = deque()
@@ -11402,7 +11402,7 @@ class LibvirtDriver(driver.ComputeDriver):
if info.type == libvirt.VIR_DOMAIN_JOB_NONE:
# Either still running, or failed or completed,
# lets untangle the mess
if not finish_event.is_set():
if not future.done():
LOG.debug("Operation thread is still running",
instance=instance)
else:
@@ -11589,16 +11589,8 @@ class LibvirtDriver(driver.ComputeDriver):
migrate_data, guest,
device_names)
finish_event = threading.Event()
self.active_migrations[instance.uuid] = deque()
def thread_finished(_):
LOG.debug("Migration operation thread notification",
instance=instance)
finish_event.set()
future.add_done_callback(thread_finished)
# Let eventlet schedule the new thread right away
utils.cooperative_yield()
@@ -11608,7 +11600,10 @@ class LibvirtDriver(driver.ComputeDriver):
self._live_migration_monitor(context, instance, guest, dest,
post_method, recover_method,
block_migration, migrate_data,
finish_event, disk_paths)
future, disk_paths)
# Collect the result of the future to surface any exceptions
# that might have been raised in _live_migration_operation()
future.result()
except Exception as ex:
LOG.warning("Error monitoring migration: %(ex)s",
{"ex": ex}, instance=instance, exc_info=True)