From 1cd1c472bd8a2dafe4219c7b61b8d75a67f1bf16 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Mon, 23 Jun 2025 14:42:10 +0200 Subject: [PATCH] Note on RPC error decorators around build_and_run_instance The RPC handler build_and_run_instance has a normal set of error handler decorators but that call immediately spawns a separate thread and does not wait for its result so it cannot get any exceptions. Therefore the error handling should happen inside the thread. And it does. Mostly. The _locked_do_build_and_run_instance is the function run by the thread and it calls _do_build_and_run_instance. The _do_build_and_run_instance has `except Exception` handling dropping the active exception and returning an error value instead. So most of the exceptions are handled by _do_build_and_run_instance without going through any of the error decorators. Whatever exception still bubbles up from _do_build_and_run_instance are going through the error handling decorators top of _do_build_and_run_instance. The only source of exception outside of _do_build_and_run_instance but within _locked_do_build_and_run_instance is delete_allocation_for_instance. Exception from that call does not go though any error handling decorators. These error decorators has multiple tasks: * wrap_exception calls _emit_legacy_exception_notification and _emit_versioned_exception_notification so sending error notifications * reverts_task_state sets instance.task_state to None * wrap_instance_fault records InstanceFault object to the DB So there are two cases when error happens but the decorators are not called: * _do_build_and_run_instance catches the exception, does its own error handling and cleanup and then returns an error value. The task state revert and the InstanceFault creation is replicated to the individual cleanup sections here. Note that notification sending is not missing either as that is handled by the _build_and_run_instance called from _do_build_and_run_instance. * delete_allocation_for_instance raises when called from _locked_do_build_and_run_instance. We reach this call if _do_build_and_run_instance failed already and did its own cleanup. So again task state are already reset and InstanceFault already recorded for the original fault and notification already sent. So the only changes here is to add a note to build_and_run_instance so that the real error handling are deeper in the stack so that readers will not think error handling happen on top of the call stack. There was two unit test cases that partially relied on the wrong assumption that exceptions propagate to build_and_run_instance. They could make that wrong assumption because the SpawnIsSynchronousFixture simulates the GreenThread interface wrongly. The goal of that fixture is to make any spawn (and spawn_n) call synchronous meaning the function passed in is executed right away before spawn returns. That is OK. But the fixture forgot that if the function raises then such exception is now raised by spawn. Such a thing never happens with a normal spawn. The exception (and the function return value) only propagated when GreenThread.wait() is called. So this patch fixes the fixture and corrects the two unit tests as well. Change-Id: I440fff6663d0663fb1630dd096f352403982aa37 --- nova/compute/manager.py | 5 +++++ nova/tests/fixtures/nova.py | 10 +++++++++- nova/tests/unit/compute/test_compute.py | 10 ++++++---- nova/tests/unit/compute/test_compute_mgr.py | 14 ++------------ 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 327ec6f9a8..313a0dbe5f 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -2338,6 +2338,11 @@ class ComputeManager(manager.Manager): def _build_succeeded(self, node): self.rt.build_succeeded(node) + # NOTE(gibi): The normal RPC handler decorators are here but please note + # that build_and_run_instance immediately spawns and returns. So any + # error from the real work is not propagated back to these decorators. + # All those error needs to be handled by _locked_do_build_and_run_instance + # and _do_build_and_run_instance @wrap_exception() @reverts_task_state @wrap_instance_fault diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py index 3ceb8f1316..f5e9f95c8e 100644 --- a/nova/tests/fixtures/nova.py +++ b/nova/tests/fixtures/nova.py @@ -1245,7 +1245,12 @@ class IsolatedGreenPoolFixture(fixtures.Fixture): class _FakeGreenThread(object): def __init__(self, func, *args, **kwargs): - self._result = func(*args, **kwargs) + try: + self._result = func(*args, **kwargs) + self.raised = False + except Exception as e: + self.raised = True + self._result = e def cancel(self, *args, **kwargs): # This method doesn't make sense for a synchronous call, it's just @@ -1266,6 +1271,9 @@ class _FakeGreenThread(object): pass def wait(self): + if self.raised: + raise self._result + return self._result diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 0a64a47d74..e91973ae1a 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -14506,10 +14506,12 @@ class ComputeInjectedFilesTestCase(BaseTestCase): ] self.assertRaises(exception.Base64Exception, - self.compute.build_and_run_instance, - self.context, self.instance, {}, {}, {}, [], - block_device_mapping=[], - injected_files=injected_files) + self.compute._do_build_and_run_instance, + self.context, self.instance, {}, {}, {}, + block_device_mapping=[], injected_files=injected_files, + admin_password=None, requested_networks=None, + security_groups=None, node=None, limits=None, host_list=None, + accel_uuids=[]) class CheckConfigDriveTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 2a7eb012ea..1c5d501bb3 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -9577,14 +9577,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): mock_succeeded.assert_not_called() @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance') - @mock.patch( - 'nova.exception_wrapper._emit_versioned_exception_notification', - new=mock.Mock()) - @mock.patch( - 'nova.exception_wrapper._emit_legacy_exception_notification', - new=mock.Mock()) - @mock.patch( - 'nova.compute.utils.add_instance_fault_from_exc', new=mock.Mock()) @mock.patch.object(manager.ComputeManager, '_build_failed') @mock.patch.object(manager.ComputeManager, '_build_succeeded') def test_build_exceptions_reported( @@ -9594,10 +9586,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): instance = objects.Instance(uuid=uuids.instance, task_state=None) for i in range(0, 10): - self.assertRaises(test.TestingException, - self.compute.build_and_run_instance, - self.context, instance, None, - None, None, []) + self.compute.build_and_run_instance( + self.context, instance, None, None, None, []) self.assertEqual(10, mock_failed.call_count) mock_succeeded.assert_not_called()