From 1cd1c472bd8a2dafe4219c7b61b8d75a67f1bf16 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Mon, 23 Jun 2025 14:42:10 +0200
Subject: [PATCH] Note on RPC error decorators around build_and_run_instance

The RPC handler build_and_run_instance has a normal set of error handler
decorators but that call immediately spawns a separate thread and does not
wait for its result so it cannot get any exceptions. Therefore the error
handling should happen inside the thread. And it does. Mostly.

The _locked_do_build_and_run_instance is the function run by the thread
and it calls _do_build_and_run_instance. The _do_build_and_run_instance
has `except Exception` handling dropping the active exception and returning
an error value instead. So most of the exceptions are handled by
_do_build_and_run_instance without going through any of the error
decorators.

Whatever exception still bubbles up from _do_build_and_run_instance
are going through the error handling decorators top of
_do_build_and_run_instance.

The only source of exception outside of _do_build_and_run_instance but
within _locked_do_build_and_run_instance is
delete_allocation_for_instance. Exception from that call does not
go though any error handling decorators.

These error decorators has multiple tasks:
* wrap_exception calls _emit_legacy_exception_notification and
  _emit_versioned_exception_notification so sending error
  notifications
* reverts_task_state sets instance.task_state to None
* wrap_instance_fault records InstanceFault object to the DB

So there are two cases when error happens but the decorators are not
called:
* _do_build_and_run_instance catches the exception, does its own error
  handling and cleanup and then returns an error value. The task state
  revert and the InstanceFault creation is replicated to the individual
  cleanup sections here.
  Note that notification sending is not missing either as that is
  handled by the _build_and_run_instance called from
  _do_build_and_run_instance.

* delete_allocation_for_instance raises when called from
  _locked_do_build_and_run_instance. We reach this call if
  _do_build_and_run_instance failed already and did its own cleanup. So
  again task state are already reset and InstanceFault already recorded
  for the original fault and notification already sent.

So the only changes here is to add a note to build_and_run_instance so
that the real error handling are deeper in the stack so that readers will
not think error handling happen on top of the call stack.

There was two unit test cases that partially relied on the wrong
assumption that exceptions propagate to build_and_run_instance.
They could make that wrong assumption because the
SpawnIsSynchronousFixture simulates the GreenThread interface wrongly.
The goal of that fixture is to make any spawn (and spawn_n) call
synchronous meaning the function passed in is executed right away before
spawn returns. That is OK. But the fixture forgot that if the function
raises then such exception is now raised by spawn. Such a thing never
happens with a normal spawn. The exception (and the function return
value) only propagated when GreenThread.wait() is called.

So this patch fixes the fixture and corrects the two unit tests as well.

Change-Id: I440fff6663d0663fb1630dd096f352403982aa37
---
 nova/compute/manager.py                     |  5 +++++
 nova/tests/fixtures/nova.py                 | 10 +++++++++-
 nova/tests/unit/compute/test_compute.py     | 10 ++++++----
 nova/tests/unit/compute/test_compute_mgr.py | 14 ++------------
 4 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 327ec6f9a8..313a0dbe5f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -2338,6 +2338,11 @@ class ComputeManager(manager.Manager):
     def _build_succeeded(self, node):
         self.rt.build_succeeded(node)
 
+    # NOTE(gibi): The normal RPC handler decorators are here but please note
+    # that build_and_run_instance immediately spawns and returns. So any
+    # error from the real work is not propagated back to these decorators.
+    # All those error needs to be handled by _locked_do_build_and_run_instance
+    # and _do_build_and_run_instance
     @wrap_exception()
     @reverts_task_state
     @wrap_instance_fault
diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
index 3ceb8f1316..f5e9f95c8e 100644
--- a/nova/tests/fixtures/nova.py
+++ b/nova/tests/fixtures/nova.py
@@ -1245,7 +1245,12 @@ class IsolatedGreenPoolFixture(fixtures.Fixture):
 
 class _FakeGreenThread(object):
     def __init__(self, func, *args, **kwargs):
-        self._result = func(*args, **kwargs)
+        try:
+            self._result = func(*args, **kwargs)
+            self.raised = False
+        except Exception as e:
+            self.raised = True
+            self._result = e
 
     def cancel(self, *args, **kwargs):
         # This method doesn't make sense for a synchronous call, it's just
@@ -1266,6 +1271,9 @@ class _FakeGreenThread(object):
         pass
 
     def wait(self):
+        if self.raised:
+            raise self._result
+
         return self._result
 
 
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 0a64a47d74..e91973ae1a 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -14506,10 +14506,12 @@ class ComputeInjectedFilesTestCase(BaseTestCase):
         ]
 
         self.assertRaises(exception.Base64Exception,
-                self.compute.build_and_run_instance,
-                self.context, self.instance, {}, {}, {}, [],
-                          block_device_mapping=[],
-                          injected_files=injected_files)
+            self.compute._do_build_and_run_instance,
+            self.context, self.instance, {}, {}, {},
+            block_device_mapping=[], injected_files=injected_files,
+            admin_password=None, requested_networks=None,
+            security_groups=None, node=None, limits=None, host_list=None,
+            accel_uuids=[])
 
 
 class CheckConfigDriveTestCase(test.NoDBTestCase):
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 2a7eb012ea..1c5d501bb3 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -9577,14 +9577,6 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         mock_succeeded.assert_not_called()
 
     @mock.patch.object(manager.ComputeManager, '_do_build_and_run_instance')
-    @mock.patch(
-        'nova.exception_wrapper._emit_versioned_exception_notification',
-        new=mock.Mock())
-    @mock.patch(
-        'nova.exception_wrapper._emit_legacy_exception_notification',
-        new=mock.Mock())
-    @mock.patch(
-        'nova.compute.utils.add_instance_fault_from_exc', new=mock.Mock())
     @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(manager.ComputeManager, '_build_succeeded')
     def test_build_exceptions_reported(
@@ -9594,10 +9586,8 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
         instance = objects.Instance(uuid=uuids.instance,
                                     task_state=None)
         for i in range(0, 10):
-            self.assertRaises(test.TestingException,
-                              self.compute.build_and_run_instance,
-                              self.context, instance, None,
-                              None, None, [])
+            self.compute.build_and_run_instance(
+                self.context, instance, None, None, None, [])
 
         self.assertEqual(10, mock_failed.call_count)
         mock_succeeded.assert_not_called()