Merge "Compute manager to use thread pools selectively"
This commit is contained in:
@@ -47,7 +47,7 @@ Tunables for the native threading mode
|
||||
As native threads are more expensive resources than greenthreads Nova provides
|
||||
a set of configuration options to allow fine tuning the deployment based on
|
||||
load and resource constraints. The default values are selected to support a
|
||||
basic, small deployment without consuming substantially more memory resources,
|
||||
basic, small deployment without consuming substantially more memory resources,
|
||||
than the legacy Eventlet mode. Increasing the size of the below thread pools
|
||||
means that the given service will consume more memory but will also allow more
|
||||
tasks to be executed concurrently.
|
||||
@@ -79,6 +79,23 @@ tasks to be executed concurrently.
|
||||
|
||||
This option is relevant to every nova service using ``nova.utils.spawn()``.
|
||||
|
||||
* :oslo.config:option:`sync_power_state_pool_size`: Used by the
|
||||
nova-compute service to sync the power state of each instance on the host
|
||||
between the hypervisor and the DB. Since nova 33.0.0 (2026.1 Gazpacho) the
|
||||
default value of this option is changed from 1000 to 5 to have a sane default
|
||||
in native threading mode. Increasing this value in native threading mode
|
||||
increases the nova-compute memory consumption on a host that has many
|
||||
instances.
|
||||
|
||||
* :oslo.config:option:`max_concurrent_live_migrations`: Used by the
|
||||
nova-compute service to limit the number of outgoing concurrent live
|
||||
migrations from the host. It is implemented via a thread pool. So increasing
|
||||
the number of concurrent live migrations will increase the nova-compute
|
||||
service memory consumption in native threading mode. It is almost always
|
||||
a bad idea to change this config option from its default value, 1. If
|
||||
more performant live migration is needed then enable
|
||||
:oslo.config:option:`libvirt.live_migration_parallel_connections` instead.
|
||||
|
||||
Seeing the usage of the pools
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
||||
+37
-15
@@ -39,7 +39,6 @@ import typing as ty
|
||||
|
||||
from cinderclient import exceptions as cinder_exception
|
||||
from cursive import exception as cursive_exception
|
||||
import futurist
|
||||
from keystoneauth1 import exceptions as keystone_exception
|
||||
from openstack import exceptions as sdk_exc
|
||||
import os_traits
|
||||
@@ -667,9 +666,10 @@ class ComputeManager(manager.Manager):
|
||||
self.compute_task_api = conductor.ComputeTaskAPI()
|
||||
self.query_client = query.SchedulerQueryClient()
|
||||
self.instance_events = InstanceEvents()
|
||||
self._sync_power_executor = futurist.GreenThreadPoolExecutor(
|
||||
self._sync_power_executor = nova.utils.create_executor(
|
||||
max_workers=CONF.sync_power_state_pool_size)
|
||||
self._syncs_in_progress = {}
|
||||
self._syncs_in_progress: set[str] = set()
|
||||
self._syncs_in_progress_lock = threading.Lock()
|
||||
self.send_instance_updates = (
|
||||
CONF.filter_scheduler.track_instance_changes)
|
||||
if CONF.max_concurrent_builds != 0:
|
||||
@@ -683,11 +683,27 @@ class ComputeManager(manager.Manager):
|
||||
else:
|
||||
self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
|
||||
if CONF.max_concurrent_live_migrations > 0:
|
||||
self._live_migration_executor = futurist.GreenThreadPoolExecutor(
|
||||
self._live_migration_executor = nova.utils.create_executor(
|
||||
max_workers=CONF.max_concurrent_live_migrations)
|
||||
else:
|
||||
# CONF.max_concurrent_live_migrations is 0 (unlimited)
|
||||
self._live_migration_executor = futurist.GreenThreadPoolExecutor()
|
||||
# setting CONF.max_concurrent_live_migrations to 0 (unlimited)
|
||||
# is deprecated but still supported, so we need to use a sane
|
||||
# default value for each threading mode
|
||||
LOG.warning("Nova compute deprecated the support of unlimited "
|
||||
"parallel live migration so "
|
||||
"[DEFAULT]max_concurrent_live_migrations configured "
|
||||
"with value 0 is deprecated and will not be supported "
|
||||
"in future releases. Please set an explicit positive"
|
||||
"value to this config option instead.")
|
||||
if utils.concurrency_mode_threading():
|
||||
self._live_migration_executor = nova.utils.create_executor(
|
||||
max_workers=5)
|
||||
else:
|
||||
# In eventlet mode we need to keep backward compatibility and
|
||||
# 1000 greenthreads to emulate unlimited.
|
||||
self._live_migration_executor = nova.utils.create_executor(
|
||||
max_workers=1000)
|
||||
|
||||
# This is a dict, keyed by instance uuid, to a two-item tuple of
|
||||
# migration object and Future for the queued live migration.
|
||||
self._waiting_live_migrations = {}
|
||||
@@ -706,6 +722,11 @@ class ComputeManager(manager.Manager):
|
||||
self.rt = resource_tracker.ResourceTracker(
|
||||
self.host, self.driver, reportclient=self.reportclient)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def syncs_in_progress(self) -> ty.Iterator[set[str]]:
|
||||
with self._syncs_in_progress_lock:
|
||||
yield self._syncs_in_progress
|
||||
|
||||
def reset(self):
|
||||
LOG.info('Reloading compute RPC API')
|
||||
compute_rpcapi.reset_globals()
|
||||
@@ -11031,20 +11052,21 @@ class ComputeManager(manager.Manager):
|
||||
LOG.exception("Periodic sync_power_state task had an "
|
||||
"error while processing an instance.",
|
||||
instance=db_instance)
|
||||
|
||||
self._syncs_in_progress.pop(db_instance.uuid)
|
||||
with self.syncs_in_progress() as syncs:
|
||||
syncs.remove(db_instance.uuid)
|
||||
|
||||
for db_instance in db_instances:
|
||||
# process syncs asynchronously - don't want instance locking to
|
||||
# block entire periodic task thread
|
||||
uuid = db_instance.uuid
|
||||
if uuid in self._syncs_in_progress:
|
||||
LOG.debug('Sync already in progress for %s', uuid)
|
||||
else:
|
||||
LOG.debug('Triggering sync for uuid %s', uuid)
|
||||
self._syncs_in_progress[uuid] = True
|
||||
nova.utils.spawn_on(
|
||||
self._sync_power_executor, _sync, db_instance)
|
||||
with self.syncs_in_progress() as syncs:
|
||||
if uuid in syncs:
|
||||
LOG.debug('Sync already in progress for %s', uuid)
|
||||
else:
|
||||
LOG.debug('Triggering sync for uuid %s', uuid)
|
||||
syncs.add(uuid)
|
||||
nova.utils.spawn_on(
|
||||
self._sync_power_executor, _sync, db_instance)
|
||||
|
||||
def _query_driver_power_state_and_sync(self, context, db_instance):
|
||||
if db_instance.task_state is not None:
|
||||
|
||||
+10
-5
@@ -694,7 +694,12 @@ that doing so is safe and stable in your environment.
|
||||
|
||||
Possible values:
|
||||
|
||||
* 0 : treated as unlimited.
|
||||
* ``0``: Deprecated since 33.0.0 (2026.1 Gazpacho). This value was previously
|
||||
documented as meaning unlimited but the actual implementation used maximum
|
||||
1000 greenthreads. Since this release, the implementation keeps using 1000
|
||||
greenthreads in eventlet mode and will use 5 native threads in threading
|
||||
mode. In a future release when eventlet support is removed, 0 as a valid
|
||||
value will also be removed.
|
||||
* Any positive integer representing maximum number of live migrations
|
||||
to run concurrently.
|
||||
"""),
|
||||
@@ -732,9 +737,9 @@ Related options:
|
||||
checks
|
||||
"""),
|
||||
cfg.IntOpt('sync_power_state_pool_size',
|
||||
default=1000,
|
||||
default=5,
|
||||
help="""
|
||||
Number of greenthreads available for use to sync power states.
|
||||
Number of threads available for use to sync instance power states.
|
||||
|
||||
This option can be used to reduce the number of concurrent requests
|
||||
made to the hypervisor or system with real instance power states
|
||||
@@ -742,8 +747,8 @@ for performance reasons, for example, with Ironic.
|
||||
|
||||
Possible values:
|
||||
|
||||
* Any positive integer representing greenthreads count.
|
||||
""")
|
||||
* Any positive integer representing threads count.
|
||||
"""),
|
||||
]
|
||||
|
||||
compute_group_opts = [
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
"""Tests for compute service."""
|
||||
|
||||
import datetime
|
||||
import threading
|
||||
|
||||
import fixtures as std_fixtures
|
||||
from itertools import chain
|
||||
import operator
|
||||
@@ -1661,7 +1663,14 @@ class ComputeTestCase(BaseTestCase,
|
||||
def setUp(self):
|
||||
super(ComputeTestCase, self).setUp()
|
||||
self.compute._live_migration_executor = futurist.SynchronousExecutor()
|
||||
# NOTE(gibi): the _sync_power_states periodic task in the
|
||||
# ComputeManager spawns concurrent tasks and uses a lock to
|
||||
# synchronize a shared data structure. As the spawn is made
|
||||
# synchronous, meaning the tasks run on the caller thread. This means
|
||||
# the simple lock causes a deadlock in the unit test. Upgrade that lock
|
||||
# to be reentrant so the test can pass with synchronous spawn.
|
||||
self.useFixture(fixtures.SpawnIsSynchronousFixture())
|
||||
self.compute._syncs_in_progress_lock = threading.RLock()
|
||||
|
||||
self.image_api = image_api.API()
|
||||
self.default_flavor = objects.Flavor.get_by_name(self.context,
|
||||
|
||||
@@ -72,6 +72,7 @@ from nova.tests.unit import fake_network_cache_model
|
||||
from nova.tests.unit.objects import test_instance_fault
|
||||
from nova.tests.unit.objects import test_instance_info_cache
|
||||
from nova.tests.unit.objects import test_instance_numa
|
||||
from nova import utils
|
||||
from nova.virt.block_device import DriverVolumeBlockDevice as driver_bdm_volume
|
||||
from nova.virt import driver as virt_driver
|
||||
from nova.virt import event as virtevent
|
||||
@@ -4288,6 +4289,18 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
|
||||
power_state.NOSTATE,
|
||||
use_slave=True)
|
||||
|
||||
def test_syncs_in_progress(self):
|
||||
self.assertFalse(self.compute._syncs_in_progress_lock.locked())
|
||||
self.compute._syncs_in_progress.add("fake-uuid")
|
||||
|
||||
with self.compute.syncs_in_progress() as syncs:
|
||||
self.assertTrue(self.compute._syncs_in_progress_lock.locked())
|
||||
self.assertEqual({"fake-uuid"}, syncs)
|
||||
syncs.remove("fake-uuid")
|
||||
|
||||
self.assertFalse(self.compute._syncs_in_progress_lock.locked())
|
||||
self.assertEqual(set(), self.compute._syncs_in_progress)
|
||||
|
||||
def test_cleanup_running_deleted_instances_virt_driver_not_ready(self):
|
||||
"""Tests the scenario that the driver raises VirtDriverNotReady
|
||||
when listing instances so the task returns early.
|
||||
@@ -11743,7 +11756,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
|
||||
def test_max_concurrent_live_semaphore_unlimited(self):
|
||||
self.flags(max_concurrent_live_migrations=0)
|
||||
mgr = manager.ComputeManager()
|
||||
self.assertEqual(1000, mgr._live_migration_executor._max_workers)
|
||||
if utils.concurrency_mode_threading():
|
||||
self.assertEqual(5, mgr._live_migration_executor._max_workers)
|
||||
else:
|
||||
self.assertEqual(1000, mgr._live_migration_executor._max_workers)
|
||||
|
||||
@mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
|
||||
side_effect=exception.InstanceGroupNotFound(group_uuid='')))
|
||||
|
||||
+28
@@ -0,0 +1,28 @@
|
||||
---
|
||||
upgrade:
|
||||
- |
|
||||
The meaning of the 0 value of the config option
|
||||
``[DEFAULT]max_concurrent_live_migrations`` has been changed. In the past
|
||||
the implementation of the meaning of "unlimited" used maximum 1000
|
||||
concurrent worker greenthreads. For eventlet mode this behavior is kept but
|
||||
for the native threading mode it is now reduced to 5 native threads. It is
|
||||
almost always a bad idea to change this config option from its default value, 1.
|
||||
Please read the `concurrency
|
||||
<https://docs.openstack.org/nova/latest/admin/concurrency.html>`__
|
||||
guide for more details.
|
||||
- |
|
||||
The default value of the configuration option
|
||||
``[DEFAULT]sync_power_state_pool_size`` is changed from 1000 to 5 to
|
||||
have a value that is safe to use in native threading mode. If you are still
|
||||
using the eventlet mode and relying on a higher value then configure that
|
||||
higher value explicitly before the upgrade. Please read the
|
||||
`concurrency <https://docs.openstack.org/nova/latest/admin/concurrency.html>`__
|
||||
guide for more details.
|
||||
deprecations:
|
||||
- |
|
||||
The possible 0 value of the configuration option
|
||||
``[DEFAULT]max_concurrent_live_migrations`` is deprecated and will be
|
||||
removed in a future release. It is almost always a bad idea to change the
|
||||
default value, 1, of this config option. If more performant live migration
|
||||
is needed, use the ``live_migration_parallel_connections`` config option
|
||||
instead.
|
||||
Reference in New Issue
Block a user