From 3c23390cc8e9690a995b07c3ecec18a2ea381f59 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Mon, 3 Nov 2025 18:25:27 +0100
Subject: [PATCH] Compute manager to use thread pools selectively

This changes the thread pool usage of the ComputeManager to go through
the concurrency mode aware util functions.

The concurrent live migration pool had a seemingly unlimited option
when configured with the value 0, but in reality GreenThreadPool has a
default worker size of 1000. In practice it is almost never right to
have more than one live migration running concurrently. Also, with
native threading, having 1000 workers is just too costly. So we
deprecated the value 0 and changed the implementation of unlimited to
mean 5 threads in native threading mode. We kept the 1000 greenthreads
in eventlet mode for backward compatibility.

The _sync_power_states periodic task also spawns tasks for each
instance to be synced. As it uses a shared data structure across these
tasks and the caller, a lock is needed to avoid race conditions. Also,
the default pool size for these tasks is 1000 in our configuration.
That would use a lot of memory on a busy host in native threading mode.
So we changed the default value from 1000 to 5.

Change-Id: I9567d5fabdf086b5d0493103d9f6bde4f66af387
Signed-off-by: Balazs Gibizer
---
 doc/source/admin/concurrency.rst              | 19 ++++++-
 nova/compute/manager.py                       | 52 +++++++++++++------
 nova/conf/compute.py                          | 15 ++++--
 nova/tests/unit/compute/test_compute.py       |  9 ++++
 nova/tests/unit/compute/test_compute_mgr.py   | 18 ++++-
 ...rent_live_migrations-29c54c7eeb77041c.yaml | 28 ++++++++++
 6 files changed, 119 insertions(+), 22 deletions(-)
 create mode 100644 releasenotes/notes/deprecate-unlimited-max_concurrent_live_migrations-29c54c7eeb77041c.yaml

diff --git a/doc/source/admin/concurrency.rst b/doc/source/admin/concurrency.rst
index 52b12fa7cf..6b2563fdbf 100644
--- a/doc/source/admin/concurrency.rst
+++ b/doc/source/admin/concurrency.rst
@@ -43,7 +43,7 @@ Tunables for the native threading mode
 As native threads are more expensive resources than greenthreads Nova provides
 a set of configuration options to allow fine tuning the deployment based on
 load and resource constraints. The default values are selected to support a
-basic, small deployment without consuming substantially more memory resources,
+basic, small deployment without consuming substantially more memory resources
 than the legacy Eventlet mode. Increasing the size of the below thread pools
 means that the given service will consume more memory but will also allow more
 tasks to be executed concurrently.
@@ -75,6 +75,23 @@ tasks to be executed concurrently.
 
   This option is relevant to every nova service using ``nova.utils.spawn()``.
 
+* :oslo.config:option:`sync_power_state_pool_size`: Used by the
+  nova-compute service to sync the power state of each instance on the host
+  between the hypervisor and the DB. Since nova 33.0.0 (2026.1 Gazpacho) the
+  default value of this option has been changed from 1000 to 5 to have a sane
+  default in native threading mode. Increasing this value in native threading
+  mode increases the nova-compute memory consumption on a host that has many
+  instances.
+
+* :oslo.config:option:`max_concurrent_live_migrations`: Used by the
+  nova-compute service to limit the number of outgoing concurrent live
+  migrations from the host. It is implemented via a thread pool, so
+  increasing the number of concurrent live migrations will increase the
+  nova-compute service memory consumption in native threading mode. It is
+  almost always a bad idea to change this config option from its default
+  value, 1. If more performant live migration is needed, then enable
+  :oslo.config:option:`libvirt.live_migration_parallel_connections` instead.
+
 
 Seeing the usage of the pools
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 834e2889c2..9961838e45 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -39,7 +39,6 @@ import typing as ty
 
 from cinderclient import exceptions as cinder_exception
 from cursive import exception as cursive_exception
-import futurist
 from keystoneauth1 import exceptions as keystone_exception
 from openstack import exceptions as sdk_exc
 import os_traits
@@ -667,9 +666,10 @@ class ComputeManager(manager.Manager):
         self.compute_task_api = conductor.ComputeTaskAPI()
         self.query_client = query.SchedulerQueryClient()
         self.instance_events = InstanceEvents()
-        self._sync_power_executor = futurist.GreenThreadPoolExecutor(
+        self._sync_power_executor = nova.utils.create_executor(
             max_workers=CONF.sync_power_state_pool_size)
-        self._syncs_in_progress = {}
+        self._syncs_in_progress: set[str] = set()
+        self._syncs_in_progress_lock = threading.Lock()
         self.send_instance_updates = (
             CONF.filter_scheduler.track_instance_changes)
         if CONF.max_concurrent_builds != 0:
@@ -683,11 +683,27 @@ class ComputeManager(manager.Manager):
         else:
             self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
         if CONF.max_concurrent_live_migrations > 0:
-            self._live_migration_executor = futurist.GreenThreadPoolExecutor(
+            self._live_migration_executor = nova.utils.create_executor(
                 max_workers=CONF.max_concurrent_live_migrations)
         else:
-            # CONF.max_concurrent_live_migrations is 0 (unlimited)
-            self._live_migration_executor = futurist.GreenThreadPoolExecutor()
+            # setting CONF.max_concurrent_live_migrations to 0 (unlimited)
+            # is deprecated but still supported, so we need to use a sane
+            # default value for each threading mode
+            LOG.warning("Nova compute deprecated the support of unlimited "
+                        "parallel live migrations, so "
+                        "[DEFAULT]max_concurrent_live_migrations configured "
+                        "with value 0 is deprecated and will not be supported "
+                        "in future releases. Please set an explicit positive "
+                        "value for this config option instead.")
+            if utils.concurrency_mode_threading():
+                self._live_migration_executor = nova.utils.create_executor(
+                    max_workers=5)
+            else:
+                # In eventlet mode we need to keep backward compatibility and
+                # 1000 greenthreads to emulate unlimited.
+                self._live_migration_executor = nova.utils.create_executor(
+                    max_workers=1000)
+
         # This is a dict, keyed by instance uuid, to a two-item tuple of
         # migration object and Future for the queued live migration.
         self._waiting_live_migrations = {}
@@ -706,6 +722,11 @@ class ComputeManager(manager.Manager):
         self.rt = resource_tracker.ResourceTracker(
             self.host, self.driver, reportclient=self.reportclient)
 
+    @contextlib.contextmanager
+    def syncs_in_progress(self) -> ty.Iterator[set[str]]:
+        with self._syncs_in_progress_lock:
+            yield self._syncs_in_progress
+
     def reset(self):
         LOG.info('Reloading compute RPC API')
         compute_rpcapi.reset_globals()
@@ -11031,20 +11052,21 @@
                 LOG.exception("Periodic sync_power_state task had an "
                               "error while processing an instance.",
                               instance=db_instance)
-
-            self._syncs_in_progress.pop(db_instance.uuid)
+            with self.syncs_in_progress() as syncs:
+                syncs.remove(db_instance.uuid)
 
         for db_instance in db_instances:
             # process syncs asynchronously - don't want instance locking to
             # block entire periodic task thread
             uuid = db_instance.uuid
-            if uuid in self._syncs_in_progress:
-                LOG.debug('Sync already in progress for %s', uuid)
-            else:
-                LOG.debug('Triggering sync for uuid %s', uuid)
-                self._syncs_in_progress[uuid] = True
-                nova.utils.spawn_on(
-                    self._sync_power_executor, _sync, db_instance)
+            with self.syncs_in_progress() as syncs:
+                if uuid in syncs:
+                    LOG.debug('Sync already in progress for %s', uuid)
+                else:
+                    LOG.debug('Triggering sync for uuid %s', uuid)
+                    syncs.add(uuid)
+                    nova.utils.spawn_on(
+                        self._sync_power_executor, _sync, db_instance)
 
     def _query_driver_power_state_and_sync(self, context, db_instance):
         if db_instance.task_state is not None:
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 382415c245..076060857b 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -694,7 +694,12 @@ that doing so is safe and stable in your environment.
 
 Possible values:
 
-* 0 : treated as unlimited.
+* ``0``: Deprecated since 33.0.0 (2026.1 Gazpacho). This value was previously
+  documented as meaning unlimited but the actual implementation used a
+  maximum of 1000 greenthreads. Since this release, the implementation keeps
+  using 1000 greenthreads in eventlet mode and uses 5 native threads in
+  threading mode. In a future release, when eventlet support is removed, 0 as
+  a valid value will also be removed.
 * Any positive integer representing maximum number of live migrations
   to run concurrently.
 """),
@@ -732,9 +737,9 @@ Related options:
       checks
 """),
     cfg.IntOpt('sync_power_state_pool_size',
-        default=1000,
+        default=5,
         help="""
-Number of greenthreads available for use to sync power states.
+Number of threads available for use to sync instance power states.
 
 This option can be used to reduce the number of concurrent requests
 made to the hypervisor or system with real instance power states
@@ -742,8 +747,8 @@ for performance reasons, for example, with Ironic.
 
 Possible values:
 
-* Any positive integer representing greenthreads count.
-""")
+* Any positive integer representing the thread count.
+"""), ] compute_group_opts = [ diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 3aa0e138ea..0eb7be87aa 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -18,6 +18,8 @@ """Tests for compute service.""" import datetime +import threading + import fixtures as std_fixtures from itertools import chain import operator @@ -1661,7 +1663,14 @@ class ComputeTestCase(BaseTestCase, def setUp(self): super(ComputeTestCase, self).setUp() self.compute._live_migration_executor = futurist.SynchronousExecutor() + # NOTE(gibi): the _sync_power_states periodic task in the + # ComputeManager spawning concurrent tasks and uses a lock to + # synchronize a shared data structure. As the spawn is made + # synchronous meaning the tasks runs on the caller thread. This means + # the simple lock causes a deadlock in the unit test. Upgrade that lock + # to be reentrant so the test can pass with synchronous spawn. self.useFixture(fixtures.SpawnIsSynchronousFixture()) + self.compute._syncs_in_progress_lock = threading.RLock() self.image_api = image_api.API() self.default_flavor = objects.Flavor.get_by_name(self.context, diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index b5d2b3820c..ceb04aa336 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -72,6 +72,7 @@ from nova.tests.unit import fake_network_cache_model from nova.tests.unit.objects import test_instance_fault from nova.tests.unit.objects import test_instance_info_cache from nova.tests.unit.objects import test_instance_numa +from nova import utils from nova.virt.block_device import DriverVolumeBlockDevice as driver_bdm_volume from nova.virt import driver as virt_driver from nova.virt import event as virtevent @@ -4288,6 +4289,18 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, power_state.NOSTATE, use_slave=True) + def test_syncs_in_progress(self): + self.assertFalse(self.compute._syncs_in_progress_lock.locked()) + self.compute._syncs_in_progress.add("fake-uuid") + + with self.compute.syncs_in_progress() as syncs: + self.assertTrue(self.compute._syncs_in_progress_lock.locked()) + self.assertEqual({"fake-uuid"}, syncs) + syncs.remove("fake-uuid") + + self.assertFalse(self.compute._syncs_in_progress_lock.locked()) + self.assertEqual(set(), self.compute._syncs_in_progress) + def test_cleanup_running_deleted_instances_virt_driver_not_ready(self): """Tests the scenario that the driver raises VirtDriverNotReady when listing instances so the task returns early. 
@@ -11743,7 +11756,10 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
     def test_max_concurrent_live_semaphore_unlimited(self):
         self.flags(max_concurrent_live_migrations=0)
         mgr = manager.ComputeManager()
-        self.assertEqual(1000, mgr._live_migration_executor._max_workers)
+        if utils.concurrency_mode_threading():
+            self.assertEqual(5, mgr._live_migration_executor._max_workers)
+        else:
+            self.assertEqual(1000, mgr._live_migration_executor._max_workers)
 
     @mock.patch('nova.objects.InstanceGroup.get_by_instance_uuid', mock.Mock(
         side_effect=exception.InstanceGroupNotFound(group_uuid='')))
diff --git a/releasenotes/notes/deprecate-unlimited-max_concurrent_live_migrations-29c54c7eeb77041c.yaml b/releasenotes/notes/deprecate-unlimited-max_concurrent_live_migrations-29c54c7eeb77041c.yaml
new file mode 100644
index 0000000000..fc37edecda
--- /dev/null
+++ b/releasenotes/notes/deprecate-unlimited-max_concurrent_live_migrations-29c54c7eeb77041c.yaml
@@ -0,0 +1,28 @@
+---
+upgrade:
+  - |
+    The meaning of the value 0 of the config option
+    ``[DEFAULT]max_concurrent_live_migrations`` has been changed. In the
+    past the implementation of "unlimited" used a maximum of 1000 concurrent
+    worker greenthreads. For eventlet mode this behavior is kept, but for
+    the native threading mode it is now reduced to 5 native threads. It is
+    almost always a bad idea to change this config option from its default
+    value, 1. Please read the `concurrency
+    `__
+    guide for more details.
+  - |
+    The default value of the configuration option
+    ``[DEFAULT]sync_power_state_pool_size`` has been changed from 1000 to 5
+    to have a value that is safe to use in native threading mode. If you are
+    still using the eventlet mode and relying on a higher value, then
+    configure that higher value explicitly before the upgrade. Please read
+    the `concurrency `__
+    guide for more details.
+deprecations:
+  - |
+    The value 0 of the configuration option
+    ``[DEFAULT]max_concurrent_live_migrations`` is deprecated and will be
+    removed in a future release. It is almost always a bad idea to change
+    the default value, 1, of this config option. If more performant live
+    migration is needed, use the ``[libvirt]live_migration_parallel_connections``
+    config option instead.
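
The locking pattern introduced by this change can be summarised with a small
standalone sketch. It is not part of the patch: the ``SyncTracker`` class and
the helper functions below are hypothetical and only mirror the
``ComputeManager`` attributes. It shows why the check-then-add in the periodic
task and the removal in the per-instance task must run under the same lock::

    import contextlib
    import threading
    import typing as ty


    class SyncTracker:
        """Minimal sketch of the lock-guarded set used by ComputeManager."""

        def __init__(self) -> None:
            self._syncs_in_progress: set[str] = set()
            self._syncs_in_progress_lock = threading.Lock()

        @contextlib.contextmanager
        def syncs_in_progress(self) -> ty.Iterator[set[str]]:
            # Hold the lock for the whole body of the caller's ``with`` block
            # so the check-then-add below cannot race with the removal done
            # by the finished sync task.
            with self._syncs_in_progress_lock:
                yield self._syncs_in_progress


    def maybe_start_sync(tracker: SyncTracker, uuid: str) -> bool:
        """Return True if this call claimed the sync for ``uuid``."""
        with tracker.syncs_in_progress() as syncs:
            if uuid in syncs:
                return False  # a sync for this instance is already running
            syncs.add(uuid)
            return True


    def finish_sync(tracker: SyncTracker, uuid: str) -> None:
        with tracker.syncs_in_progress() as syncs:
            syncs.remove(uuid)


    if __name__ == "__main__":
        tracker = SyncTracker()
        assert maybe_start_sync(tracker, "fake-uuid")      # first caller wins
        assert not maybe_start_sync(tracker, "fake-uuid")  # duplicate skipped
        finish_sync(tracker, "fake-uuid")
        assert maybe_start_sync(tracker, "fake-uuid")      # claimable again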