From cbf400df1dc3e26f4f2e655ca47db7fb097b3531 Mon Sep 17 00:00:00 2001 From: John Garbutt Date: Wed, 17 May 2023 11:26:53 +0100 Subject: [PATCH] Deprecate ironic.peer_list As part of the move to using Ironic shards, we document that the best practice for scaling Ironic and Nova deployments is to shard Ironic nodes between nova-compute processes, rather than attempting to user the peer_list. Currently, we only allow users to do this using conductor groups. This works well for those wanting a conductor group per L2 network domain. But in general, conductor groups per nova-compute are a very poor trade off in terms of ironic deployment complexity. Futher patches will look to enable the use of ironic shards, alongside conductor groups, to more easily shard your ironic nodes between nova-compute processes. To avoid confusion, we rename the partition_key configuration value to conductor_group. blueprint ironic-shards Change-Id: Ia2e23a59dbd2f13c6f74ca975c249751bebf54b2 --- .../admin/configuration/hypervisor-ironic.rst | 21 +++++++++++-- nova/conf/ironic.py | 14 ++++++--- nova/tests/unit/virt/ironic/test_driver.py | 24 +++++++------- nova/virt/ironic/driver.py | 31 +++++++++++-------- ...ate-ironic-peer-list-ff8a502935faa045.yaml | 16 ++++++++++ 5 files changed, 76 insertions(+), 30 deletions(-) create mode 100644 releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml diff --git a/doc/source/admin/configuration/hypervisor-ironic.rst b/doc/source/admin/configuration/hypervisor-ironic.rst index bba01deffa..931de0008f 100644 --- a/doc/source/admin/configuration/hypervisor-ironic.rst +++ b/doc/source/admin/configuration/hypervisor-ironic.rst @@ -40,11 +40,28 @@ Configuration Scaling and performance issues ------------------------------ +- It is typical for a single nova-compute process to support several + hundred Ironic nodes. There are known issues when you attempt to + support more than 1000 Ironic nodes associated with a single + nova-compute process, even though Ironic is able to scale out + a single conductor group to much larger sizes. There are many + other factors that can affect what is the maximum practical size of + a conductor group within your deployment. - The ``update_available_resource`` periodic task reports all the resources managed by Ironic. Depending the number of nodes, it can take a lot of time. The nova-compute will not perform any other operations when this task is - running. You can use conductor groups to help scale, by setting - :oslo.config:option:`ironic.partition_key`. + running. You can use conductor groups to help shard your deployment + between multiple nova-compute processes by setting + :oslo.config:option:`ironic.conductor_group`. +- The nova-compute process using the Ironic driver can be moved between + different physical servers using active/passive failover. But when doing + this failover, you must ensure :oslo.config:option:`host` is the same + no matter where the nova-compute process is running. Similarly you must + ensure there are at most one nova-compute processes running for each + conductor group. +- Running multiple nova-compute processes that point at the same + conductor group is now deprecated. Please never have more than one + host in the peer list: :oslo.config:option:`ironic.peer_list` Known limitations / Missing features diff --git a/nova/conf/ironic.py b/nova/conf/ironic.py index 2734f2b78a..ca1655dcc2 100644 --- a/nova/conf/ironic.py +++ b/nova/conf/ironic.py @@ -69,7 +69,8 @@ Related options: help='Timeout (seconds) to wait for node serial console state ' 'changed. Set to 0 to disable timeout.'), cfg.StrOpt( - 'partition_key', + 'conductor_group', + deprecated_name='partition_key', default=None, mutable=True, max_length=255, @@ -83,13 +84,18 @@ Related options: 'leaving the option unset.'), cfg.ListOpt( 'peer_list', + deprecated_for_removal=True, + deprecated_since='28.0.0', + deprecated_reason="""\ + We do not recommend using nova-compute HA, please use passive + failover of a single nova-compute service instead.""", default=[], mutable=True, help='List of hostnames for all nova-compute services (including ' - 'this host) with this partition_key config value. ' - 'Nodes matching the partition_key value will be distributed ' + 'this host) with this conductor_group config value. ' + 'Nodes matching the conductor_group value will be distributed ' 'between all services specified here. ' - 'If partition_key is unset, this option is ignored.'), + 'If conductor_group is unset, this option is ignored.'), ] diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py index 52aa37ac13..e0bc80c428 100644 --- a/nova/tests/unit/virt/ironic/test_driver.py +++ b/nova/tests/unit/virt/ironic/test_driver.py @@ -3021,7 +3021,7 @@ class HashRingTestCase(test.NoDBTestCase): services = ['host1', 'host2', 'host3'] expected_hosts = {'host1', 'host2'} self.mock_is_up.return_value = True - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=['host1', 'host2'], group='ironic') self._test__refresh_hash_ring(services, expected_hosts, uncalled=['host3']) @@ -3034,7 +3034,7 @@ class HashRingTestCase(test.NoDBTestCase): services = ['host1', 'host2', 'host3'] expected_hosts = {'host1', 'host2', 'host3'} self.mock_is_up.return_value = True - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=['host1', 'host2'], group='ironic') self._test__refresh_hash_ring(services, expected_hosts, uncalled=['host3']) @@ -3043,17 +3043,17 @@ class HashRingTestCase(test.NoDBTestCase): @mock.patch.object(ironic_driver, 'LOG', autospec=True) def test__check_peer_list(self, mock_log): self.flags(host='host1') - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=['host1', 'host2'], group='ironic') ironic_driver._check_peer_list() - # happy path, nothing happens + # warn as we have two hosts in the list + self.assertTrue(mock_log.warning.called) self.assertFalse(mock_log.error.called) - self.assertFalse(mock_log.warning.called) @mock.patch.object(ironic_driver, 'LOG', autospec=True) def test__check_peer_list_empty(self, mock_log): self.flags(host='host1') - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=[], group='ironic') self.assertRaises(exception.InvalidPeerList, ironic_driver._check_peer_list) @@ -3062,7 +3062,7 @@ class HashRingTestCase(test.NoDBTestCase): @mock.patch.object(ironic_driver, 'LOG', autospec=True) def test__check_peer_list_missing_self(self, mock_log): self.flags(host='host1') - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=['host2'], group='ironic') self.assertRaises(exception.InvalidPeerList, ironic_driver._check_peer_list) @@ -3071,10 +3071,12 @@ class HashRingTestCase(test.NoDBTestCase): @mock.patch.object(ironic_driver, 'LOG', autospec=True) def test__check_peer_list_only_self(self, mock_log): self.flags(host='host1') - self.flags(partition_key='not-none', group='ironic') + self.flags(conductor_group='not-none', group='ironic') self.flags(peer_list=['host1'], group='ironic') ironic_driver._check_peer_list() - self.assertTrue(mock_log.warning.called) + # happy path, nothing happens + self.assertFalse(mock_log.error.called) + self.assertFalse(mock_log.warning.called) class NodeCacheTestCase(test.NoDBTestCase): @@ -3184,7 +3186,7 @@ class NodeCacheTestCase(test.NoDBTestCase): ] hosts = [self.host, self.host, self.host] partition_key = 'some-group' - self.flags(partition_key=partition_key, group='ironic') + self.flags(conductor_group=partition_key, group='ironic') self._test__refresh_cache(instances, nodes, hosts, partition_key=partition_key) @@ -3205,7 +3207,7 @@ class NodeCacheTestCase(test.NoDBTestCase): ] hosts = [self.host, self.host, self.host] partition_key = 'some-group' - self.flags(partition_key=partition_key, group='ironic') + self.flags(conductor_group=partition_key, group='ironic') self._test__refresh_cache(instances, nodes, hosts, partition_key=partition_key, diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py index 960acf06a8..48fc349191 100644 --- a/nova/virt/ironic/driver.py +++ b/nova/virt/ironic/driver.py @@ -133,7 +133,7 @@ def _log_ironic_polling(what, node, instance): def _check_peer_list(): # these configs are mutable; need to check at runtime and init - if CONF.ironic.partition_key is not None: + if CONF.ironic.conductor_group is not None: peer_list = set(CONF.ironic.peer_list) if not peer_list: LOG.error('FATAL: Peer list is not configured in the ' @@ -145,11 +145,10 @@ def _check_peer_list(): 'compute service hostname (%s); add it to ' 'the [ironic]/peer_list option.', CONF.host) raise exception.InvalidPeerList(host=CONF.host) - if set([CONF.host]) == peer_list: - LOG.warning('This compute service (%s) is the only service ' - 'present in the [ironic]/peer_list option. ' - 'Are you sure this should not include more ' - 'hosts?', CONF.host) + if len(peer_list) > 1: + LOG.warning('Having multiple compute services in your ' + 'peer_list is now deprecated. We recommend moving ' + 'to just a single node in your peer list.') class IronicDriver(virt_driver.ComputeDriver): @@ -706,8 +705,8 @@ class IronicDriver(virt_driver.ComputeDriver): # NOTE(jroll) if this is set, we need to limit the set of other # compute services in the hash ring to hosts that are currently up # and specified in the peer_list config option, as there's no way - # to check which partition_key other compute services are using. - if CONF.ironic.partition_key is not None: + # to check which conductor_group other compute services are using. + if CONF.ironic.conductor_group is not None: try: # NOTE(jroll) first we need to make sure the Ironic API can # filter by conductor_group. If it cannot, limiting to @@ -741,6 +740,12 @@ class IronicDriver(virt_driver.ComputeDriver): # table will be here so far, and we might be brand new. services.add(CONF.host.lower()) + if len(services) > 1: + LOG.warning('Having multiple compute services in your ' + 'deployment, for a single conductor group, ' + 'is now deprecated. We recommend moving ' + 'to just a single ironic nova compute service.') + self.hash_ring = hash_ring.HashRing(services, partitions=_HASH_RING_PARTITIONS) LOG.debug('Hash ring members are %s', services) @@ -759,18 +764,18 @@ class IronicDriver(virt_driver.ComputeDriver): # depending on if any filter parameters are applied. return self._get_node_list(fields=_NODE_FIELDS, **kwargs) - # NOTE(jroll) if partition_key is set, we need to limit nodes that + # NOTE(jroll) if conductor_group is set, we need to limit nodes that # can be managed to nodes that have a matching conductor_group # attribute. If the API isn't new enough to support conductor groups, # we fall back to managing all nodes. If it is new enough, we can # filter it in the API. - partition_key = CONF.ironic.partition_key - if partition_key is not None: + conductor_group = CONF.ironic.conductor_group + if conductor_group is not None: try: self._can_send_version(min_version='1.46') - nodes = _get_node_list(conductor_group=partition_key) + nodes = _get_node_list(conductor_group=conductor_group) LOG.debug('Limiting manageable ironic nodes to conductor ' - 'group %s', partition_key) + 'group %s', conductor_group) except exception.IronicAPIVersionNotAvailable: LOG.error('Required Ironic API version 1.46 is not ' 'available to filter nodes by conductor group. ' diff --git a/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml b/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml new file mode 100644 index 0000000000..f4a23b63bd --- /dev/null +++ b/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml @@ -0,0 +1,16 @@ +--- +deprecations: + - | + We have renamed ``[ironic]partition_key`` to ``[ironic]conductor_group``. + The config option is still used to specify which Ironic conductor group + the ironic driver in the nova compute process should target. + - | + We have deprecated the configuration ``[ironic]peer_list``, along with + our support for a group of ironic nova-compute processes targeting + a shared set of Ironic nodes. + There are so many bugs in this support we now prefer statically + sharding the nodes between multiple nova-compute processes. + Note that the ironic nova-compute process is stateless, and the + identity of the service is defined by the config option ``[DEFAULT]host``. + As such, you can use an active-passive HA solution to ensure at most + one nova-compute process is running for each Ironic node shard.