diff --git a/doc/source/admin/configuration/hypervisor-ironic.rst b/doc/source/admin/configuration/hypervisor-ironic.rst
index bba01deffa..931de0008f 100644
--- a/doc/source/admin/configuration/hypervisor-ironic.rst
+++ b/doc/source/admin/configuration/hypervisor-ironic.rst
@@ -40,11 +40,28 @@ Configuration
 Scaling and performance issues
 ------------------------------
 
+- It is typical for a single nova-compute process to support several
+  hundred Ironic nodes. There are known issues when you attempt to
+  support more than 1000 Ironic nodes associated with a single
+  nova-compute process, even though Ironic is able to scale out
+  a single conductor group to much larger sizes. Many other
+  factors can affect the maximum practical size of a conductor
+  group within your deployment.
 - The ``update_available_resource`` periodic task reports all the resources
   managed by Ironic. Depending the number of nodes, it can take a lot of time.
   The nova-compute will not perform any other operations when this task is
-  running. You can use conductor groups to help scale, by setting
-  :oslo.config:option:`ironic.partition_key`.
+  running. You can use conductor groups to help shard your deployment
+  between multiple nova-compute processes by setting
+  :oslo.config:option:`ironic.conductor_group`.
+- The nova-compute process using the Ironic driver can be moved between
+  different physical servers using active/passive failover. But when doing
+  this failover, you must ensure :oslo.config:option:`host` is the same
+  no matter where the nova-compute process is running. Similarly, you must
+  ensure there is at most one nova-compute process running for each
+  conductor group.
+- Running multiple nova-compute processes that point at the same
+  conductor group is now deprecated. Please never have more than one
+  host in the peer list: :oslo.config:option:`ironic.peer_list`
 
 Known limitations / Missing features
diff --git a/nova/conf/ironic.py b/nova/conf/ironic.py
index 2734f2b78a..ca1655dcc2 100644
--- a/nova/conf/ironic.py
+++ b/nova/conf/ironic.py
@@ -69,7 +69,8 @@ Related options:
         help='Timeout (seconds) to wait for node serial console state '
              'changed. Set to 0 to disable timeout.'),
     cfg.StrOpt(
-        'partition_key',
+        'conductor_group',
+        deprecated_name='partition_key',
         default=None,
         mutable=True,
         max_length=255,
@@ -83,13 +84,18 @@ Related options:
             'leaving the option unset.'),
     cfg.ListOpt(
         'peer_list',
+        deprecated_for_removal=True,
+        deprecated_since='28.0.0',
+        deprecated_reason="""\
+We do not recommend using nova-compute HA; please use passive
+failover of a single nova-compute service instead.""",
         default=[],
         mutable=True,
         help='List of hostnames for all nova-compute services (including '
-             'this host) with this partition_key config value. '
-             'Nodes matching the partition_key value will be distributed '
+             'this host) with this conductor_group config value. '
+             'Nodes matching the conductor_group value will be distributed '
              'between all services specified here. '
-             'If partition_key is unset, this option is ignored.'),
+             'If conductor_group is unset, this option is ignored.'),
 ]
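For operators picking up this rename, a minimal ``nova.conf`` sketch of the shape the doc change above describes (the hostname and group name are illustrative, not defaults):

```ini
[DEFAULT]
# Service identity; keep this stable across active/passive failover.
host = ironic-compute-1

[ironic]
# Renamed option: the old partition_key key still parses via
# deprecated_name, but logs a deprecation warning.
conductor_group = rack-42
# Deprecated option: list at most this service's own hostname.
peer_list = ironic-compute-1
```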
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 52aa37ac13..e0bc80c428 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -3021,7 +3021,7 @@ class HashRingTestCase(test.NoDBTestCase):
         services = ['host1', 'host2', 'host3']
         expected_hosts = {'host1', 'host2'}
         self.mock_is_up.return_value = True
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=['host1', 'host2'], group='ironic')
         self._test__refresh_hash_ring(services, expected_hosts,
                                       uncalled=['host3'])
@@ -3034,7 +3034,7 @@ class HashRingTestCase(test.NoDBTestCase):
         services = ['host1', 'host2', 'host3']
         expected_hosts = {'host1', 'host2', 'host3'}
         self.mock_is_up.return_value = True
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=['host1', 'host2'], group='ironic')
         self._test__refresh_hash_ring(services, expected_hosts,
                                       uncalled=['host3'])
@@ -3043,17 +3043,17 @@ class HashRingTestCase(test.NoDBTestCase):
     @mock.patch.object(ironic_driver, 'LOG', autospec=True)
     def test__check_peer_list(self, mock_log):
         self.flags(host='host1')
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=['host1', 'host2'], group='ironic')
         ironic_driver._check_peer_list()
-        # happy path, nothing happens
+        # warn as we have two hosts in the list
+        self.assertTrue(mock_log.warning.called)
         self.assertFalse(mock_log.error.called)
-        self.assertFalse(mock_log.warning.called)
 
     @mock.patch.object(ironic_driver, 'LOG', autospec=True)
     def test__check_peer_list_empty(self, mock_log):
         self.flags(host='host1')
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=[], group='ironic')
         self.assertRaises(exception.InvalidPeerList,
                           ironic_driver._check_peer_list)
@@ -3062,7 +3062,7 @@ class HashRingTestCase(test.NoDBTestCase):
     @mock.patch.object(ironic_driver, 'LOG', autospec=True)
     def test__check_peer_list_missing_self(self, mock_log):
         self.flags(host='host1')
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=['host2'], group='ironic')
         self.assertRaises(exception.InvalidPeerList,
                           ironic_driver._check_peer_list)
@@ -3071,10 +3071,12 @@ class HashRingTestCase(test.NoDBTestCase):
     @mock.patch.object(ironic_driver, 'LOG', autospec=True)
     def test__check_peer_list_only_self(self, mock_log):
         self.flags(host='host1')
-        self.flags(partition_key='not-none', group='ironic')
+        self.flags(conductor_group='not-none', group='ironic')
         self.flags(peer_list=['host1'], group='ironic')
         ironic_driver._check_peer_list()
-        self.assertTrue(mock_log.warning.called)
+        # happy path, nothing happens
+        self.assertFalse(mock_log.error.called)
+        self.assertFalse(mock_log.warning.called)
 
 
 class NodeCacheTestCase(test.NoDBTestCase):
@@ -3184,7 +3186,7 @@ class NodeCacheTestCase(test.NoDBTestCase):
         ]
         hosts = [self.host, self.host, self.host]
         partition_key = 'some-group'
-        self.flags(partition_key=partition_key, group='ironic')
+        self.flags(conductor_group=partition_key, group='ironic')
 
         self._test__refresh_cache(instances, nodes, hosts,
                                   partition_key=partition_key)
@@ -3205,7 +3207,7 @@ class NodeCacheTestCase(test.NoDBTestCase):
         ]
         hosts = [self.host, self.host, self.host]
         partition_key = 'some-group'
-        self.flags(partition_key=partition_key, group='ironic')
+        self.flags(conductor_group=partition_key, group='ironic')
 
         self._test__refresh_cache(instances, nodes, hosts,
                                   partition_key=partition_key,
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 960acf06a8..48fc349191 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -133,7 +133,7 @@ def _log_ironic_polling(what, node, instance):
 
 def _check_peer_list():
     # these configs are mutable; need to check at runtime and init
-    if CONF.ironic.partition_key is not None:
+    if CONF.ironic.conductor_group is not None:
         peer_list = set(CONF.ironic.peer_list)
         if not peer_list:
             LOG.error('FATAL: Peer list is not configured in the '
@@ -145,11 +145,10 @@ def _check_peer_list():
                       'compute service hostname (%s); add it to '
                       'the [ironic]/peer_list option.', CONF.host)
             raise exception.InvalidPeerList(host=CONF.host)
-        if set([CONF.host]) == peer_list:
-            LOG.warning('This compute service (%s) is the only service '
-                        'present in the [ironic]/peer_list option. '
-                        'Are you sure this should not include more '
-                        'hosts?', CONF.host)
+        if len(peer_list) > 1:
+            LOG.warning('Having multiple compute services in your '
+                        'peer_list is now deprecated. We recommend moving '
+                        'to just a single node in your peer list.')
 
 
 class IronicDriver(virt_driver.ComputeDriver):
@@ -706,8 +705,8 @@ class IronicDriver(virt_driver.ComputeDriver):
         # NOTE(jroll) if this is set, we need to limit the set of other
         # compute services in the hash ring to hosts that are currently up
         # and specified in the peer_list config option, as there's no way
-        # to check which partition_key other compute services are using.
-        if CONF.ironic.partition_key is not None:
+        # to check which conductor_group other compute services are using.
+        if CONF.ironic.conductor_group is not None:
             try:
                 # NOTE(jroll) first we need to make sure the Ironic API can
                 # filter by conductor_group. If it cannot, limiting to
@@ -741,6 +740,12 @@ class IronicDriver(virt_driver.ComputeDriver):
             # table will be here so far, and we might be brand new.
             services.add(CONF.host.lower())
 
+        if len(services) > 1:
+            LOG.warning('Having multiple compute services in your '
+                        'deployment, for a single conductor group, '
+                        'is now deprecated. We recommend moving '
+                        'to just a single ironic nova compute service.')
+
         self.hash_ring = hash_ring.HashRing(services,
                                             partitions=_HASH_RING_PARTITIONS)
         LOG.debug('Hash ring members are %s', services)
@@ -759,18 +764,18 @@ class IronicDriver(virt_driver.ComputeDriver):
             # depending on if any filter parameters are applied.
             return self._get_node_list(fields=_NODE_FIELDS, **kwargs)
 
-        # NOTE(jroll) if partition_key is set, we need to limit nodes that
+        # NOTE(jroll) if conductor_group is set, we need to limit nodes that
         # can be managed to nodes that have a matching conductor_group
        # attribute. If the API isn't new enough to support conductor groups,
         # we fall back to managing all nodes. If it is new enough, we can
         # filter it in the API.
-        partition_key = CONF.ironic.partition_key
-        if partition_key is not None:
+        conductor_group = CONF.ironic.conductor_group
+        if conductor_group is not None:
            try:
                 self._can_send_version(min_version='1.46')
-                nodes = _get_node_list(conductor_group=partition_key)
+                nodes = _get_node_list(conductor_group=conductor_group)
                 LOG.debug('Limiting manageable ironic nodes to conductor '
-                          'group %s', partition_key)
+                          'group %s', conductor_group)
             except exception.IronicAPIVersionNotAvailable:
                 LOG.error('Required Ironic API version 1.46 is not '
                           'available to filter nodes by conductor group. '
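The driver change above swaps the old "only host in peer_list" warning for a "more than one host" warning because node-to-service assignment flows through a consistent hash ring, and with a single-entry peer list the ring degenerates to a constant mapping. A self-contained sketch of that idea (this is not nova's actual tooz-based ``HashRing``; the class and helper below are illustrative only):

```python
import bisect
import hashlib


def _hash(key: bytes) -> int:
    # Stable 64-bit position on the ring.
    return int.from_bytes(hashlib.md5(key).digest()[:8], 'big')


class TinyHashRing:
    """Map node UUIDs onto compute services using virtual nodes."""

    def __init__(self, services, replicas=32):
        # Each service gets `replicas` points on the ring.
        self._ring = sorted(
            (_hash(f'{svc}-{i}'.encode()), svc)
            for svc in services
            for i in range(replicas)
        )
        self._keys = [pos for pos, _svc in self._ring]

    def service_for(self, node_uuid: str) -> str:
        # A node belongs to the next ring point clockwise from its hash.
        idx = bisect.bisect(self._keys, _hash(node_uuid.encode()))
        return self._ring[idx % len(self._ring)][1]


# With multiple hosts, each host silently manages only a subset of nodes;
# with a single-entry peer list the ring collapses to a constant mapping,
# which is the deployment shape this change recommends.
solo = TinyHashRing({'host1'})
assert solo.service_for('1be26c0b-03f2-4d2e-ae87-c02d7f33c123') == 'host1'
```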
diff --git a/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml b/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml
new file mode 100644
index 0000000000..f4a23b63bd
--- /dev/null
+++ b/releasenotes/notes/deprecate-ironic-peer-list-ff8a502935faa045.yaml
@@ -0,0 +1,16 @@
+---
+deprecations:
+  - |
+    We have renamed ``[ironic]partition_key`` to ``[ironic]conductor_group``.
+    The config option is still used to specify which Ironic conductor group
+    the ironic driver in the nova-compute process should target.
+  - |
+    We have deprecated the configuration option ``[ironic]peer_list``, along
+    with our support for a group of ironic nova-compute processes targeting
+    a shared set of Ironic nodes. There are so many bugs in this support
+    that we now prefer statically sharding the nodes between multiple
+    nova-compute processes. Note that the ironic nova-compute process is
+    stateless, and the identity of the service is defined by the config
+    option ``[DEFAULT]host``. As such, you can use an active/passive HA
+    solution to ensure at most one nova-compute process is running for
+    each Ironic node shard.
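To make the recommended replacement concrete, a sketch of static sharding with one nova-compute service per conductor group (hostnames and group names below are hypothetical):

```ini
# --- nova.conf for the service owning conductor group "rack1" ---
# Run exactly one copy of this service; your active/passive HA tooling
# restarts it on another server, with the same config, on failure.
[DEFAULT]
host = ironic-compute-rack1

[ironic]
conductor_group = rack1
peer_list = ironic-compute-rack1

# --- nova.conf for the service owning conductor group "rack2" ---
# (a separate file for a separate nova-compute process)
# [DEFAULT]
# host = ironic-compute-rack2
# [ironic]
# conductor_group = rack2
# peer_list = ironic-compute-rack2
```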