From ae064caf169b6d5888a5aec692b911c57c439ca4 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Wed, 28 May 2025 15:20:44 +0200 Subject: [PATCH] Validate [pci]alias at service startup Both nova-api and nova-compute depends on the [pci]alias configuration. These services loaded and validated the config lazily when it was needed. This can late and hard to troubleshoot failures during instance lifecycle operations due to simple config errors. So this patch adds an early load of this config to nova-api and nova-compute. Related-Bug: #2102038 Related-Bug: #2111440 Change-Id: I5d5dc912ca24979067984c7cb53ceaded7daf236 --- nova/api/openstack/wsgi_app.py | 6 ++++++ nova/compute/manager.py | 4 ++++ nova/pci/request.py | 4 ++-- .../tests/unit/api/openstack/test_wsgi_app.py | 9 +++++++++ nova/tests/unit/compute/test_compute_mgr.py | 12 +++++++++++ nova/tests/unit/pci/test_request.py | 20 +++++++++---------- 6 files changed, 43 insertions(+), 12 deletions(-) diff --git a/nova/api/openstack/wsgi_app.py b/nova/api/openstack/wsgi_app.py index d6f1030f83..648d529e5f 100644 --- a/nova/api/openstack/wsgi_app.py +++ b/nova/api/openstack/wsgi_app.py @@ -26,6 +26,7 @@ from nova import config from nova import context from nova import exception from nova import objects +from nova.pci import request from nova import service from nova import utils from nova import version @@ -51,6 +52,11 @@ def _get_config_files(env=None): def _setup_service(host, name): + + # NOTE(gibi): validate the [pci]alias config early to avoid late failures + # at instance creation due to config errors. + request.get_alias_from_config() + try: utils.raise_if_old_compute() except exception.TooOldComputeService as e: diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 947de463b3..327ec6f9a8 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1620,6 +1620,10 @@ class ComputeManager(manager.Manager): # if the configuration is wrong. whitelist.Whitelist(CONF.pci.device_spec) + # NOTE(gibi): validate the [pci]alias config early to avoid late + # failures at instance lifecycle operations due to config errors. + pci_req_module.get_alias_from_config() + nova.conf.neutron.register_dynamic_opts(CONF) # Even if only libvirt uses them, make it available for all drivers nova.conf.devices.register_dynamic_opts(CONF) diff --git a/nova/pci/request.py b/nova/pci/request.py index d7c9e79243..7c7c418f01 100644 --- a/nova/pci/request.py +++ b/nova/pci/request.py @@ -173,7 +173,7 @@ def _validate_aliases(aliases): _validate_required_ids(aliases) -def _get_alias_from_config() -> Alias: +def get_alias_from_config() -> Alias: """Parse and validate PCI aliases from the nova config. :returns: A dictionary where the keys are alias names and the values are @@ -237,7 +237,7 @@ def _translate_alias_to_requests( alias_spec: str, affinity_policy: ty.Optional[str] = None, ) -> ty.List['objects.InstancePCIRequest']: """Generate complete pci requests from pci aliases in extra_spec.""" - pci_aliases = _get_alias_from_config() + pci_aliases = get_alias_from_config() pci_requests: ty.List[objects.InstancePCIRequest] = [] for name, count in [spec.split(':') for spec in alias_spec.split(',')]: diff --git a/nova/tests/unit/api/openstack/test_wsgi_app.py b/nova/tests/unit/api/openstack/test_wsgi_app.py index 7fa1eacc62..b744e4e269 100644 --- a/nova/tests/unit/api/openstack/test_wsgi_app.py +++ b/nova/tests/unit/api/openstack/test_wsgi_app.py @@ -15,6 +15,7 @@ from unittest import mock import fixtures from oslo_config import fixture as config_fixture +from oslo_serialization import jsonutils from oslotest import base from nova.api.openstack import wsgi_app @@ -127,6 +128,14 @@ document_root = /tmp group='workarounds') wsgi_app._setup_service('myhost', 'api') + def test_setup_service_pci_alias_validation(self): + wsgi_app.CONF.set_override( + 'alias', jsonutils.dumps({'name': 'foo'}), + group='pci') + self.assertRaises( + exception.PciInvalidAlias, + wsgi_app._setup_service, 'myhost', 'api') + def test__get_config_files_empty_env(self): env = {} result = wsgi_app._get_config_files(env) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 712b9f2349..2a7eb012ea 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -6954,6 +6954,18 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.assertRaises(exception.PciDeviceInvalidDeviceName, self.compute.init_host, None) + def test_init_host_pci_alias_validation_failure(self): + # Tests that we fail init_host if the pci.alias is + # configured incorrectly. + self.flags( + alias=[ + jsonutils.dumps({'name': 'foo'}) + ], + group='pci' + ) + self.assertRaises( + exception.PciInvalidAlias, self.compute.init_host, None) + @mock.patch('nova.compute.manager.ComputeManager._instance_update') def test_error_out_instance_on_exception_not_implemented_err(self, inst_update_mock): diff --git a/nova/tests/unit/pci/test_request.py b/nova/tests/unit/pci/test_request.py index d6b759f591..4a736193e8 100644 --- a/nova/tests/unit/pci/test_request.py +++ b/nova/tests/unit/pci/test_request.py @@ -81,7 +81,7 @@ class PciRequestTestCase(test.NoDBTestCase): def test_get_alias_from_config_valid(self): self.flags(alias=[_fake_alias1], group='pci') - result = request._get_alias_from_config() + result = request.get_alias_from_config() expected_result = ( 'legacy', [{ @@ -103,7 +103,7 @@ class PciRequestTestCase(test.NoDBTestCase): }) self.flags(alias=[_fake_alias1, _fake_alias], group='pci') - result = request._get_alias_from_config() + result = request.get_alias_from_config() expected_result = ( 'legacy', [{ @@ -133,7 +133,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[_fake_alias1, _fake_alias], group='pci') ex = self.assertRaises( - exception.PciInvalidAlias, request._get_alias_from_config) + exception.PciInvalidAlias, request.get_alias_from_config) self.assertEqual( "The PCI alias(es) QuickAssist have multiple specs but " "[filter_scheduler]pci_in_placement is True. The PCI in Placement " @@ -146,7 +146,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[alias], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) def test_get_alias_from_config_invalid_device_type(self): fake_alias = jsonutils.dumps({ @@ -215,7 +215,7 @@ class PciRequestTestCase(test.NoDBTestCase): "numa_policy": policy, }) self.flags(alias=[fake_alias], group='pci') - aliases = request._get_alias_from_config() + aliases = request.get_alias_from_config() self.assertIsNotNone(aliases) self.assertIn("xxx", aliases) self.assertEqual(policy, aliases["xxx"][0]) @@ -228,7 +228,7 @@ class PciRequestTestCase(test.NoDBTestCase): }) self.flags(pci_in_placement=True, group='filter_scheduler') self.flags(alias=[fake_alias], group='pci') - aliases = request._get_alias_from_config() + aliases = request.get_alias_from_config() self.assertIsNotNone(aliases) self.assertIn("xxx", aliases) self.assertEqual( @@ -256,7 +256,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[fake_alias_a, fake_alias_b], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) def test_get_alias_from_config_conflicting_numa_policy(self): """Check behavior when numa_policy conflicts occur.""" @@ -277,7 +277,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[fake_alias_a, fake_alias_b], group='pci') self.assertRaises( exception.PciInvalidAlias, - request._get_alias_from_config) + request.get_alias_from_config) def test_get_alias_from_config_missing_ids(self): a1 = jsonutils.dumps({ @@ -304,7 +304,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[a1, a2, a3, a4, a5], group='pci') ex = self.assertRaises( - exception.PciInvalidAlias, request._get_alias_from_config) + exception.PciInvalidAlias, request.get_alias_from_config) self.assertEqual( "The PCI alias(es) a1,a2,a3,a4 does not have vendor_id and " "product_id fields set.", @@ -336,7 +336,7 @@ class PciRequestTestCase(test.NoDBTestCase): self.flags(alias=[a1, a2, a3, a4, a5], group='pci') ex = self.assertRaises( - exception.PciInvalidAlias, request._get_alias_from_config) + exception.PciInvalidAlias, request.get_alias_from_config) self.assertEqual( "The PCI alias(es) a1,a2,a3 does not have vendor_id and " "product_id fields set or resource_class field set.",