diff --git a/nova/compute/pci_placement_translator.py b/nova/compute/pci_placement_translator.py index 36ab2152b1..7ee2e097ec 100644 --- a/nova/compute/pci_placement_translator.py +++ b/nova/compute/pci_placement_translator.py @@ -11,11 +11,14 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. +import collections +import copy import typing as ty import os_resource_classes import os_traits from oslo_log import log as logging +from oslo_utils import uuidutils from nova.compute import provider_tree from nova import exception @@ -126,6 +129,10 @@ class PciResourceProvider: def devs(self) -> ty.List[pci_device.PciDevice]: return [self.parent_dev] if self.parent_dev else self.children_devs + @property + def to_be_deleted(self): + return not bool(self.devs) + def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None: if self.parent_dev: raise exception.PlacementPciDependentDeviceException( @@ -177,13 +184,34 @@ class PciResourceProvider: # Nothing to do here. The update_provider_tree we handle full RP pass + def _get_allocations(self) -> ty.Mapping[str, int]: + """Return a dict of used resources keyed by consumer UUID. + + Note that: + 1) a single consumer can consume more than one resource from a single + RP. I.e. A VM with two VFs from the same parent PF + 2) multiple consumers can consume resources from a single RP. I.e. two + VMs consuming one VF from the same PF each + 3) regardless of how many consumers we have on a single PCI RP, they + are always consuming resources from the same resource class as + we are not supporting dependent devices modelled by the same RP but + different resource classes. + """ + return collections.Counter( + [ + dev.instance_uuid + for dev in self.devs + if "instance_uuid" in dev and dev.instance_uuid + ] + ) + def update_provider_tree( self, provider_tree: provider_tree.ProviderTree, parent_rp_name: str, ) -> None: - if not self.parent_dev and not self.children_devs: + if self.to_be_deleted: # This means we need to delete the RP from placement if exists if provider_tree.exists(self.name): # NOTE(gibi): If there are allocations on this RP then @@ -194,7 +222,16 @@ class PciResourceProvider: return if not provider_tree.exists(self.name): - provider_tree.new_child(self.name, parent_rp_name) + # NOTE(gibi): We need to generate UUID for the new provider in Nova + # instead of letting Placement assign one. We are potentially + # healing a missing RP along with missing allocations on that RP. + # The allocation healing happens with POST /reshape, and that API + # only takes RP UUIDs. + provider_tree.new_child( + self.name, + parent_rp_name, + uuid=uuidutils.generate_uuid(dashed=True) + ) provider_tree.update_inventory( self.name, @@ -214,6 +251,43 @@ class PciResourceProvider: ) provider_tree.update_traits(self.name, self.traits) + def update_allocations( + self, + allocations: dict, + provider_tree: provider_tree.ProviderTree + ) -> bool: + updated = False + + if self.to_be_deleted: + # the RP is going away because either removed from the hypervisor + # or the compute's config is changed to ignore the device. + return updated + + # we assume here that if this RP has been created in the current round + # of healing then it already has a UUID assigned. + rp_uuid = provider_tree.data(self.name).uuid + + for consumer, amount in self._get_allocations().items(): + current_allocs = allocations[consumer]['allocations'] + current_rp_allocs = current_allocs.get(rp_uuid) + + if current_rp_allocs: + # update an existing allocation if the current one differs + current_rc_allocs = current_rp_allocs["resources"].get( + self.resource_class, 0) + if current_rc_allocs != amount: + current_rp_allocs[ + "resources"][self.resource_class] = amount + updated = True + else: + # insert a new allocation as it is missing + current_allocs[rp_uuid] = { + "resources": {self.resource_class: amount} + } + updated = True + + return updated + def __str__(self) -> str: if self.devs: return ( @@ -348,6 +422,21 @@ class PlacementView: for rp_name, rp in self.rps.items(): rp.update_provider_tree(provider_tree, self.root_rp_name) + def update_allocations( + self, + allocations: dict, + provider_tree: provider_tree.ProviderTree + ) -> bool: + """Updates the passed in allocations dict inplace with any PCI + allocations that is inferred from the PciDevice objects already added + to the view. It returns True if the allocations dict has been changed, + False otherwise. + """ + updated = False + for rp in self.rps.values(): + updated |= rp.update_allocations(allocations, provider_tree) + return updated + def ensure_no_dev_spec_with_devname(dev_specs: ty.List[devspec.PciDeviceSpec]): for dev_spec in dev_specs: @@ -437,7 +526,16 @@ def update_provider_tree_for_pci( LOG.info("Placement PCI resource view: %s", pv) pv.update_provider_tree(provider_tree) - # FIXME(gibi): Check allocations too based on pci_dev.instance_uuid and - # if here was any update then we have to return True to trigger a reshape. + old_alloc = copy.deepcopy(allocations) + updated = pv.update_allocations(allocations, provider_tree) - return False + if updated: + LOG.debug( + "Placement PCI view needs allocation healing. This should only " + "happen if [scheduler]pci_in_placement is still disabled. " + "Original allocations: %s New allocations: %s", + old_alloc, + allocations, + ) + + return updated diff --git a/nova/tests/functional/libvirt/test_pci_in_placement.py b/nova/tests/functional/libvirt/test_pci_in_placement.py index 065da2b147..e2b69cb1e2 100644 --- a/nova/tests/functional/libvirt/test_pci_in_placement.py +++ b/nova/tests/functional/libvirt/test_pci_in_placement.py @@ -62,6 +62,9 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase): def _to_device_spec_conf(spec_list): return [jsonutils.dumps(x) for x in spec_list] + +class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): + def test_new_compute_init_with_pci_devs(self): """A brand new compute is started with multiple pci devices configured for nova. @@ -748,3 +751,188 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase): "enabled.", str(ex) ) + + +class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests): + def setUp(self): + super().setUp() + # Pre-configure a PCI alias to consume our devs + alias_pci = { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PCI_PROD_ID, + "name": "a-pci-dev", + } + alias_pf = { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PF_PROD_ID, + "device_type": "type-PF", + "name": "a-pf", + } + alias_vf = { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "device_type": "type-VF", + "name": "a-vf", + } + self.flags( + group='pci', + alias=self._to_pci_alias_conf([alias_pci, alias_pf, alias_vf])) + + @staticmethod + def _to_pci_alias_conf(alias_list): + return [jsonutils.dumps(x) for x in alias_list] + + def test_heal_single_pci_allocation(self): + # The fake libvirt will emulate on the host: + # * one type-PCI in slot 0 + pci_info = fakelibvirt.HostPCIDevicesInfo( + num_pci=1, num_pfs=0, num_vfs=0) + # the config matches the PCI dev + device_spec = self._to_device_spec_conf( + [ + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PCI_PROD_ID, + }, + ] + ) + self.flags(group='pci', device_spec=device_spec) + + # Start a compute *without* PCI tracking in placement + self.mock_pci_report_in_placement.return_value = False + self.start_compute(hostname="compute1", pci_info=pci_info) + self.assertPCIDeviceCounts("compute1", total=1, free=1) + + # Create an instance that consume our PCI dev + extra_spec = {"pci_passthrough:alias": "a-pci-dev:1"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + server = self._create_server(flavor_id=flavor_id, networks=[]) + self.assertPCIDeviceCounts("compute1", total=1, free=0) + + # Restart the compute but now with PCI tracking enabled + self.mock_pci_report_in_placement.return_value = True + self.restart_compute_service("compute1") + # Assert that the PCI allocation is healed in placement + self.assertPCIDeviceCounts("compute1", total=1, free=0) + expected_placement_view = { + "inventories": { + "0000:81:00.0": {self.PCI_RC: 1}, + }, + "traits": { + "0000:81:00.0": [], + }, + "usages": { + "0000:81:00.0": {self.PCI_RC: 1} + }, + "allocations": { + server['id']: { + "0000:81:00.0": {self.PCI_RC: 1} + } + } + } + self.assert_placement_pci_view("compute1", **expected_placement_view) + + # run an update_available_resources periodic and assert that the usage + # and allocation stays + self._run_periodics() + self.assert_placement_pci_view("compute1", **expected_placement_view) + + def test_heal_multiple_allocations(self): + # The fake libvirt will emulate on the host: + # * two type-PCI devs (slot 0 and 1) + # * two type-PFs (slot 2 and 3) with 4 type-VFs each + pci_info = fakelibvirt.HostPCIDevicesInfo( + num_pci=2, num_pfs=2, num_vfs=8) + # the config matches: + device_spec = self._to_device_spec_conf( + [ + # both type-PCI + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PCI_PROD_ID, + }, + # the PF in slot 2 + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.PF_PROD_ID, + "address": "0000:81:02.0", + }, + # the VFs in slot 3 + { + "vendor_id": fakelibvirt.PCI_VEND_ID, + "product_id": fakelibvirt.VF_PROD_ID, + "address": "0000:81:03.*", + }, + ] + ) + self.flags(group='pci', device_spec=device_spec) + + # Start a compute *without* PCI tracking in placement + self.mock_pci_report_in_placement.return_value = False + self.start_compute(hostname="compute1", pci_info=pci_info) + # 2 PCI + 1 PF + 4 VFs + self.assertPCIDeviceCounts("compute1", total=7, free=7) + + # Create three instances consuming devices: + # * server_2pci: two type-PCI + # * server_pf_vf: one PF and one VF + # * server_2vf: two VFs + extra_spec = {"pci_passthrough:alias": "a-pci-dev:2"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + server_2pci = self._create_server(flavor_id=flavor_id, networks=[]) + self.assertPCIDeviceCounts("compute1", total=7, free=5) + + extra_spec = {"pci_passthrough:alias": "a-pf:1,a-vf:1"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + server_pf_vf = self._create_server(flavor_id=flavor_id, networks=[]) + self.assertPCIDeviceCounts("compute1", total=7, free=3) + + extra_spec = {"pci_passthrough:alias": "a-vf:2"} + flavor_id = self._create_flavor(extra_spec=extra_spec) + server_2vf = self._create_server(flavor_id=flavor_id, networks=[]) + self.assertPCIDeviceCounts("compute1", total=7, free=1) + + # Restart the compute but now with PCI tracking enabled + self.mock_pci_report_in_placement.return_value = True + self.restart_compute_service("compute1") + # Assert that the PCI allocation is healed in placement + self.assertPCIDeviceCounts("compute1", total=7, free=1) + expected_placement_view = { + "inventories": { + "0000:81:00.0": {self.PCI_RC: 1}, + "0000:81:01.0": {self.PCI_RC: 1}, + "0000:81:02.0": {self.PF_RC: 1}, + "0000:81:03.0": {self.VF_RC: 4}, + }, + "traits": { + "0000:81:00.0": [], + "0000:81:01.0": [], + "0000:81:02.0": [], + "0000:81:03.0": [], + }, + "usages": { + "0000:81:00.0": {self.PCI_RC: 1}, + "0000:81:01.0": {self.PCI_RC: 1}, + "0000:81:02.0": {self.PF_RC: 1}, + "0000:81:03.0": {self.VF_RC: 3}, + }, + "allocations": { + server_2pci['id']: { + "0000:81:00.0": {self.PCI_RC: 1}, + "0000:81:01.0": {self.PCI_RC: 1}, + }, + server_pf_vf['id']: { + "0000:81:02.0": {self.PF_RC: 1}, + "0000:81:03.0": {self.VF_RC: 1}, + }, + server_2vf['id']: { + "0000:81:03.0": {self.VF_RC: 2} + }, + }, + } + self.assert_placement_pci_view("compute1", **expected_placement_view) + + # run an update_available_resources periodic and assert that the usage + # and allocation stays + self._run_periodics() + self.assert_placement_pci_view("compute1", **expected_placement_view) diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index 4b4abb209f..20d3014eb0 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -80,7 +80,15 @@ class _PCIServersTestBase(base.ServersTestBase): return rp self.fail(f'RP {name} is not found in Placement {rps}') - def assert_placement_pci_view(self, hostname, inventories, traits): + def assert_placement_pci_view( + self, hostname, inventories, traits, usages=None, allocations=None + ): + if not usages: + usages = {} + + if not allocations: + allocations = {} + compute_rp_uuid = self.compute_rp_uuids[hostname] rps = self._get_all_rps_in_a_tree(compute_rp_uuid) @@ -120,6 +128,47 @@ class _PCIServersTestBase(base.ServersTestBase): f"Traits on RP {real_rp_name} does not match with expectation" ) + for rp_name, usage in usages.items(): + real_rp_name = f'{hostname}_{rp_name}' + rp = self._get_rp_by_name(real_rp_name, rps) + rp_usage = self._get_provider_usages(rp['uuid']) + self.assertEqual( + usage, + rp_usage, + f"Usage on RP {real_rp_name} does not match with expectation" + ) + + for consumer, expected_allocations in allocations.items(): + actual_allocations = self._get_allocations_by_server_uuid(consumer) + self.assertEqual( + len(expected_allocations), + # actual_allocations also contains allocations against the + # root provider for VCPU, MEMORY_MB, and DISK_GB so subtract + # one + len(actual_allocations) - 1, + f"The consumer {consumer} allocates from different number of " + f"RPs than expected. Expected: {expected_allocations}, " + f"Actual: {actual_allocations}" + ) + for rp_name, expected_rp_allocs in expected_allocations.items(): + real_rp_name = f'{hostname}_{rp_name}' + rp = self._get_rp_by_name(real_rp_name, rps) + self.assertIn( + rp['uuid'], + actual_allocations, + f"The consumer {consumer} expected to allocate from " + f"{rp['uuid']}. Expected: {expected_allocations}, " + f"Actual: {actual_allocations}" + ) + actual_rp_allocs = actual_allocations[rp['uuid']]['resources'] + self.assertEqual( + expected_rp_allocs, + actual_rp_allocs, + f"The consumer {consumer} expected to have allocation " + f"{expected_rp_allocs} on {rp_name} but it has " + f"{actual_rp_allocs} instead." + ) + class _PCIServersWithMigrationTestBase(_PCIServersTestBase):