diff --git a/nova/compute/pci_placement_translator.py b/nova/compute/pci_placement_translator.py index 087369cbd1..5f7bf12151 100644 --- a/nova/compute/pci_placement_translator.py +++ b/nova/compute/pci_placement_translator.py @@ -136,14 +136,24 @@ class PciResourceProvider: self.resource_class: ty.Optional[str] = None self.traits: ty.Optional[ty.Set[str]] = None self.is_otu = False + # This is an adjustment for the total inventory based on normal device + # due to possibility of devices held in the tracker even though they + # are removed from the configuration due to still having allocations. + # This number will be calculated based on the existing allocations + # during update_provider_tree call. + self.adjustment = 0 @property def devs(self) -> ty.List[pci_device.PciDevice]: return [self.parent_dev] if self.parent_dev else self.children_devs + @property + def total(self): + return len(self.devs) + self.adjustment + @property def to_be_deleted(self): - return not bool(self.devs) + return self.total == 0 def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None: if self.parent_dev: @@ -242,8 +252,8 @@ class PciResourceProvider: # one_time_use=true flag, but otherwise the operator controls # reserved and nova will not override that value periodically. inventory = { - "total": len(self.devs), - "max_unit": len(self.devs), + "total": self.total, + "max_unit": self.total, } self._handle_one_time_use(inventory) @@ -260,28 +270,62 @@ class PciResourceProvider: # If we are an allocated parent device, and our one-time-use flag # is set, we need to also set our inventory to reserved. # NOTE(danms): VERY IMPORTANT: we never *ever* want to update - # reserved to anything other than len(self.devs), and definitely + # reserved to anything other than self.total, and definitely # not if we are not allocated. These devices are intended to go # from unallocated to allocated AND reserved. They may be # unreserved by an external entity, but never nova. - inventory['reserved'] = len(self.devs) + inventory['reserved'] = self.total + + def _adjust_for_removals_and_held_devices( + self, + provider_tree: provider_tree.ProviderTree, + rp_rc_usage: ty.Dict[str, ty.Dict[str, int]], + ) -> None: + + rp_uuid = provider_tree.data(self.name).uuid + rc_usage = rp_rc_usage[rp_uuid] + + if not self.resource_class: + # The resource_class is undefined when there are no normal devices + # exists any more on this RP. If no normal devs exists then there + # is no device_spec to derive the RC and traits from. But if we + # still have allocations in placement against this RP that means + # there are devices removed from the configuration but kept in the + # tracker as they are still allocated. In this case we + # need to recover the resource class and traits from the + # existing allocation. + if len(rc_usage) == 0: + # no usage so nothing to adjust here + return + else: + # The len > 1 case should not happen for PCI RPs as we either + # track the parent PF or the child VFs there on the RP but + # never both. + self.resource_class = list(rc_usage.keys())[0] + self.traits = provider_tree.data(rp_uuid).traits + + # If device being removed but still held due to still having + # allocations then we need to adjust the total inventory to never go + # below the current usage otherwise Placement will reject the update. + usage = rc_usage[self.resource_class] + inventory = self.total + if usage > inventory: + LOG.warning( + "Needed to adjust inventories of %s on " + "resource provider %s from %d to %d due to existing " + "placement allocations. This should only happen while " + "VMs using already removed devices.", + self.resource_class, self.name, inventory, usage) + # This is counted into self.total to adjust the inventory + self.adjustment += usage - inventory def update_provider_tree( self, provider_tree: provider_tree.ProviderTree, parent_rp_name: str, + rp_rc_usage: ty.Dict[str, ty.Dict[str, int]], ) -> None: - if self.to_be_deleted: - # This means we need to delete the RP from placement if exists - if provider_tree.exists(self.name): - # NOTE(gibi): If there are allocations on this RP then - # Placement will reject the update the provider_tree is - # synced up. - provider_tree.remove(self.name) - - return - if not provider_tree.exists(self.name): # NOTE(gibi): We need to generate UUID for the new provider in Nova # instead of letting Placement assign one. We are potentially @@ -294,6 +338,14 @@ class PciResourceProvider: uuid=uuidutils.generate_uuid(dashed=True) ) + self._adjust_for_removals_and_held_devices(provider_tree, rp_rc_usage) + + # if after the adjustment no inventory left then we need to delete + # the RP explicitly + if self.total == 0: + provider_tree.remove(self.name) + return + provider_tree.update_inventory( self.name, self._get_inventories(), @@ -385,13 +437,13 @@ class PciResourceProvider: return updated def __str__(self) -> str: - if self.devs: + if not self.to_be_deleted: return ( - f"RP({self.name}, {self.resource_class}={len(self.devs)}, " + f"RP({self.name}, {self.resource_class}={self.total}, " f"traits={','.join(sorted(self.traits or set()))})" ) else: - return f"RP({self.name}, )" + return f"RP({self.name}, )" class PlacementView: @@ -425,18 +477,6 @@ class PlacementView: return self._get_rp_name_for_address(dev.parent_addr) - def _add_child( - self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str] - ) -> None: - rp_name = self._get_rp_name_for_child(dev) - self._ensure_rp(rp_name).add_child(dev, dev_spec_tags) - - def _add_parent( - self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str] - ) -> None: - rp_name = self._get_rp_name_for_address(dev.address) - self._ensure_rp(rp_name).add_parent(dev, dev_spec_tags) - def _add_dev( self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str] ) -> None: @@ -447,10 +487,11 @@ class PlacementView: # devices in placement. return + rp = self._ensure_rp_for_dev(dev) if dev.dev_type in PARENT_TYPES: - self._add_parent(dev, dev_spec_tags) + rp.add_parent(dev, dev_spec_tags) elif dev.dev_type in CHILD_TYPES: - self._add_child(dev, dev_spec_tags) + rp.add_child(dev, dev_spec_tags) else: msg = _( "Unhandled PCI device type %(type)s for %(dev)s. Please " @@ -461,51 +502,92 @@ class PlacementView: } raise exception.PlacementPciException(error=msg) - def _remove_child(self, dev: pci_device.PciDevice) -> None: - rp_name = self._get_rp_name_for_child(dev) - self._ensure_rp(rp_name).remove_child(dev) - - def _remove_parent(self, dev: pci_device.PciDevice) -> None: - rp_name = self._get_rp_name_for_address(dev.address) - self._ensure_rp(rp_name).remove_parent(dev) - def _remove_dev(self, dev: pci_device.PciDevice) -> None: """Remove PCI devices from Placement that existed before but now deleted from the hypervisor or unlisted from [pci]device_spec """ + rp = self._ensure_rp_for_dev(dev) if dev.dev_type in PARENT_TYPES: - self._remove_parent(dev) + rp.remove_parent(dev) elif dev.dev_type in CHILD_TYPES: - self._remove_child(dev) + rp.remove_child(dev) + + def _ensure_rp_for_dev( + self, dev: pci_device.PciDevice + ) -> PciResourceProvider: + """Ensures that the RP exists for the device and returns it + but does not do any inventory accounting for the given device on + the RP. + """ + if dev.dev_type in PARENT_TYPES: + rp_name = self._get_rp_name_for_address(dev.address) + return self._ensure_rp(rp_name) + elif dev.dev_type in CHILD_TYPES: + rp_name = self._get_rp_name_for_child(dev) + return self._ensure_rp(rp_name) + else: + raise ValueError( + f"Unhandled PCI device type {dev.dev_type} " + f"for dev {dev.address}.") def process_dev( self, dev: pci_device.PciDevice, dev_spec: ty.Optional[devspec.PciDeviceSpec], ) -> None: + # NOTE(gibi): We never observer dev.status DELETED as when that is set + # the device is also removed from the PCI tracker. So we can ignore + # that state. + if dev.status == fields.PciDeviceStatus.REMOVED: + # NOTE(gibi): We need to handle the situation when an instance + # uses a device where a dev_spec is removed. Here we need to keep + # the device in the Placement view similarly how the PCI tracker + # does it. + # However, we also need to handle the situation when such VM is + # being deleted. In that case we are called after the dev is freed + # and marked as removed by the tracker so dev.instance_uuid is + # None and dev.status is REMOVED. At this point the Placement + # allocation for this dev is still not deleted so we still have to + # keep the device in our view. The device will be deleted when the + # PCI tracker is saved which happens after us. + # However, we cannot overly eagerly keep devices here as a + # device in REMOVED state might be a device that had no allocation + # in Placement so it can be removed already without waiting for + # the next periodic update when the device disappears from the + # PCI tracker's list. If we are over eagerly keeping such device + # when it is not allocated then that will prevent a single step + # reconfiguration from whitelisting a VF to whitelisting its + # parent PF, because the VF will be kept at restart and conflict + # with the PF being added. - if dev.status in ( - fields.PciDeviceStatus.DELETED, - fields.PciDeviceStatus.REMOVED, - ): - # If the PCI tracker marked the device DELETED or REMOVED then - # such device is not allocated, so we are free to drop it from - # placement too. + # We choose to remove these devs so the happy path of removing + # not allocated devs is simple. And then we do an extra + # step later in update_provider_tree to reconcile Placement + # allocations with our view and add back some inventories to handle + # removed but allocated devs. self._remove_dev(dev) else: if not dev_spec: if dev.instance_uuid: LOG.warning( "Device spec is not found for device %s in " - "[pci]device_spec. We are skipping this devices " - "during Placement update. The device is allocated by " - "%s. You should not remove an allocated device from " - "the configuration. Please restore the configuration " - "or cold migrate the instance to resolve the " - "inconsistency.", + "[pci]device_spec. The device is allocated by " + "%s. We are keeping this device in the Placement " + "view. You should not remove an allocated device from " + "the configuration. Please restore the configuration. " + "If you cannot restore the configuration as the " + "device is dead then delete or cold migrate the " + "instance and then restart the nova-compute service " + "to resolve the inconsistency.", dev.address, dev.instance_uuid ) + # We need to keep the RP, but we cannot just use _add_dev + # to generate the inventory on the RP as that would require + # to know the dev_spec to e.g. have the RC. So we only + # ensure that the RP exists, the inventory will be adjusted + # based on the existing allocation in a later step. + self._ensure_rp_for_dev(dev) else: LOG.warning( "Device spec is not found for device %s in " @@ -525,11 +607,50 @@ class PlacementView: f"{', '.join(str(rp) for rp in self.rps.values())}" ) - def update_provider_tree( + @staticmethod + def get_usage_per_rc_and_rp( + allocations + ) -> ty.Dict[str, ty.Dict[str, int]]: + """Returns a dict keyed by RP uuid and the value is a dict of + resource class: usage pairs telling how much total usage the given RP + has from the given resource class across all the allocations. + """ + rp_rc_usage: ty.Dict[str, ty.Dict[str, int]] = ( + collections.defaultdict(lambda: collections.defaultdict(int))) + for consumer in allocations.values(): + for rp_uuid, alloc in consumer["allocations"].items(): + for rc, amount in alloc["resources"].items(): + rp_rc_usage[rp_uuid][rc] += amount + + return rp_rc_usage + + def _remove_managed_rps_from_tree_not_in_view( self, provider_tree: provider_tree.ProviderTree ) -> None: + """Removes PCI RPs from the provider_tree that are not present in the + current PlacementView. + """ + rp_names_in_view = {rp.name for rp in self.rps.values()} + uuids_in_tree = provider_tree.get_provider_uuids_in_tree( + self.root_rp_name) + for rp_uuid in uuids_in_tree: + rp_data = provider_tree.data(rp_uuid) + is_pci_rp = provider_tree.has_traits( + rp_uuid, [os_traits.COMPUTE_MANAGED_PCI_DEVICE]) + if is_pci_rp and rp_data.name not in rp_names_in_view: + provider_tree.remove(rp_uuid) + + def update_provider_tree( + self, + provider_tree: provider_tree.ProviderTree, + allocations: dict, + ) -> None: + self._remove_managed_rps_from_tree_not_in_view(provider_tree) + + rp_rc_usage = self.get_usage_per_rc_and_rp(allocations) for rp_name, rp in self.rps.items(): - rp.update_provider_tree(provider_tree, self.root_rp_name) + rp.update_provider_tree( + provider_tree, self.root_rp_name, rp_rc_usage) def update_allocations( self, @@ -639,9 +760,10 @@ def update_provider_tree_for_pci( dev_spec = pci_tracker.dev_filter.get_devspec(dev) pv.process_dev(dev, dev_spec) + pv.update_provider_tree(provider_tree, allocations) + LOG.info("Placement PCI resource view: %s", pv) - pv.update_provider_tree(provider_tree) old_alloc = copy.deepcopy(allocations) # update_provider_tree correlated the PciDevice objects with RPs in # placement and recorded the RP UUID in the PciDevice object. We need to diff --git a/nova/tests/functional/libvirt/test_pci_in_placement.py b/nova/tests/functional/libvirt/test_pci_in_placement.py index 14557d1bfb..1e04b02a8d 100644 --- a/nova/tests/functional/libvirt/test_pci_in_placement.py +++ b/nova/tests/functional/libvirt/test_pci_in_placement.py @@ -23,7 +23,6 @@ from oslo_serialization import jsonutils from nova import exception from nova.tests.fixtures import libvirt as fakelibvirt -from nova.tests.functional.api import client from nova.tests.functional.libvirt import test_pci_sriov_servers CONF = cfg.CONF @@ -42,12 +41,16 @@ WARN_PCI_TRACKER_HELD_DEVICE = ( "this warning.") WARN_PCI_PLACEMENT_HELD_DEVICE = ( - "WARNING [nova.compute.pci_placement_translator] Device spec is " - "not found for device %s in [pci]device_spec. We are " - "skipping this devices during Placement update. The device is " - "allocated by %s. You should not remove an allocated device from " - "the configuration. Please restore the configuration or cold " - "migrate the instance to resolve the inconsistency." + "WARNING [nova.compute.pci_placement_translator] " + "Device spec is not found for device %s in " + "[pci]device_spec. The device is allocated by " + "%s. We are keeping this device in the Placement " + "view. You should not remove an allocated device from " + "the configuration. Please restore the configuration. " + "If you cannot restore the configuration as the " + "device is dead then delete or cold migrate the " + "instance and then restart the nova-compute service " + "to resolve the inconsistency." ) @@ -799,6 +802,34 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): self.stdlog.logger.output, ) + # Now delete the service as the warning suggested. It should work. + self._delete_server(server) + + # The allocation successfully removed + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": { + "CUSTOM_PCI_8086_1528": 0, + } + } + compute1_expected_placement_view["allocations"].pop(server["id"]) + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + + self.stdlog.delete_stored_logs() + self.restart_compute_service(hostname="compute1") + + # The next compute restart won't trigger any warning + self.assertNotIn( + "WARNING [nova.compute.pci_placement_translator] ", + self.stdlog.logger.output, + ) + # And the device is now removed from Placement + compute1_expected_placement_view["inventories"].pop("0000:81:00.0") + compute1_expected_placement_view["traits"].pop("0000:81:00.0") + compute1_expected_placement_view["usages"].pop("0000:81:00.0") + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + def test_device_reconfiguration_with_allocations_config_change_stop(self): self._create_one_compute_with_a_pf_consumed_by_an_instance() @@ -892,28 +923,48 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.0", server['id']), self.stdlog.logger.output, ) - - self.stdlog.delete_stored_logs() - # Delete the server as the warning suggests. Unfortunately the deletion - # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905 - ex = self.assertRaises( - client.OpenStackApiException, self._delete_server, server) - self.assertIn("Unexpected API Error. Please report this", str(ex)) - - # The sever delete fails as nova tries to delete the RP while it still - # has allocations. - self.assertRegex( + # no placement error is reported + self.assertNotRegex( self.stdlog.logger.output, "ERROR .nova.scheduler.client.report..*Failed to delete " "resource provider with UUID.*from the placement API. " "Got 409.*Unable to delete resource provider.*Resource " "provider has allocations.") - # The instance is put into ERROR state. - server = self.api.get_server(server['id']) - self.assertEqual(server['status'], 'ERROR') + self.stdlog.delete_stored_logs() + # the deletion succeeds + self._delete_server(server) + # no placement error is reported + self.assertNotRegex( + self.stdlog.logger.output, + "ERROR .nova.scheduler.client.report..*Failed to delete " + "resource provider with UUID.*from the placement API. " + "Got 409.*Unable to delete resource provider.*Resource " + "provider has allocations.") - # And the allocation is not removed. + # The allocation is removed from placement + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": { + self.PF_RC: 0, + } + } + compute1_expected_placement_view["allocations"].pop(server["id"]) + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) + + # The PCI device is removed from the PCI tracker when the device is + # freed, but not from Placement yet because during VM deletion the + # resource tracker updated before the Placement allocation is dropped + # so we cannot drop the Placement inventory during delete. + self.assertPCIDeviceCounts("compute1", total=0, free=0) + + # We need a periodics run to trigger the deletion of the device in + # Placement + self._run_periodics() + + compute1_expected_placement_view["inventories"].pop("0000:81:00.0") + compute1_expected_placement_view["traits"].pop("0000:81:00.0") + compute1_expected_placement_view["usages"].pop("0000:81:00.0") self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) @@ -1002,26 +1053,40 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): self.stdlog.logger.output, ) - # Delete the server as the warning suggests. Unfortunately the deletion - # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905 - ex = self.assertRaises( - client.OpenStackApiException, self._delete_server, server) - self.assertIn("Unexpected API Error. Please report this", str(ex)) - - # The sever delete fails as nova tries to delete the RP while it still - # has allocations. - self.assertRegex( + self.stdlog.delete_stored_logs() + # the deletion succeeds + self._delete_server(server) + # no placement error is reported + self.assertNotRegex( self.stdlog.logger.output, "ERROR .nova.scheduler.client.report..*Failed to delete " "resource provider with UUID.*from the placement API. " "Got 409.*Unable to delete resource provider.*Resource " "provider has allocations.") - # The instance is put into ERROR state. - server = self.api.get_server(server['id']) - self.assertEqual(server['status'], 'ERROR') + # The allocation is removed from placement + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": { + self.VF_RC: 0, + } + } + compute1_expected_placement_view["allocations"].pop(server["id"]) + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) - # And the allocation is not removed. + # The PCI device is removed from the PCI tracker when the device is + # freed, but not from Placement yet because during VM deletion the + # resource tracker updated before the Placement allocation is dropped + # so we cannot drop the Placement inventory during delete. + self.assertPCIDeviceCounts("compute1", total=0, free=0) + + # We need a periodics run to trigger the deletion of the device in + # Placement + self._run_periodics() + + compute1_expected_placement_view["inventories"].pop("0000:81:00.0") + compute1_expected_placement_view["traits"].pop("0000:81:00.0") + compute1_expected_placement_view["usages"].pop("0000:81:00.0") self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) @@ -1033,14 +1098,17 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): # remove 0000:81:00.* VFs from the device spec and restart the compute device_spec = self._to_list_of_json_str([]) self.flags(group='pci', device_spec=device_spec) + self.restart_compute_service(hostname="compute1") # One of the VFs is used but all of them is removed from the config. # The PciTracker warns but keeps the allocated device so the placement # logic mimic this and only warns but keeps the RP and the allocation # in placement intact. - self.restart_compute_service(hostname="compute1") - self.assert_placement_pci_view( - "compute1", **compute1_expected_placement_view) - self._run_periodics() + # The non allocated VF is removed while the allocated one is kept + compute1_expected_placement_view["inventories"] = { + "0000:81:00.0": { + self.VF_RC: 1 + }, + } self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) # the warning from the PciTracker @@ -1048,41 +1116,54 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests): WARN_PCI_TRACKER_HELD_DEVICE % (server['id'], "1:0000:81:00.2"), self.stdlog.logger.output, ) - # the warning from the placement PCI tracking logic + # two warnings from the placement PCI tracking logic self.assertIn( WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.2", server['id']), self.stdlog.logger.output, ) - # however there is an error as well as nova tries to remove the - # RP that has allocations this already a signal of the bug - # https://bugs.launchpad.net/nova/+bug/2115905 - self.assertRegex( + self.assertIn( + "WARNING [nova.compute.pci_placement_translator] " + "Needed to adjust inventories of CUSTOM_PCI_8086_1515 on " + "resource provider compute1_0000:81:00.0 from 0 to 1 due to " + "existing placement allocations. This should only happen while " + "VMs using already removed devices.", self.stdlog.logger.output, - "ERROR .nova.scheduler.client.report..*Failed to delete " - "resource provider with UUID.*from the placement API. " - "Got 409.*Unable to delete resource provider.*Resource " - "provider has allocations.") + ) self.stdlog.delete_stored_logs() - # Delete the server as the warning suggests. Unfortunately the deletion - # fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905 - ex = self.assertRaises( - client.OpenStackApiException, self._delete_server, server) - self.assertIn("Unexpected API Error. Please report this", str(ex)) - - # We have the same RP deletion error as before. - self.assertRegex( + # the deletion succeeds + self._delete_server(server) + # no placement error is reported + self.assertNotRegex( self.stdlog.logger.output, "ERROR .nova.scheduler.client.report..*Failed to delete " "resource provider with UUID.*from the placement API. " "Got 409.*Unable to delete resource provider.*Resource " "provider has allocations.") - # The instance is put into ERROR state. - server = self.api.get_server(server['id']) - self.assertEqual(server['status'], 'ERROR') + # The allocation is removed from placement + compute1_expected_placement_view["usages"] = { + "0000:81:00.0": { + self.VF_RC: 0, + } + } + compute1_expected_placement_view["allocations"].pop(server["id"]) + self.assert_placement_pci_view( + "compute1", **compute1_expected_placement_view) - # And the allocation is not removed. + # The PCI device is removed from the PCI tracker when the device is + # freed, but not from Placement yet because during VM deletion the + # resource tracker updated before the Placement allocation is dropped + # so we cannot drop the Placement inventory during delete. + self.assertPCIDeviceCounts("compute1", total=0, free=0) + + # We need a periodics run to trigger the deletion of the device in + # Placement + self._run_periodics() + + compute1_expected_placement_view["inventories"].pop("0000:81:00.0") + compute1_expected_placement_view["traits"].pop("0000:81:00.0") + compute1_expected_placement_view["usages"].pop("0000:81:00.0") self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) diff --git a/nova/tests/unit/compute/test_pci_placement_translator.py b/nova/tests/unit/compute/test_pci_placement_translator.py index 3b2104449e..c8f0b8ab33 100644 --- a/nova/tests/unit/compute/test_pci_placement_translator.py +++ b/nova/tests/unit/compute/test_pci_placement_translator.py @@ -11,7 +11,10 @@ # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. + +import collections import ddt +import os_traits from oslo_utils.fixture import uuidsentinel as uuids from unittest import mock @@ -60,14 +63,15 @@ class TestTranslator(test.NoDBTestCase): # So we have a device but there is no spec for it pci_tracker.dev_filter.get_devspec = mock.Mock(return_value=None) pci_tracker.dev_filter.specs = [] - # we expect that the provider_tree is not touched as the device without - # spec is skipped, we assert that with the NonCallableMock - provider_tree = mock.NonCallableMock() + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) - ppt.update_provider_tree_for_pci( - provider_tree, "fake-node", pci_tracker, {}, []) + with mock.patch.object(pt, "remove", new=mock.NonCallableMock()): + ppt.update_provider_tree_for_pci( + pt, "fake-node", pci_tracker, {}, []) self.assertIn( + "WARNING [nova.compute.pci_placement_translator] " "Device spec is not found for device 0000:81:00.0 in " "[pci]device_spec. Ignoring device in Placement resource view. " "This should not happen. Please file a bug.", @@ -261,7 +265,7 @@ class TestTranslator(test.NoDBTestCase): pv._add_dev(vf, {}) pv._add_dev(pf, {}) - pv.update_provider_tree(pt) + pv.update_provider_tree(pt, {}) self.assertEqual( pt.data("fake-node_0000:71:00.0").uuid, vf.extra_info["rp_uuid"] @@ -331,7 +335,7 @@ class TestTranslator(test.NoDBTestCase): self.assertRaisesRegex(exception.PlacementPciException, 'Only.*may set one_time_use', pv._add_dev, vf2, {'one_time_use': 'false'}) - pv.update_provider_tree(pt) + pv.update_provider_tree(pt, {}) # These are both OTU, make sure we get the trait added self.assertIn('HW_PCI_ONE_TIME_USE', @@ -375,21 +379,21 @@ class TestTranslator(test.NoDBTestCase): ).inventory['CUSTOM_PCI_DEAD_BEEF'].get('reserved', 0)) # Before allocation, reserved is unset - pv.update_provider_tree(pt) + pv.update_provider_tree(pt, {}) assert_inventory(71, 0) assert_inventory(72, 0) # After allocation, reserved gets set to total (only for the device # that is used) pf.instance_uuid = uuids.instance - pv.update_provider_tree(pt) + pv.update_provider_tree(pt, {}) assert_inventory(71, 0) assert_inventory(72, 1) # After deallocation, reserved is again unchanged (i.e. never # decremented) pf.instance_uuid = None - pv.update_provider_tree(pt) + pv.update_provider_tree(pt, {}) assert_inventory(71, 0) assert_inventory(72, 1) @@ -412,3 +416,237 @@ class TestTranslator(test.NoDBTestCase): pci_tracker.stats.populate_pools_metadata_from_assigned_devices.\ assert_called_once_with() + + def _convert_defaultdict_to_dict(self, d): + if not isinstance(d, collections.defaultdict): + return d + return {k: self._convert_defaultdict_to_dict(v) for k, v in d.items()} + + def test_get_usage_per_rc_and_rp_no_allocations(self): + actual = ppt.PlacementView.get_usage_per_rc_and_rp({}) + + self.assertEqual({}, self._convert_defaultdict_to_dict(actual)) + + def test_get_usage_per_rc_and_rp_empty_consumer(self): + actual = ppt.PlacementView.get_usage_per_rc_and_rp({ + uuids.consumer1: {"allocations": {}} + }) + + self.assertEqual({}, self._convert_defaultdict_to_dict(actual)) + + def test_get_usage_per_rc_and_rp(self): + allocations = { + uuids.consumer1: { + "allocations": { + uuids.rp1: { + "resources": { + "RC1": 1, + "RC2": 3, + }, + }, + uuids.rp2: { + "resources": { + "RC2": 5, + "RC3": 1, + }, + }, + }, + }, + uuids.consumer2: { + "allocations": { + uuids.rp2: { + "resources": { + "RC2": 1, + "RC3": 3, + }, + }, + uuids.rp3: { + "resources": { + "RC1": 1, + }, + }, + }, + }, + uuids.consumer3: { + "allocations": { + uuids.rp1: { + "resources": { + "RC2": 3, + }, + }, + }, + }, + } + actual = ppt.PlacementView.get_usage_per_rc_and_rp(allocations) + + expected = { + uuids.rp1: { + "RC1": 1, # only from consumer1 + "RC2": 6, # from consumer1 (3) + consumer3 (3) + }, + uuids.rp2: { + "RC2": 6, # from consumer1 (5) + consumer2 (1) + "RC3": 4, # from consumer1 (1) + consumer2 (3) + }, + uuids.rp3: { + "RC1": 1 # only from consumer2 + }, + } + self.assertEqual(expected, self._convert_defaultdict_to_dict(actual)) + + def test_remove_managed_rps_empty_view(self): + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci) + + pv = ppt.PlacementView("fake-node", []) + pv._remove_managed_rps_from_tree_not_in_view(pt) + + # No changes expected in the tree + self.assertTrue(pt.exists("fake-node")) + self.assertTrue(pt.exists("no-pci")) + + def test_remove_managed_rps_new_rp_in_view(self): + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci) + + pv = ppt.PlacementView("fake-node", []) + pf = pci_device.PciDevice( + address="0000:72:00.0", + parent_addr=None, + dev_type=fields.PciDeviceType.SRIOV_PF, + vendor_id="dead", + product_id="beef", + status=fields.PciDeviceStatus.AVAILABLE, + ) + + pv.process_dev(pf, devspec.PciDeviceSpec({})) + pv._remove_managed_rps_from_tree_not_in_view(pt) + + # No RPs is removed + self.assertTrue(pt.exists("fake-node")) + self.assertTrue(pt.exists("no-pci")) + + def test_remove_managed_rps_no_rp_change(self): + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci) + pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci) + pt.add_traits( + "fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE) + + pv = ppt.PlacementView("fake-node", []) + pf = pci_device.PciDevice( + address="0000:72:00.0", + parent_addr=None, + dev_type=fields.PciDeviceType.SRIOV_PF, + vendor_id="dead", + product_id="beef", + status=fields.PciDeviceStatus.AVAILABLE, + ) + + pv.process_dev(pf, devspec.PciDeviceSpec({})) + pv._remove_managed_rps_from_tree_not_in_view(pt) + + # The existing PCI RP is kept as it exists in the View as well + self.assertTrue(pt.exists("fake-node")) + self.assertTrue(pt.exists("no-pci")) + self.assertTrue(pt.exists("fake-node_0000:72:00.0")) + + def test_remove_managed_rps_rp_removed_from_view(self): + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci) + pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci) + pt.add_traits( + "fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE) + + pv = ppt.PlacementView("fake-node", []) + pv._remove_managed_rps_from_tree_not_in_view(pt) + + self.assertTrue(pt.exists("fake-node")) + self.assertTrue(pt.exists("no-pci")) + # The PCI RP is removed as it is not in the View anymore + self.assertFalse(pt.exists("fake-node_0000:72:00.0")) + + def test_adjust_for_removals_no_allocation_no_adjustment(self): + rp = ppt.PciResourceProvider("pci") + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("pci", uuids.compute_rp, uuids.pci) + usage = {uuids.pci: {}} + + rp._adjust_for_removals_and_held_devices(pt, usage) + + self.assertEqual(0, rp.adjustment) + + def test_adjust_for_removals_allocated_configured_no_adjustment(self): + rp = ppt.PciResourceProvider("fake-node_0000:72:00.0") + pf = pci_device.PciDevice( + address="0000:72:00.0", + parent_addr=None, + dev_type=fields.PciDeviceType.SRIOV_PF, + vendor_id="dead", + product_id="beef", + status=fields.PciDeviceStatus.ALLOCATED, + ) + rp.add_parent(pf, {}) + + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci) + + usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}} + + rp._adjust_for_removals_and_held_devices(pt, usage) + + self.assertEqual(0, rp.adjustment) + self.assertEqual(1, rp.total) + + def test_adjust_for_removals_allocated_removed(self): + rp = ppt.PciResourceProvider("fake-node_0000:72:00.0") + vf = pci_device.PciDevice( + address="0000:72:00.1", + parent_addr="0000:72:00.0", + dev_type=fields.PciDeviceType.SRIOV_VF, + vendor_id="dead", + product_id="beef", + status=fields.PciDeviceStatus.ALLOCATED, + ) + rp.add_child(vf, {}) + + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci) + + # We have two allocations but only one VF left so one VF inventory + # is missing. We expect an adjustment for that + usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 2}} + + rp._adjust_for_removals_and_held_devices(pt, usage) + + self.assertEqual(1, len(rp.devs)) + self.assertEqual(1, rp.adjustment) + self.assertEqual(2, rp.total) + self.assertIn( + "Needed to adjust inventories", self.stdlog.logger.output) + + def test_adjust_for_removals_allocated_removed_no_inventory(self): + rp = ppt.PciResourceProvider("fake-node_0000:72:00.0") + + pt = provider_tree.ProviderTree() + pt.new_root("fake-node", uuids.compute_rp) + pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci) + + # One allocation exists but no inventory at all. The RC will be + # inferred from the allocation and the inventory is adjusted + usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}} + + rp._adjust_for_removals_and_held_devices(pt, usage) + + self.assertEqual(0, len(rp.devs)) + self.assertEqual(1, rp.adjustment) + self.assertEqual(1, rp.total) + self.assertIn( + "Needed to adjust inventories", self.stdlog.logger.output)