Merge "[pci]Keep used dev in Placement regardless of dev_spec"

This commit is contained in:
Zuul
2025-09-05 15:36:30 +00:00
committed by Gerrit Code Review
3 changed files with 569 additions and 128 deletions
+180 -58
View File
@@ -136,14 +136,24 @@ class PciResourceProvider:
self.resource_class: ty.Optional[str] = None
self.traits: ty.Optional[ty.Set[str]] = None
self.is_otu = False
# This is an adjustment to the total inventory that is normally derived
# from the tracked devices. It is needed because devices can be held in
# the tracker even though they are removed from the configuration, as
# they still have allocations.
# This number will be calculated based on the existing allocations
# during the update_provider_tree call.
self.adjustment = 0
@property
def devs(self) -> ty.List[pci_device.PciDevice]:
    """Return the devices accounted on this resource provider.

    When a parent device is set it is returned as a single element list,
    otherwise the list of child devices is returned.
    """
    if self.parent_dev:
        return [self.parent_dev]
    return self.children_devs
@property
def total(self):
    """Return the inventory total: tracked devices plus the adjustment."""
    tracked = len(self.devs)
    return tracked + self.adjustment
@property
def to_be_deleted(self):
return not bool(self.devs)
return self.total == 0
def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
if self.parent_dev:
@@ -242,8 +252,8 @@ class PciResourceProvider:
# one_time_use=true flag, but otherwise the operator controls
# reserved and nova will not override that value periodically.
inventory = {
"total": len(self.devs),
"max_unit": len(self.devs),
"total": self.total,
"max_unit": self.total,
}
self._handle_one_time_use(inventory)
@@ -260,28 +270,62 @@ class PciResourceProvider:
# If we are an allocated parent device, and our one-time-use flag
# is set, we need to also set our inventory to reserved.
# NOTE(danms): VERY IMPORTANT: we never *ever* want to update
# reserved to anything other than len(self.devs), and definitely
# reserved to anything other than self.total, and definitely
# not if we are not allocated. These devices are intended to go
# from unallocated to allocated AND reserved. They may be
# unreserved by an external entity, but never nova.
inventory['reserved'] = len(self.devs)
inventory['reserved'] = self.total
def _adjust_for_removals_and_held_devices(
self,
provider_tree: provider_tree.ProviderTree,
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]],
) -> None:
rp_uuid = provider_tree.data(self.name).uuid
rc_usage = rp_rc_usage[rp_uuid]
if not self.resource_class:
# The resource_class is undefined when there are no normal devices
# exists any more on this RP. If no normal devs exists then there
# is no device_spec to derive the RC and traits from. But if we
# still have allocations in placement against this RP that means
# there are devices removed from the configuration but kept in the
# tracker as they are still allocated. In this case we
# need to recover the resource class and traits from the
# existing allocation.
if len(rc_usage) == 0:
# no usage so nothing to adjust here
return
else:
# The len > 1 case should not happen for PCI RPs as we either
# track the parent PF or the child VFs there on the RP but
# never both.
self.resource_class = list(rc_usage.keys())[0]
self.traits = provider_tree.data(rp_uuid).traits
# If device being removed but still held due to still having
# allocations then we need to adjust the total inventory to never go
# below the current usage otherwise Placement will reject the update.
usage = rc_usage[self.resource_class]
inventory = self.total
if usage > inventory:
LOG.warning(
"Needed to adjust inventories of %s on "
"resource provider %s from %d to %d due to existing "
"placement allocations. This should only happen while "
"VMs using already removed devices.",
self.resource_class, self.name, inventory, usage)
# This is counted into self.total to adjust the inventory
self.adjustment += usage - inventory
def update_provider_tree(
self,
provider_tree: provider_tree.ProviderTree,
parent_rp_name: str,
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]],
) -> None:
if self.to_be_deleted:
# This means we need to delete the RP from placement if exists
if provider_tree.exists(self.name):
# NOTE(gibi): If there are allocations on this RP then
# Placement will reject the update when the provider_tree is
# synced up.
provider_tree.remove(self.name)
return
if not provider_tree.exists(self.name):
# NOTE(gibi): We need to generate UUID for the new provider in Nova
# instead of letting Placement assign one. We are potentially
@@ -294,6 +338,14 @@ class PciResourceProvider:
uuid=uuidutils.generate_uuid(dashed=True)
)
self._adjust_for_removals_and_held_devices(provider_tree, rp_rc_usage)
# if after the adjustment no inventory left then we need to delete
# the RP explicitly
if self.total == 0:
provider_tree.remove(self.name)
return
provider_tree.update_inventory(
self.name,
self._get_inventories(),
@@ -385,13 +437,13 @@ class PciResourceProvider:
return updated
def __str__(self) -> str:
if self.devs:
if not self.to_be_deleted:
return (
f"RP({self.name}, {self.resource_class}={len(self.devs)}, "
f"RP({self.name}, {self.resource_class}={self.total}, "
f"traits={','.join(sorted(self.traits or set()))})"
)
else:
return f"RP({self.name}, <EMPTY>)"
return f"RP({self.name}, <to be deleted>)"
class PlacementView:
@@ -425,18 +477,6 @@ class PlacementView:
return self._get_rp_name_for_address(dev.parent_addr)
def _add_child(
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
) -> None:
rp_name = self._get_rp_name_for_child(dev)
self._ensure_rp(rp_name).add_child(dev, dev_spec_tags)
def _add_parent(
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
) -> None:
rp_name = self._get_rp_name_for_address(dev.address)
self._ensure_rp(rp_name).add_parent(dev, dev_spec_tags)
def _add_dev(
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
) -> None:
@@ -447,10 +487,11 @@ class PlacementView:
# devices in placement.
return
rp = self._ensure_rp_for_dev(dev)
if dev.dev_type in PARENT_TYPES:
self._add_parent(dev, dev_spec_tags)
rp.add_parent(dev, dev_spec_tags)
elif dev.dev_type in CHILD_TYPES:
self._add_child(dev, dev_spec_tags)
rp.add_child(dev, dev_spec_tags)
else:
msg = _(
"Unhandled PCI device type %(type)s for %(dev)s. Please "
@@ -461,51 +502,92 @@ class PlacementView:
}
raise exception.PlacementPciException(error=msg)
def _remove_child(self, dev: pci_device.PciDevice) -> None:
rp_name = self._get_rp_name_for_child(dev)
self._ensure_rp(rp_name).remove_child(dev)
def _remove_parent(self, dev: pci_device.PciDevice) -> None:
rp_name = self._get_rp_name_for_address(dev.address)
self._ensure_rp(rp_name).remove_parent(dev)
def _remove_dev(self, dev: pci_device.PciDevice) -> None:
"""Remove PCI devices from Placement that existed before but now
deleted from the hypervisor or unlisted from [pci]device_spec
"""
rp = self._ensure_rp_for_dev(dev)
if dev.dev_type in PARENT_TYPES:
self._remove_parent(dev)
rp.remove_parent(dev)
elif dev.dev_type in CHILD_TYPES:
self._remove_child(dev)
rp.remove_child(dev)
def _ensure_rp_for_dev(
    self, dev: pci_device.PciDevice
) -> PciResourceProvider:
    """Return the RP belonging to the device, creating it if needed.

    Only the existence of the RP is ensured here; no inventory accounting
    is done for the given device on the RP.

    :raises ValueError: if the device type is neither a parent nor a child
        type
    """
    if dev.dev_type in PARENT_TYPES:
        # Parent devices have their RP named after their own address.
        return self._ensure_rp(self._get_rp_name_for_address(dev.address))

    if dev.dev_type in CHILD_TYPES:
        return self._ensure_rp(self._get_rp_name_for_child(dev))

    raise ValueError(
        f"Unhandled PCI device type {dev.dev_type} "
        f"for dev {dev.address}.")
def process_dev(
self,
dev: pci_device.PciDevice,
dev_spec: ty.Optional[devspec.PciDeviceSpec],
) -> None:
# NOTE(gibi): We never observe dev.status DELETED as when that is set
# the device is also removed from the PCI tracker. So we can ignore
# that state.
if dev.status == fields.PciDeviceStatus.REMOVED:
# NOTE(gibi): We need to handle the situation when an instance
# uses a device where a dev_spec is removed. Here we need to keep
# the device in the Placement view similarly how the PCI tracker
# does it.
# However, we also need to handle the situation when such VM is
# being deleted. In that case we are called after the dev is freed
# and marked as removed by the tracker so dev.instance_uuid is
# None and dev.status is REMOVED. At this point the Placement
# allocation for this dev is still not deleted so we still have to
# keep the device in our view. The device will be deleted when the
# PCI tracker is saved which happens after us.
# However, we cannot overly eagerly keep devices here as a
# device in REMOVED state might be a device that had no allocation
# in Placement so it can be removed already without waiting for
# the next periodic update when the device disappears from the
# PCI tracker's list. If we are over eagerly keeping such device
# when it is not allocated then that will prevent a single step
# reconfiguration from whitelisting a VF to whitelisting its
# parent PF, because the VF will be kept at restart and conflict
# with the PF being added.
if dev.status in (
fields.PciDeviceStatus.DELETED,
fields.PciDeviceStatus.REMOVED,
):
# If the PCI tracker marked the device DELETED or REMOVED then
# such device is not allocated, so we are free to drop it from
# placement too.
# We choose to remove these devs so the happy path of removing
# not allocated devs is simple. And then we do an extra
# step later in update_provider_tree to reconcile Placement
# allocations with our view and add back some inventories to handle
# removed but allocated devs.
self._remove_dev(dev)
else:
if not dev_spec:
if dev.instance_uuid:
LOG.warning(
"Device spec is not found for device %s in "
"[pci]device_spec. We are skipping this devices "
"during Placement update. The device is allocated by "
"%s. You should not remove an allocated device from "
"the configuration. Please restore the configuration "
"or cold migrate the instance to resolve the "
"inconsistency.",
"[pci]device_spec. The device is allocated by "
"%s. We are keeping this device in the Placement "
"view. You should not remove an allocated device from "
"the configuration. Please restore the configuration. "
"If you cannot restore the configuration as the "
"device is dead then delete or cold migrate the "
"instance and then restart the nova-compute service "
"to resolve the inconsistency.",
dev.address,
dev.instance_uuid
)
# We need to keep the RP, but we cannot just use _add_dev
# to generate the inventory on the RP as that would require
# to know the dev_spec to e.g. have the RC. So we only
# ensure that the RP exists, the inventory will be adjusted
# based on the existing allocation in a later step.
self._ensure_rp_for_dev(dev)
else:
LOG.warning(
"Device spec is not found for device %s in "
@@ -525,11 +607,50 @@ class PlacementView:
f"{', '.join(str(rp) for rp in self.rps.values())}"
)
def update_provider_tree(
@staticmethod
def get_usage_per_rc_and_rp(
allocations
) -> ty.Dict[str, ty.Dict[str, int]]:
"""Returns a dict keyed by RP uuid and the value is a dict of
resource class: usage pairs telling how much total usage the given RP
has from the given resource class across all the allocations.
"""
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]] = (
collections.defaultdict(lambda: collections.defaultdict(int)))
for consumer in allocations.values():
for rp_uuid, alloc in consumer["allocations"].items():
for rc, amount in alloc["resources"].items():
rp_rc_usage[rp_uuid][rc] += amount
return rp_rc_usage
def _remove_managed_rps_from_tree_not_in_view(
    self, provider_tree: provider_tree.ProviderTree
) -> None:
    """Drop the PCI RPs from the provider_tree that the current
    PlacementView does not know about.

    Only providers under the root RP that carry the
    COMPUTE_MANAGED_PCI_DEVICE trait are considered; every other provider
    is left alone.
    """
    known_names = set()
    for view_rp in self.rps.values():
        known_names.add(view_rp.name)

    candidate_uuids = provider_tree.get_provider_uuids_in_tree(
        self.root_rp_name)

    for candidate_uuid in candidate_uuids:
        candidate = provider_tree.data(candidate_uuid)
        managed_by_pci = provider_tree.has_traits(
            candidate_uuid, [os_traits.COMPUTE_MANAGED_PCI_DEVICE])
        if not managed_by_pci:
            continue
        if candidate.name in known_names:
            continue
        provider_tree.remove(candidate_uuid)
def update_provider_tree(
self,
provider_tree: provider_tree.ProviderTree,
allocations: dict,
) -> None:
self._remove_managed_rps_from_tree_not_in_view(provider_tree)
rp_rc_usage = self.get_usage_per_rc_and_rp(allocations)
for rp_name, rp in self.rps.items():
rp.update_provider_tree(provider_tree, self.root_rp_name)
rp.update_provider_tree(
provider_tree, self.root_rp_name, rp_rc_usage)
def update_allocations(
self,
@@ -639,9 +760,10 @@ def update_provider_tree_for_pci(
dev_spec = pci_tracker.dev_filter.get_devspec(dev)
pv.process_dev(dev, dev_spec)
pv.update_provider_tree(provider_tree, allocations)
LOG.info("Placement PCI resource view: %s", pv)
pv.update_provider_tree(provider_tree)
old_alloc = copy.deepcopy(allocations)
# update_provider_tree correlated the PciDevice objects with RPs in
# placement and recorded the RP UUID in the PciDevice object. We need to
@@ -23,7 +23,6 @@ from oslo_serialization import jsonutils
from nova import exception
from nova.tests.fixtures import libvirt as fakelibvirt
from nova.tests.functional.api import client
from nova.tests.functional.libvirt import test_pci_sriov_servers
CONF = cfg.CONF
@@ -42,12 +41,16 @@ WARN_PCI_TRACKER_HELD_DEVICE = (
"this warning.")
WARN_PCI_PLACEMENT_HELD_DEVICE = (
"WARNING [nova.compute.pci_placement_translator] Device spec is "
"not found for device %s in [pci]device_spec. We are "
"skipping this devices during Placement update. The device is "
"allocated by %s. You should not remove an allocated device from "
"the configuration. Please restore the configuration or cold "
"migrate the instance to resolve the inconsistency."
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device %s in "
"[pci]device_spec. The device is allocated by "
"%s. We are keeping this device in the Placement "
"view. You should not remove an allocated device from "
"the configuration. Please restore the configuration. "
"If you cannot restore the configuration as the "
"device is dead then delete or cold migrate the "
"instance and then restart the nova-compute service "
"to resolve the inconsistency."
)
@@ -799,6 +802,34 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
self.stdlog.logger.output,
)
# Now delete the server as the warning suggested. It should work.
self._delete_server(server)
# The allocation successfully removed
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
"CUSTOM_PCI_8086_1528": 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
self.stdlog.delete_stored_logs()
self.restart_compute_service(hostname="compute1")
# The next compute restart won't trigger any warning
self.assertNotIn(
"WARNING [nova.compute.pci_placement_translator] ",
self.stdlog.logger.output,
)
# And the device is now removed from Placement
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
def test_device_reconfiguration_with_allocations_config_change_stop(self):
self._create_one_compute_with_a_pf_consumed_by_an_instance()
@@ -892,28 +923,48 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.0", server['id']),
self.stdlog.logger.output,
)
self.stdlog.delete_stored_logs()
# Delete the server as the warning suggests. Unfortunately the deletion
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The sever delete fails as nova tries to delete the RP while it still
# has allocations.
self.assertRegex(
# no placement error is reported
self.assertNotRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# The instance is put into ERROR state.
server = self.api.get_server(server['id'])
self.assertEqual(server['status'], 'ERROR')
self.stdlog.delete_stored_logs()
# the deletion succeeds
self._delete_server(server)
# no placement error is reported
self.assertNotRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# And the allocation is not removed.
# The allocation is removed from placement
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
self.PF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# The PCI device is removed from the PCI tracker when the device is
# freed, but not from Placement yet because during VM deletion the
# resource tracker updated before the Placement allocation is dropped
# so we cannot drop the Placement inventory during delete.
self.assertPCIDeviceCounts("compute1", total=0, free=0)
# We need a periodics run to trigger the deletion of the device in
# Placement
self._run_periodics()
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
@@ -1002,26 +1053,40 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
self.stdlog.logger.output,
)
# Delete the server as the warning suggests. Unfortunately the deletion
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# The sever delete fails as nova tries to delete the RP while it still
# has allocations.
self.assertRegex(
self.stdlog.delete_stored_logs()
# the deletion succeeds
self._delete_server(server)
# no placement error is reported
self.assertNotRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# The instance is put into ERROR state.
server = self.api.get_server(server['id'])
self.assertEqual(server['status'], 'ERROR')
# The allocation is removed from placement
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
self.VF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# And the allocation is not removed.
# The PCI device is removed from the PCI tracker when the device is
# freed, but not from Placement yet because during VM deletion the
# resource tracker updated before the Placement allocation is dropped
# so we cannot drop the Placement inventory during delete.
self.assertPCIDeviceCounts("compute1", total=0, free=0)
# We need a periodics run to trigger the deletion of the device in
# Placement
self._run_periodics()
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
@@ -1033,14 +1098,17 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
# remove 0000:81:00.* VFs from the device spec and restart the compute
device_spec = self._to_list_of_json_str([])
self.flags(group='pci', device_spec=device_spec)
self.restart_compute_service(hostname="compute1")
# One of the VFs is used but all of them are removed from the config.
# The PciTracker warns but keeps the allocated device so the placement
# logic mimics this and only warns but keeps the RP and the allocation
# in placement intact.
self.restart_compute_service(hostname="compute1")
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
self._run_periodics()
# The non allocated VF is removed while the allocated one is kept
compute1_expected_placement_view["inventories"] = {
"0000:81:00.0": {
self.VF_RC: 1
},
}
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# the warning from the PciTracker
@@ -1048,41 +1116,54 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
WARN_PCI_TRACKER_HELD_DEVICE % (server['id'], "1:0000:81:00.2"),
self.stdlog.logger.output,
)
# the warning from the placement PCI tracking logic
# two warnings from the placement PCI tracking logic
self.assertIn(
WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.2", server['id']),
self.stdlog.logger.output,
)
# however there is an error as well as nova tries to remove the
# RP that has allocations this already a signal of the bug
# https://bugs.launchpad.net/nova/+bug/2115905
self.assertRegex(
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Needed to adjust inventories of CUSTOM_PCI_8086_1515 on "
"resource provider compute1_0000:81:00.0 from 0 to 1 due to "
"existing placement allocations. This should only happen while "
"VMs using already removed devices.",
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
)
self.stdlog.delete_stored_logs()
# Delete the server as the warning suggests. Unfortunately the deletion
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
ex = self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
self.assertIn("Unexpected API Error. Please report this", str(ex))
# We have the same RP deletion error as before.
self.assertRegex(
# the deletion succeeds
self._delete_server(server)
# no placement error is reported
self.assertNotRegex(
self.stdlog.logger.output,
"ERROR .nova.scheduler.client.report..*Failed to delete "
"resource provider with UUID.*from the placement API. "
"Got 409.*Unable to delete resource provider.*Resource "
"provider has allocations.")
# The instance is put into ERROR state.
server = self.api.get_server(server['id'])
self.assertEqual(server['status'], 'ERROR')
# The allocation is removed from placement
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {
self.VF_RC: 0,
}
}
compute1_expected_placement_view["allocations"].pop(server["id"])
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# And the allocation is not removed.
# The PCI device is removed from the PCI tracker when the device is
# freed, but not from Placement yet because during VM deletion the
# resource tracker updated before the Placement allocation is dropped
# so we cannot drop the Placement inventory during delete.
self.assertPCIDeviceCounts("compute1", total=0, free=0)
# We need a periodics run to trigger the deletion of the device in
# Placement
self._run_periodics()
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
@@ -11,7 +11,10 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import ddt
import os_traits
from oslo_utils.fixture import uuidsentinel as uuids
from unittest import mock
@@ -60,14 +63,15 @@ class TestTranslator(test.NoDBTestCase):
# So we have a device but there is no spec for it
pci_tracker.dev_filter.get_devspec = mock.Mock(return_value=None)
pci_tracker.dev_filter.specs = []
# we expect that the provider_tree is not touched as the device without
# spec is skipped, we assert that with the NonCallableMock
provider_tree = mock.NonCallableMock()
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
ppt.update_provider_tree_for_pci(
provider_tree, "fake-node", pci_tracker, {}, [])
with mock.patch.object(pt, "remove", new=mock.NonCallableMock()):
ppt.update_provider_tree_for_pci(
pt, "fake-node", pci_tracker, {}, [])
self.assertIn(
"WARNING [nova.compute.pci_placement_translator] "
"Device spec is not found for device 0000:81:00.0 in "
"[pci]device_spec. Ignoring device in Placement resource view. "
"This should not happen. Please file a bug.",
@@ -261,7 +265,7 @@ class TestTranslator(test.NoDBTestCase):
pv._add_dev(vf, {})
pv._add_dev(pf, {})
pv.update_provider_tree(pt)
pv.update_provider_tree(pt, {})
self.assertEqual(
pt.data("fake-node_0000:71:00.0").uuid, vf.extra_info["rp_uuid"]
@@ -331,7 +335,7 @@ class TestTranslator(test.NoDBTestCase):
self.assertRaisesRegex(exception.PlacementPciException,
'Only.*may set one_time_use',
pv._add_dev, vf2, {'one_time_use': 'false'})
pv.update_provider_tree(pt)
pv.update_provider_tree(pt, {})
# These are both OTU, make sure we get the trait added
self.assertIn('HW_PCI_ONE_TIME_USE',
@@ -375,21 +379,21 @@ class TestTranslator(test.NoDBTestCase):
).inventory['CUSTOM_PCI_DEAD_BEEF'].get('reserved', 0))
# Before allocation, reserved is unset
pv.update_provider_tree(pt)
pv.update_provider_tree(pt, {})
assert_inventory(71, 0)
assert_inventory(72, 0)
# After allocation, reserved gets set to total (only for the device
# that is used)
pf.instance_uuid = uuids.instance
pv.update_provider_tree(pt)
pv.update_provider_tree(pt, {})
assert_inventory(71, 0)
assert_inventory(72, 1)
# After deallocation, reserved is again unchanged (i.e. never
# decremented)
pf.instance_uuid = None
pv.update_provider_tree(pt)
pv.update_provider_tree(pt, {})
assert_inventory(71, 0)
assert_inventory(72, 1)
@@ -412,3 +416,237 @@ class TestTranslator(test.NoDBTestCase):
pci_tracker.stats.populate_pools_metadata_from_assigned_devices.\
assert_called_once_with()
def _convert_defaultdict_to_dict(self, d):
if not isinstance(d, collections.defaultdict):
return d
return {k: self._convert_defaultdict_to_dict(v) for k, v in d.items()}
def test_get_usage_per_rc_and_rp_no_allocations(self):
actual = ppt.PlacementView.get_usage_per_rc_and_rp({})
self.assertEqual({}, self._convert_defaultdict_to_dict(actual))
def test_get_usage_per_rc_and_rp_empty_consumer(self):
actual = ppt.PlacementView.get_usage_per_rc_and_rp({
uuids.consumer1: {"allocations": {}}
})
self.assertEqual({}, self._convert_defaultdict_to_dict(actual))
def test_get_usage_per_rc_and_rp(self):
allocations = {
uuids.consumer1: {
"allocations": {
uuids.rp1: {
"resources": {
"RC1": 1,
"RC2": 3,
},
},
uuids.rp2: {
"resources": {
"RC2": 5,
"RC3": 1,
},
},
},
},
uuids.consumer2: {
"allocations": {
uuids.rp2: {
"resources": {
"RC2": 1,
"RC3": 3,
},
},
uuids.rp3: {
"resources": {
"RC1": 1,
},
},
},
},
uuids.consumer3: {
"allocations": {
uuids.rp1: {
"resources": {
"RC2": 3,
},
},
},
},
}
actual = ppt.PlacementView.get_usage_per_rc_and_rp(allocations)
expected = {
uuids.rp1: {
"RC1": 1, # only from consumer1
"RC2": 6, # from consumer1 (3) + consumer3 (3)
},
uuids.rp2: {
"RC2": 6, # from consumer1 (5) + consumer2 (1)
"RC3": 4, # from consumer1 (1) + consumer2 (3)
},
uuids.rp3: {
"RC1": 1 # only from consumer2
},
}
self.assertEqual(expected, self._convert_defaultdict_to_dict(actual))
def test_remove_managed_rps_empty_view(self):
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
pv = ppt.PlacementView("fake-node", [])
pv._remove_managed_rps_from_tree_not_in_view(pt)
# No changes expected in the tree
self.assertTrue(pt.exists("fake-node"))
self.assertTrue(pt.exists("no-pci"))
def test_remove_managed_rps_new_rp_in_view(self):
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
pv = ppt.PlacementView("fake-node", [])
pf = pci_device.PciDevice(
address="0000:72:00.0",
parent_addr=None,
dev_type=fields.PciDeviceType.SRIOV_PF,
vendor_id="dead",
product_id="beef",
status=fields.PciDeviceStatus.AVAILABLE,
)
pv.process_dev(pf, devspec.PciDeviceSpec({}))
pv._remove_managed_rps_from_tree_not_in_view(pt)
# No RP is removed
self.assertTrue(pt.exists("fake-node"))
self.assertTrue(pt.exists("no-pci"))
def test_remove_managed_rps_no_rp_change(self):
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
pt.add_traits(
"fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE)
pv = ppt.PlacementView("fake-node", [])
pf = pci_device.PciDevice(
address="0000:72:00.0",
parent_addr=None,
dev_type=fields.PciDeviceType.SRIOV_PF,
vendor_id="dead",
product_id="beef",
status=fields.PciDeviceStatus.AVAILABLE,
)
pv.process_dev(pf, devspec.PciDeviceSpec({}))
pv._remove_managed_rps_from_tree_not_in_view(pt)
# The existing PCI RP is kept as it exists in the View as well
self.assertTrue(pt.exists("fake-node"))
self.assertTrue(pt.exists("no-pci"))
self.assertTrue(pt.exists("fake-node_0000:72:00.0"))
def test_remove_managed_rps_rp_removed_from_view(self):
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
pt.add_traits(
"fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE)
pv = ppt.PlacementView("fake-node", [])
pv._remove_managed_rps_from_tree_not_in_view(pt)
self.assertTrue(pt.exists("fake-node"))
self.assertTrue(pt.exists("no-pci"))
# The PCI RP is removed as it is not in the View anymore
self.assertFalse(pt.exists("fake-node_0000:72:00.0"))
def test_adjust_for_removals_no_allocation_no_adjustment(self):
rp = ppt.PciResourceProvider("pci")
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("pci", uuids.compute_rp, uuids.pci)
usage = {uuids.pci: {}}
rp._adjust_for_removals_and_held_devices(pt, usage)
self.assertEqual(0, rp.adjustment)
def test_adjust_for_removals_allocated_configured_no_adjustment(self):
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
pf = pci_device.PciDevice(
address="0000:72:00.0",
parent_addr=None,
dev_type=fields.PciDeviceType.SRIOV_PF,
vendor_id="dead",
product_id="beef",
status=fields.PciDeviceStatus.ALLOCATED,
)
rp.add_parent(pf, {})
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}}
rp._adjust_for_removals_and_held_devices(pt, usage)
self.assertEqual(0, rp.adjustment)
self.assertEqual(1, rp.total)
def test_adjust_for_removals_allocated_removed(self):
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
vf = pci_device.PciDevice(
address="0000:72:00.1",
parent_addr="0000:72:00.0",
dev_type=fields.PciDeviceType.SRIOV_VF,
vendor_id="dead",
product_id="beef",
status=fields.PciDeviceStatus.ALLOCATED,
)
rp.add_child(vf, {})
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
# We have two allocations but only one VF left so one VF inventory
# is missing. We expect an adjustment for that
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 2}}
rp._adjust_for_removals_and_held_devices(pt, usage)
self.assertEqual(1, len(rp.devs))
self.assertEqual(1, rp.adjustment)
self.assertEqual(2, rp.total)
self.assertIn(
"Needed to adjust inventories", self.stdlog.logger.output)
def test_adjust_for_removals_allocated_removed_no_inventory(self):
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
pt = provider_tree.ProviderTree()
pt.new_root("fake-node", uuids.compute_rp)
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
# One allocation exists but no inventory at all. The RC will be
# inferred from the allocation and the inventory is adjusted
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}}
rp._adjust_for_removals_and_held_devices(pt, usage)
self.assertEqual(0, len(rp.devs))
self.assertEqual(1, rp.adjustment)
self.assertEqual(1, rp.total)
self.assertIn(
"Needed to adjust inventories", self.stdlog.logger.output)