Merge "[pci]Keep used dev in Placement regardless of dev_spec"
This commit is contained in:
@@ -136,14 +136,24 @@ class PciResourceProvider:
|
||||
self.resource_class: ty.Optional[str] = None
|
||||
self.traits: ty.Optional[ty.Set[str]] = None
|
||||
self.is_otu = False
|
||||
# This is an adjustment for the total inventory based on normal device
|
||||
# due to possibility of devices held in the tracker even though they
|
||||
# are removed from the configuration due to still having allocations.
|
||||
# This number will be calculated based on the existing allocations
|
||||
# during update_provider_tree call.
|
||||
self.adjustment = 0
|
||||
|
||||
@property
|
||||
def devs(self) -> ty.List[pci_device.PciDevice]:
|
||||
return [self.parent_dev] if self.parent_dev else self.children_devs
|
||||
|
||||
@property
|
||||
def total(self):
|
||||
return len(self.devs) + self.adjustment
|
||||
|
||||
@property
|
||||
def to_be_deleted(self):
|
||||
return not bool(self.devs)
|
||||
return self.total == 0
|
||||
|
||||
def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
|
||||
if self.parent_dev:
|
||||
@@ -242,8 +252,8 @@ class PciResourceProvider:
|
||||
# one_time_use=true flag, but otherwise the operator controls
|
||||
# reserved and nova will not override that value periodically.
|
||||
inventory = {
|
||||
"total": len(self.devs),
|
||||
"max_unit": len(self.devs),
|
||||
"total": self.total,
|
||||
"max_unit": self.total,
|
||||
}
|
||||
|
||||
self._handle_one_time_use(inventory)
|
||||
@@ -260,28 +270,62 @@ class PciResourceProvider:
|
||||
# If we are an allocated parent device, and our one-time-use flag
|
||||
# is set, we need to also set our inventory to reserved.
|
||||
# NOTE(danms): VERY IMPORTANT: we never *ever* want to update
|
||||
# reserved to anything other than len(self.devs), and definitely
|
||||
# reserved to anything other than self.total, and definitely
|
||||
# not if we are not allocated. These devices are intended to go
|
||||
# from unallocated to allocated AND reserved. They may be
|
||||
# unreserved by an external entity, but never nova.
|
||||
inventory['reserved'] = len(self.devs)
|
||||
inventory['reserved'] = self.total
|
||||
|
||||
def _adjust_for_removals_and_held_devices(
|
||||
self,
|
||||
provider_tree: provider_tree.ProviderTree,
|
||||
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]],
|
||||
) -> None:
|
||||
|
||||
rp_uuid = provider_tree.data(self.name).uuid
|
||||
rc_usage = rp_rc_usage[rp_uuid]
|
||||
|
||||
if not self.resource_class:
|
||||
# The resource_class is undefined when there are no normal devices
|
||||
# exists any more on this RP. If no normal devs exists then there
|
||||
# is no device_spec to derive the RC and traits from. But if we
|
||||
# still have allocations in placement against this RP that means
|
||||
# there are devices removed from the configuration but kept in the
|
||||
# tracker as they are still allocated. In this case we
|
||||
# need to recover the resource class and traits from the
|
||||
# existing allocation.
|
||||
if len(rc_usage) == 0:
|
||||
# no usage so nothing to adjust here
|
||||
return
|
||||
else:
|
||||
# The len > 1 case should not happen for PCI RPs as we either
|
||||
# track the parent PF or the child VFs there on the RP but
|
||||
# never both.
|
||||
self.resource_class = list(rc_usage.keys())[0]
|
||||
self.traits = provider_tree.data(rp_uuid).traits
|
||||
|
||||
# If device being removed but still held due to still having
|
||||
# allocations then we need to adjust the total inventory to never go
|
||||
# below the current usage otherwise Placement will reject the update.
|
||||
usage = rc_usage[self.resource_class]
|
||||
inventory = self.total
|
||||
if usage > inventory:
|
||||
LOG.warning(
|
||||
"Needed to adjust inventories of %s on "
|
||||
"resource provider %s from %d to %d due to existing "
|
||||
"placement allocations. This should only happen while "
|
||||
"VMs using already removed devices.",
|
||||
self.resource_class, self.name, inventory, usage)
|
||||
# This is counted into self.total to adjust the inventory
|
||||
self.adjustment += usage - inventory
|
||||
|
||||
def update_provider_tree(
|
||||
self,
|
||||
provider_tree: provider_tree.ProviderTree,
|
||||
parent_rp_name: str,
|
||||
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]],
|
||||
) -> None:
|
||||
|
||||
if self.to_be_deleted:
|
||||
# This means we need to delete the RP from placement if exists
|
||||
if provider_tree.exists(self.name):
|
||||
# NOTE(gibi): If there are allocations on this RP then
|
||||
# Placement will reject the update the provider_tree is
|
||||
# synced up.
|
||||
provider_tree.remove(self.name)
|
||||
|
||||
return
|
||||
|
||||
if not provider_tree.exists(self.name):
|
||||
# NOTE(gibi): We need to generate UUID for the new provider in Nova
|
||||
# instead of letting Placement assign one. We are potentially
|
||||
@@ -294,6 +338,14 @@ class PciResourceProvider:
|
||||
uuid=uuidutils.generate_uuid(dashed=True)
|
||||
)
|
||||
|
||||
self._adjust_for_removals_and_held_devices(provider_tree, rp_rc_usage)
|
||||
|
||||
# if after the adjustment no inventory left then we need to delete
|
||||
# the RP explicitly
|
||||
if self.total == 0:
|
||||
provider_tree.remove(self.name)
|
||||
return
|
||||
|
||||
provider_tree.update_inventory(
|
||||
self.name,
|
||||
self._get_inventories(),
|
||||
@@ -385,13 +437,13 @@ class PciResourceProvider:
|
||||
return updated
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.devs:
|
||||
if not self.to_be_deleted:
|
||||
return (
|
||||
f"RP({self.name}, {self.resource_class}={len(self.devs)}, "
|
||||
f"RP({self.name}, {self.resource_class}={self.total}, "
|
||||
f"traits={','.join(sorted(self.traits or set()))})"
|
||||
)
|
||||
else:
|
||||
return f"RP({self.name}, <EMPTY>)"
|
||||
return f"RP({self.name}, <to be deleted>)"
|
||||
|
||||
|
||||
class PlacementView:
|
||||
@@ -425,18 +477,6 @@ class PlacementView:
|
||||
|
||||
return self._get_rp_name_for_address(dev.parent_addr)
|
||||
|
||||
def _add_child(
|
||||
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
|
||||
) -> None:
|
||||
rp_name = self._get_rp_name_for_child(dev)
|
||||
self._ensure_rp(rp_name).add_child(dev, dev_spec_tags)
|
||||
|
||||
def _add_parent(
|
||||
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
|
||||
) -> None:
|
||||
rp_name = self._get_rp_name_for_address(dev.address)
|
||||
self._ensure_rp(rp_name).add_parent(dev, dev_spec_tags)
|
||||
|
||||
def _add_dev(
|
||||
self, dev: pci_device.PciDevice, dev_spec_tags: ty.Dict[str, str]
|
||||
) -> None:
|
||||
@@ -447,10 +487,11 @@ class PlacementView:
|
||||
# devices in placement.
|
||||
return
|
||||
|
||||
rp = self._ensure_rp_for_dev(dev)
|
||||
if dev.dev_type in PARENT_TYPES:
|
||||
self._add_parent(dev, dev_spec_tags)
|
||||
rp.add_parent(dev, dev_spec_tags)
|
||||
elif dev.dev_type in CHILD_TYPES:
|
||||
self._add_child(dev, dev_spec_tags)
|
||||
rp.add_child(dev, dev_spec_tags)
|
||||
else:
|
||||
msg = _(
|
||||
"Unhandled PCI device type %(type)s for %(dev)s. Please "
|
||||
@@ -461,51 +502,92 @@ class PlacementView:
|
||||
}
|
||||
raise exception.PlacementPciException(error=msg)
|
||||
|
||||
def _remove_child(self, dev: pci_device.PciDevice) -> None:
|
||||
rp_name = self._get_rp_name_for_child(dev)
|
||||
self._ensure_rp(rp_name).remove_child(dev)
|
||||
|
||||
def _remove_parent(self, dev: pci_device.PciDevice) -> None:
|
||||
rp_name = self._get_rp_name_for_address(dev.address)
|
||||
self._ensure_rp(rp_name).remove_parent(dev)
|
||||
|
||||
def _remove_dev(self, dev: pci_device.PciDevice) -> None:
|
||||
"""Remove PCI devices from Placement that existed before but now
|
||||
deleted from the hypervisor or unlisted from [pci]device_spec
|
||||
"""
|
||||
rp = self._ensure_rp_for_dev(dev)
|
||||
if dev.dev_type in PARENT_TYPES:
|
||||
self._remove_parent(dev)
|
||||
rp.remove_parent(dev)
|
||||
elif dev.dev_type in CHILD_TYPES:
|
||||
self._remove_child(dev)
|
||||
rp.remove_child(dev)
|
||||
|
||||
def _ensure_rp_for_dev(
|
||||
self, dev: pci_device.PciDevice
|
||||
) -> PciResourceProvider:
|
||||
"""Ensures that the RP exists for the device and returns it
|
||||
but does not do any inventory accounting for the given device on
|
||||
the RP.
|
||||
"""
|
||||
if dev.dev_type in PARENT_TYPES:
|
||||
rp_name = self._get_rp_name_for_address(dev.address)
|
||||
return self._ensure_rp(rp_name)
|
||||
elif dev.dev_type in CHILD_TYPES:
|
||||
rp_name = self._get_rp_name_for_child(dev)
|
||||
return self._ensure_rp(rp_name)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unhandled PCI device type {dev.dev_type} "
|
||||
f"for dev {dev.address}.")
|
||||
|
||||
def process_dev(
|
||||
self,
|
||||
dev: pci_device.PciDevice,
|
||||
dev_spec: ty.Optional[devspec.PciDeviceSpec],
|
||||
) -> None:
|
||||
# NOTE(gibi): We never observer dev.status DELETED as when that is set
|
||||
# the device is also removed from the PCI tracker. So we can ignore
|
||||
# that state.
|
||||
if dev.status == fields.PciDeviceStatus.REMOVED:
|
||||
# NOTE(gibi): We need to handle the situation when an instance
|
||||
# uses a device where a dev_spec is removed. Here we need to keep
|
||||
# the device in the Placement view similarly how the PCI tracker
|
||||
# does it.
|
||||
# However, we also need to handle the situation when such VM is
|
||||
# being deleted. In that case we are called after the dev is freed
|
||||
# and marked as removed by the tracker so dev.instance_uuid is
|
||||
# None and dev.status is REMOVED. At this point the Placement
|
||||
# allocation for this dev is still not deleted so we still have to
|
||||
# keep the device in our view. The device will be deleted when the
|
||||
# PCI tracker is saved which happens after us.
|
||||
# However, we cannot overly eagerly keep devices here as a
|
||||
# device in REMOVED state might be a device that had no allocation
|
||||
# in Placement so it can be removed already without waiting for
|
||||
# the next periodic update when the device disappears from the
|
||||
# PCI tracker's list. If we are over eagerly keeping such device
|
||||
# when it is not allocated then that will prevent a single step
|
||||
# reconfiguration from whitelisting a VF to whitelisting its
|
||||
# parent PF, because the VF will be kept at restart and conflict
|
||||
# with the PF being added.
|
||||
|
||||
if dev.status in (
|
||||
fields.PciDeviceStatus.DELETED,
|
||||
fields.PciDeviceStatus.REMOVED,
|
||||
):
|
||||
# If the PCI tracker marked the device DELETED or REMOVED then
|
||||
# such device is not allocated, so we are free to drop it from
|
||||
# placement too.
|
||||
# We choose to remove these devs so the happy path of removing
|
||||
# not allocated devs is simple. And then we do an extra
|
||||
# step later in update_provider_tree to reconcile Placement
|
||||
# allocations with our view and add back some inventories to handle
|
||||
# removed but allocated devs.
|
||||
self._remove_dev(dev)
|
||||
else:
|
||||
if not dev_spec:
|
||||
if dev.instance_uuid:
|
||||
LOG.warning(
|
||||
"Device spec is not found for device %s in "
|
||||
"[pci]device_spec. We are skipping this devices "
|
||||
"during Placement update. The device is allocated by "
|
||||
"%s. You should not remove an allocated device from "
|
||||
"the configuration. Please restore the configuration "
|
||||
"or cold migrate the instance to resolve the "
|
||||
"inconsistency.",
|
||||
"[pci]device_spec. The device is allocated by "
|
||||
"%s. We are keeping this device in the Placement "
|
||||
"view. You should not remove an allocated device from "
|
||||
"the configuration. Please restore the configuration. "
|
||||
"If you cannot restore the configuration as the "
|
||||
"device is dead then delete or cold migrate the "
|
||||
"instance and then restart the nova-compute service "
|
||||
"to resolve the inconsistency.",
|
||||
dev.address,
|
||||
dev.instance_uuid
|
||||
)
|
||||
# We need to keep the RP, but we cannot just use _add_dev
|
||||
# to generate the inventory on the RP as that would require
|
||||
# to know the dev_spec to e.g. have the RC. So we only
|
||||
# ensure that the RP exists, the inventory will be adjusted
|
||||
# based on the existing allocation in a later step.
|
||||
self._ensure_rp_for_dev(dev)
|
||||
else:
|
||||
LOG.warning(
|
||||
"Device spec is not found for device %s in "
|
||||
@@ -525,11 +607,50 @@ class PlacementView:
|
||||
f"{', '.join(str(rp) for rp in self.rps.values())}"
|
||||
)
|
||||
|
||||
def update_provider_tree(
|
||||
@staticmethod
|
||||
def get_usage_per_rc_and_rp(
|
||||
allocations
|
||||
) -> ty.Dict[str, ty.Dict[str, int]]:
|
||||
"""Returns a dict keyed by RP uuid and the value is a dict of
|
||||
resource class: usage pairs telling how much total usage the given RP
|
||||
has from the given resource class across all the allocations.
|
||||
"""
|
||||
rp_rc_usage: ty.Dict[str, ty.Dict[str, int]] = (
|
||||
collections.defaultdict(lambda: collections.defaultdict(int)))
|
||||
for consumer in allocations.values():
|
||||
for rp_uuid, alloc in consumer["allocations"].items():
|
||||
for rc, amount in alloc["resources"].items():
|
||||
rp_rc_usage[rp_uuid][rc] += amount
|
||||
|
||||
return rp_rc_usage
|
||||
|
||||
def _remove_managed_rps_from_tree_not_in_view(
|
||||
self, provider_tree: provider_tree.ProviderTree
|
||||
) -> None:
|
||||
"""Removes PCI RPs from the provider_tree that are not present in the
|
||||
current PlacementView.
|
||||
"""
|
||||
rp_names_in_view = {rp.name for rp in self.rps.values()}
|
||||
uuids_in_tree = provider_tree.get_provider_uuids_in_tree(
|
||||
self.root_rp_name)
|
||||
for rp_uuid in uuids_in_tree:
|
||||
rp_data = provider_tree.data(rp_uuid)
|
||||
is_pci_rp = provider_tree.has_traits(
|
||||
rp_uuid, [os_traits.COMPUTE_MANAGED_PCI_DEVICE])
|
||||
if is_pci_rp and rp_data.name not in rp_names_in_view:
|
||||
provider_tree.remove(rp_uuid)
|
||||
|
||||
def update_provider_tree(
|
||||
self,
|
||||
provider_tree: provider_tree.ProviderTree,
|
||||
allocations: dict,
|
||||
) -> None:
|
||||
self._remove_managed_rps_from_tree_not_in_view(provider_tree)
|
||||
|
||||
rp_rc_usage = self.get_usage_per_rc_and_rp(allocations)
|
||||
for rp_name, rp in self.rps.items():
|
||||
rp.update_provider_tree(provider_tree, self.root_rp_name)
|
||||
rp.update_provider_tree(
|
||||
provider_tree, self.root_rp_name, rp_rc_usage)
|
||||
|
||||
def update_allocations(
|
||||
self,
|
||||
@@ -639,9 +760,10 @@ def update_provider_tree_for_pci(
|
||||
dev_spec = pci_tracker.dev_filter.get_devspec(dev)
|
||||
pv.process_dev(dev, dev_spec)
|
||||
|
||||
pv.update_provider_tree(provider_tree, allocations)
|
||||
|
||||
LOG.info("Placement PCI resource view: %s", pv)
|
||||
|
||||
pv.update_provider_tree(provider_tree)
|
||||
old_alloc = copy.deepcopy(allocations)
|
||||
# update_provider_tree correlated the PciDevice objects with RPs in
|
||||
# placement and recorded the RP UUID in the PciDevice object. We need to
|
||||
|
||||
@@ -23,7 +23,6 @@ from oslo_serialization import jsonutils
|
||||
|
||||
from nova import exception
|
||||
from nova.tests.fixtures import libvirt as fakelibvirt
|
||||
from nova.tests.functional.api import client
|
||||
from nova.tests.functional.libvirt import test_pci_sriov_servers
|
||||
|
||||
CONF = cfg.CONF
|
||||
@@ -42,12 +41,16 @@ WARN_PCI_TRACKER_HELD_DEVICE = (
|
||||
"this warning.")
|
||||
|
||||
WARN_PCI_PLACEMENT_HELD_DEVICE = (
|
||||
"WARNING [nova.compute.pci_placement_translator] Device spec is "
|
||||
"not found for device %s in [pci]device_spec. We are "
|
||||
"skipping this devices during Placement update. The device is "
|
||||
"allocated by %s. You should not remove an allocated device from "
|
||||
"the configuration. Please restore the configuration or cold "
|
||||
"migrate the instance to resolve the inconsistency."
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Device spec is not found for device %s in "
|
||||
"[pci]device_spec. The device is allocated by "
|
||||
"%s. We are keeping this device in the Placement "
|
||||
"view. You should not remove an allocated device from "
|
||||
"the configuration. Please restore the configuration. "
|
||||
"If you cannot restore the configuration as the "
|
||||
"device is dead then delete or cold migrate the "
|
||||
"instance and then restart the nova-compute service "
|
||||
"to resolve the inconsistency."
|
||||
)
|
||||
|
||||
|
||||
@@ -799,6 +802,34 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
|
||||
# Now delete the service as the warning suggested. It should work.
|
||||
self._delete_server(server)
|
||||
|
||||
# The allocation successfully removed
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
"CUSTOM_PCI_8086_1528": 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
self.restart_compute_service(hostname="compute1")
|
||||
|
||||
# The next compute restart won't trigger any warning
|
||||
self.assertNotIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] ",
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
# And the device is now removed from Placement
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
def test_device_reconfiguration_with_allocations_config_change_stop(self):
|
||||
self._create_one_compute_with_a_pf_consumed_by_an_instance()
|
||||
|
||||
@@ -892,28 +923,48 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.0", server['id']),
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
# Delete the server as the warning suggests. Unfortunately the deletion
|
||||
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
|
||||
ex = self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# The sever delete fails as nova tries to delete the RP while it still
|
||||
# has allocations.
|
||||
self.assertRegex(
|
||||
# no placement error is reported
|
||||
self.assertNotRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# The instance is put into ERROR state.
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertEqual(server['status'], 'ERROR')
|
||||
self.stdlog.delete_stored_logs()
|
||||
# the deletion succeeds
|
||||
self._delete_server(server)
|
||||
# no placement error is reported
|
||||
self.assertNotRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# And the allocation is not removed.
|
||||
# The allocation is removed from placement
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
self.PF_RC: 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
# The PCI device is removed from the PCI tracker when the device is
|
||||
# freed, but not from Placement yet because during VM deletion the
|
||||
# resource tracker updated before the Placement allocation is dropped
|
||||
# so we cannot drop the Placement inventory during delete.
|
||||
self.assertPCIDeviceCounts("compute1", total=0, free=0)
|
||||
|
||||
# We need a periodics run to trigger the deletion of the device in
|
||||
# Placement
|
||||
self._run_periodics()
|
||||
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
@@ -1002,26 +1053,40 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
|
||||
# Delete the server as the warning suggests. Unfortunately the deletion
|
||||
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
|
||||
ex = self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# The sever delete fails as nova tries to delete the RP while it still
|
||||
# has allocations.
|
||||
self.assertRegex(
|
||||
self.stdlog.delete_stored_logs()
|
||||
# the deletion succeeds
|
||||
self._delete_server(server)
|
||||
# no placement error is reported
|
||||
self.assertNotRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# The instance is put into ERROR state.
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertEqual(server['status'], 'ERROR')
|
||||
# The allocation is removed from placement
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
self.VF_RC: 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
# And the allocation is not removed.
|
||||
# The PCI device is removed from the PCI tracker when the device is
|
||||
# freed, but not from Placement yet because during VM deletion the
|
||||
# resource tracker updated before the Placement allocation is dropped
|
||||
# so we cannot drop the Placement inventory during delete.
|
||||
self.assertPCIDeviceCounts("compute1", total=0, free=0)
|
||||
|
||||
# We need a periodics run to trigger the deletion of the device in
|
||||
# Placement
|
||||
self._run_periodics()
|
||||
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
@@ -1033,14 +1098,17 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
# remove 0000:81:00.* VFs from the device spec and restart the compute
|
||||
device_spec = self._to_list_of_json_str([])
|
||||
self.flags(group='pci', device_spec=device_spec)
|
||||
self.restart_compute_service(hostname="compute1")
|
||||
# One of the VFs is used but all of them is removed from the config.
|
||||
# The PciTracker warns but keeps the allocated device so the placement
|
||||
# logic mimic this and only warns but keeps the RP and the allocation
|
||||
# in placement intact.
|
||||
self.restart_compute_service(hostname="compute1")
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
self._run_periodics()
|
||||
# The non allocated VF is removed while the allocated one is kept
|
||||
compute1_expected_placement_view["inventories"] = {
|
||||
"0000:81:00.0": {
|
||||
self.VF_RC: 1
|
||||
},
|
||||
}
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
# the warning from the PciTracker
|
||||
@@ -1048,41 +1116,54 @@ class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
|
||||
WARN_PCI_TRACKER_HELD_DEVICE % (server['id'], "1:0000:81:00.2"),
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
# the warning from the placement PCI tracking logic
|
||||
# two warnings from the placement PCI tracking logic
|
||||
self.assertIn(
|
||||
WARN_PCI_PLACEMENT_HELD_DEVICE % ("0000:81:00.2", server['id']),
|
||||
self.stdlog.logger.output,
|
||||
)
|
||||
# however there is an error as well as nova tries to remove the
|
||||
# RP that has allocations this already a signal of the bug
|
||||
# https://bugs.launchpad.net/nova/+bug/2115905
|
||||
self.assertRegex(
|
||||
self.assertIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Needed to adjust inventories of CUSTOM_PCI_8086_1515 on "
|
||||
"resource provider compute1_0000:81:00.0 from 0 to 1 due to "
|
||||
"existing placement allocations. This should only happen while "
|
||||
"VMs using already removed devices.",
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
)
|
||||
|
||||
self.stdlog.delete_stored_logs()
|
||||
# Delete the server as the warning suggests. Unfortunately the deletion
|
||||
# fails. This is bug https://bugs.launchpad.net/nova/+bug/2115905
|
||||
ex = self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
self.assertIn("Unexpected API Error. Please report this", str(ex))
|
||||
|
||||
# We have the same RP deletion error as before.
|
||||
self.assertRegex(
|
||||
# the deletion succeeds
|
||||
self._delete_server(server)
|
||||
# no placement error is reported
|
||||
self.assertNotRegex(
|
||||
self.stdlog.logger.output,
|
||||
"ERROR .nova.scheduler.client.report..*Failed to delete "
|
||||
"resource provider with UUID.*from the placement API. "
|
||||
"Got 409.*Unable to delete resource provider.*Resource "
|
||||
"provider has allocations.")
|
||||
|
||||
# The instance is put into ERROR state.
|
||||
server = self.api.get_server(server['id'])
|
||||
self.assertEqual(server['status'], 'ERROR')
|
||||
# The allocation is removed from placement
|
||||
compute1_expected_placement_view["usages"] = {
|
||||
"0000:81:00.0": {
|
||||
self.VF_RC: 0,
|
||||
}
|
||||
}
|
||||
compute1_expected_placement_view["allocations"].pop(server["id"])
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
# And the allocation is not removed.
|
||||
# The PCI device is removed from the PCI tracker when the device is
|
||||
# freed, but not from Placement yet because during VM deletion the
|
||||
# resource tracker updated before the Placement allocation is dropped
|
||||
# so we cannot drop the Placement inventory during delete.
|
||||
self.assertPCIDeviceCounts("compute1", total=0, free=0)
|
||||
|
||||
# We need a periodics run to trigger the deletion of the device in
|
||||
# Placement
|
||||
self._run_periodics()
|
||||
|
||||
compute1_expected_placement_view["inventories"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["traits"].pop("0000:81:00.0")
|
||||
compute1_expected_placement_view["usages"].pop("0000:81:00.0")
|
||||
self.assert_placement_pci_view(
|
||||
"compute1", **compute1_expected_placement_view)
|
||||
|
||||
|
||||
@@ -11,7 +11,10 @@
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import collections
|
||||
import ddt
|
||||
import os_traits
|
||||
from oslo_utils.fixture import uuidsentinel as uuids
|
||||
from unittest import mock
|
||||
|
||||
@@ -60,14 +63,15 @@ class TestTranslator(test.NoDBTestCase):
|
||||
# So we have a device but there is no spec for it
|
||||
pci_tracker.dev_filter.get_devspec = mock.Mock(return_value=None)
|
||||
pci_tracker.dev_filter.specs = []
|
||||
# we expect that the provider_tree is not touched as the device without
|
||||
# spec is skipped, we assert that with the NonCallableMock
|
||||
provider_tree = mock.NonCallableMock()
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
|
||||
ppt.update_provider_tree_for_pci(
|
||||
provider_tree, "fake-node", pci_tracker, {}, [])
|
||||
with mock.patch.object(pt, "remove", new=mock.NonCallableMock()):
|
||||
ppt.update_provider_tree_for_pci(
|
||||
pt, "fake-node", pci_tracker, {}, [])
|
||||
|
||||
self.assertIn(
|
||||
"WARNING [nova.compute.pci_placement_translator] "
|
||||
"Device spec is not found for device 0000:81:00.0 in "
|
||||
"[pci]device_spec. Ignoring device in Placement resource view. "
|
||||
"This should not happen. Please file a bug.",
|
||||
@@ -261,7 +265,7 @@ class TestTranslator(test.NoDBTestCase):
|
||||
|
||||
pv._add_dev(vf, {})
|
||||
pv._add_dev(pf, {})
|
||||
pv.update_provider_tree(pt)
|
||||
pv.update_provider_tree(pt, {})
|
||||
|
||||
self.assertEqual(
|
||||
pt.data("fake-node_0000:71:00.0").uuid, vf.extra_info["rp_uuid"]
|
||||
@@ -331,7 +335,7 @@ class TestTranslator(test.NoDBTestCase):
|
||||
self.assertRaisesRegex(exception.PlacementPciException,
|
||||
'Only.*may set one_time_use',
|
||||
pv._add_dev, vf2, {'one_time_use': 'false'})
|
||||
pv.update_provider_tree(pt)
|
||||
pv.update_provider_tree(pt, {})
|
||||
|
||||
# These are both OTU, make sure we get the trait added
|
||||
self.assertIn('HW_PCI_ONE_TIME_USE',
|
||||
@@ -375,21 +379,21 @@ class TestTranslator(test.NoDBTestCase):
|
||||
).inventory['CUSTOM_PCI_DEAD_BEEF'].get('reserved', 0))
|
||||
|
||||
# Before allocation, reserved is unset
|
||||
pv.update_provider_tree(pt)
|
||||
pv.update_provider_tree(pt, {})
|
||||
assert_inventory(71, 0)
|
||||
assert_inventory(72, 0)
|
||||
|
||||
# After allocation, reserved gets set to total (only for the device
|
||||
# that is used)
|
||||
pf.instance_uuid = uuids.instance
|
||||
pv.update_provider_tree(pt)
|
||||
pv.update_provider_tree(pt, {})
|
||||
assert_inventory(71, 0)
|
||||
assert_inventory(72, 1)
|
||||
|
||||
# After deallocation, reserved is again unchanged (i.e. never
|
||||
# decremented)
|
||||
pf.instance_uuid = None
|
||||
pv.update_provider_tree(pt)
|
||||
pv.update_provider_tree(pt, {})
|
||||
assert_inventory(71, 0)
|
||||
assert_inventory(72, 1)
|
||||
|
||||
@@ -412,3 +416,237 @@ class TestTranslator(test.NoDBTestCase):
|
||||
|
||||
pci_tracker.stats.populate_pools_metadata_from_assigned_devices.\
|
||||
assert_called_once_with()
|
||||
|
||||
def _convert_defaultdict_to_dict(self, d):
|
||||
if not isinstance(d, collections.defaultdict):
|
||||
return d
|
||||
return {k: self._convert_defaultdict_to_dict(v) for k, v in d.items()}
|
||||
|
||||
def test_get_usage_per_rc_and_rp_no_allocations(self):
|
||||
actual = ppt.PlacementView.get_usage_per_rc_and_rp({})
|
||||
|
||||
self.assertEqual({}, self._convert_defaultdict_to_dict(actual))
|
||||
|
||||
def test_get_usage_per_rc_and_rp_empty_consumer(self):
|
||||
actual = ppt.PlacementView.get_usage_per_rc_and_rp({
|
||||
uuids.consumer1: {"allocations": {}}
|
||||
})
|
||||
|
||||
self.assertEqual({}, self._convert_defaultdict_to_dict(actual))
|
||||
|
||||
def test_get_usage_per_rc_and_rp(self):
|
||||
allocations = {
|
||||
uuids.consumer1: {
|
||||
"allocations": {
|
||||
uuids.rp1: {
|
||||
"resources": {
|
||||
"RC1": 1,
|
||||
"RC2": 3,
|
||||
},
|
||||
},
|
||||
uuids.rp2: {
|
||||
"resources": {
|
||||
"RC2": 5,
|
||||
"RC3": 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
uuids.consumer2: {
|
||||
"allocations": {
|
||||
uuids.rp2: {
|
||||
"resources": {
|
||||
"RC2": 1,
|
||||
"RC3": 3,
|
||||
},
|
||||
},
|
||||
uuids.rp3: {
|
||||
"resources": {
|
||||
"RC1": 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
uuids.consumer3: {
|
||||
"allocations": {
|
||||
uuids.rp1: {
|
||||
"resources": {
|
||||
"RC2": 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
actual = ppt.PlacementView.get_usage_per_rc_and_rp(allocations)
|
||||
|
||||
expected = {
|
||||
uuids.rp1: {
|
||||
"RC1": 1, # only from consumer1
|
||||
"RC2": 6, # from consumer1 (3) + consumer3 (3)
|
||||
},
|
||||
uuids.rp2: {
|
||||
"RC2": 6, # from consumer1 (5) + consumer2 (1)
|
||||
"RC3": 4, # from consumer1 (1) + consumer2 (3)
|
||||
},
|
||||
uuids.rp3: {
|
||||
"RC1": 1 # only from consumer2
|
||||
},
|
||||
}
|
||||
self.assertEqual(expected, self._convert_defaultdict_to_dict(actual))
|
||||
|
||||
def test_remove_managed_rps_empty_view(self):
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
|
||||
|
||||
pv = ppt.PlacementView("fake-node", [])
|
||||
pv._remove_managed_rps_from_tree_not_in_view(pt)
|
||||
|
||||
# No changes expected in the tree
|
||||
self.assertTrue(pt.exists("fake-node"))
|
||||
self.assertTrue(pt.exists("no-pci"))
|
||||
|
||||
def test_remove_managed_rps_new_rp_in_view(self):
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
|
||||
|
||||
pv = ppt.PlacementView("fake-node", [])
|
||||
pf = pci_device.PciDevice(
|
||||
address="0000:72:00.0",
|
||||
parent_addr=None,
|
||||
dev_type=fields.PciDeviceType.SRIOV_PF,
|
||||
vendor_id="dead",
|
||||
product_id="beef",
|
||||
status=fields.PciDeviceStatus.AVAILABLE,
|
||||
)
|
||||
|
||||
pv.process_dev(pf, devspec.PciDeviceSpec({}))
|
||||
pv._remove_managed_rps_from_tree_not_in_view(pt)
|
||||
|
||||
# No RPs is removed
|
||||
self.assertTrue(pt.exists("fake-node"))
|
||||
self.assertTrue(pt.exists("no-pci"))
|
||||
|
||||
def test_remove_managed_rps_no_rp_change(self):
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
|
||||
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
|
||||
pt.add_traits(
|
||||
"fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE)
|
||||
|
||||
pv = ppt.PlacementView("fake-node", [])
|
||||
pf = pci_device.PciDevice(
|
||||
address="0000:72:00.0",
|
||||
parent_addr=None,
|
||||
dev_type=fields.PciDeviceType.SRIOV_PF,
|
||||
vendor_id="dead",
|
||||
product_id="beef",
|
||||
status=fields.PciDeviceStatus.AVAILABLE,
|
||||
)
|
||||
|
||||
pv.process_dev(pf, devspec.PciDeviceSpec({}))
|
||||
pv._remove_managed_rps_from_tree_not_in_view(pt)
|
||||
|
||||
# The existing PCI RP is kept as it exists in the View as well
|
||||
self.assertTrue(pt.exists("fake-node"))
|
||||
self.assertTrue(pt.exists("no-pci"))
|
||||
self.assertTrue(pt.exists("fake-node_0000:72:00.0"))
|
||||
|
||||
def test_remove_managed_rps_rp_removed_from_view(self):
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("no-pci", uuids.compute_rp, uuids.non_pci)
|
||||
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
|
||||
pt.add_traits(
|
||||
"fake-node_0000:72:00.0", os_traits.COMPUTE_MANAGED_PCI_DEVICE)
|
||||
|
||||
pv = ppt.PlacementView("fake-node", [])
|
||||
pv._remove_managed_rps_from_tree_not_in_view(pt)
|
||||
|
||||
self.assertTrue(pt.exists("fake-node"))
|
||||
self.assertTrue(pt.exists("no-pci"))
|
||||
# The PCI RP is removed as it is not in the View anymore
|
||||
self.assertFalse(pt.exists("fake-node_0000:72:00.0"))
|
||||
|
||||
def test_adjust_for_removals_no_allocation_no_adjustment(self):
|
||||
rp = ppt.PciResourceProvider("pci")
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("pci", uuids.compute_rp, uuids.pci)
|
||||
usage = {uuids.pci: {}}
|
||||
|
||||
rp._adjust_for_removals_and_held_devices(pt, usage)
|
||||
|
||||
self.assertEqual(0, rp.adjustment)
|
||||
|
||||
def test_adjust_for_removals_allocated_configured_no_adjustment(self):
|
||||
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
|
||||
pf = pci_device.PciDevice(
|
||||
address="0000:72:00.0",
|
||||
parent_addr=None,
|
||||
dev_type=fields.PciDeviceType.SRIOV_PF,
|
||||
vendor_id="dead",
|
||||
product_id="beef",
|
||||
status=fields.PciDeviceStatus.ALLOCATED,
|
||||
)
|
||||
rp.add_parent(pf, {})
|
||||
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
|
||||
|
||||
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}}
|
||||
|
||||
rp._adjust_for_removals_and_held_devices(pt, usage)
|
||||
|
||||
self.assertEqual(0, rp.adjustment)
|
||||
self.assertEqual(1, rp.total)
|
||||
|
||||
def test_adjust_for_removals_allocated_removed(self):
|
||||
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
|
||||
vf = pci_device.PciDevice(
|
||||
address="0000:72:00.1",
|
||||
parent_addr="0000:72:00.0",
|
||||
dev_type=fields.PciDeviceType.SRIOV_VF,
|
||||
vendor_id="dead",
|
||||
product_id="beef",
|
||||
status=fields.PciDeviceStatus.ALLOCATED,
|
||||
)
|
||||
rp.add_child(vf, {})
|
||||
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
|
||||
|
||||
# We have two allocations but only one VF left so one VF inventory
|
||||
# is missing. We expect an adjustment for that
|
||||
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 2}}
|
||||
|
||||
rp._adjust_for_removals_and_held_devices(pt, usage)
|
||||
|
||||
self.assertEqual(1, len(rp.devs))
|
||||
self.assertEqual(1, rp.adjustment)
|
||||
self.assertEqual(2, rp.total)
|
||||
self.assertIn(
|
||||
"Needed to adjust inventories", self.stdlog.logger.output)
|
||||
|
||||
def test_adjust_for_removals_allocated_removed_no_inventory(self):
|
||||
rp = ppt.PciResourceProvider("fake-node_0000:72:00.0")
|
||||
|
||||
pt = provider_tree.ProviderTree()
|
||||
pt.new_root("fake-node", uuids.compute_rp)
|
||||
pt.new_child("fake-node_0000:72:00.0", uuids.compute_rp, uuids.pci)
|
||||
|
||||
# One allocation exists but no inventory at all. The RC will be
|
||||
# inferred from the allocation and the inventory is adjusted
|
||||
usage = {uuids.pci: {"CUSTOM_PCI_DEAD_BEEF": 1}}
|
||||
|
||||
rp._adjust_for_removals_and_held_devices(pt, usage)
|
||||
|
||||
self.assertEqual(0, len(rp.devs))
|
||||
self.assertEqual(1, rp.adjustment)
|
||||
self.assertEqual(1, rp.total)
|
||||
self.assertIn(
|
||||
"Needed to adjust inventories", self.stdlog.logger.output)
|
||||
|
||||
Reference in New Issue
Block a user