Heal missing simple PCI allocation in the resource tracker

There can be existing instances with PCI allocations today. When PCI
tracking in Placement is enabled, the instance allocations in Placement
need to be updated to hold the PCI allocations as well.

This patch adds support for gathering such missing allocations and
issuing a reshape to update them in Placement. This logic will be kept
in the resource tracker until the scheduler gains support for creating
PCI allocations and such scheduler logic is made mandatory.

This patch only handles the simple case where healing is needed because
an instance exists with a PCI allocation and the whole PCI allocation is
missing from Placement. More complex cases (e.g. a resize to a different
number of PCI devs) will be handled by subsequent patches.

blueprint: pci-device-tracking-in-placement
Change-Id: I6d4ebcd3a4f83d4a2f937fac0b5249654cc46bc9
Balazs Gibizer
2022-08-08 12:20:04 +02:00
parent 48229b46b4
commit d483a696a5
3 changed files with 341 additions and 6 deletions
@@ -11,11 +11,14 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import copy
import typing as ty
import os_resource_classes
import os_traits
from oslo_log import log as logging
from oslo_utils import uuidutils
from nova.compute import provider_tree
from nova import exception
@@ -126,6 +129,10 @@ class PciResourceProvider:
def devs(self) -> ty.List[pci_device.PciDevice]:
return [self.parent_dev] if self.parent_dev else self.children_devs
@property
def to_be_deleted(self) -> bool:
return not bool(self.devs)
def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
if self.parent_dev:
raise exception.PlacementPciDependentDeviceException(
@@ -177,13 +184,34 @@ class PciResourceProvider:
# Nothing to do here. In update_provider_tree we handle the full RP.
pass
def _get_allocations(self) -> ty.Mapping[str, int]:
"""Return a dict of used resources keyed by consumer UUID.
Note that:
1) a single consumer can consume more than one resource from a single
RP, e.g. a VM with two VFs from the same parent PF
2) multiple consumers can consume resources from a single RP, e.g. two
VMs each consuming one VF from the same PF
3) regardless of how many consumers there are on a single PCI RP, they
always consume resources from the same resource class, as
we do not support dependent devices modelled by the same RP but
different resource classes.
"""
return collections.Counter(
[
dev.instance_uuid
for dev in self.devs
if "instance_uuid" in dev and dev.instance_uuid
]
)
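A minimal standalone sketch of the counting semantics documented above,
using plain dicts in place of PciDevice objects (the instance UUIDs are
made up):

import collections

devs = [
    {"instance_uuid": "inst-1"},  # first VF of a PF, consumed by inst-1
    {"instance_uuid": "inst-1"},  # second VF, consumed by the same instance
    {"instance_uuid": "inst-2"},  # third VF, consumed by another instance
    {},                           # a free VF with no consumer
]
usage = collections.Counter(
    d["instance_uuid"] for d in devs if d.get("instance_uuid"))
print(usage)  # Counter({'inst-1': 2, 'inst-2': 1})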
def update_provider_tree(
self,
provider_tree: provider_tree.ProviderTree,
parent_rp_name: str,
) -> None:
-if not self.parent_dev and not self.children_devs:
if self.to_be_deleted:
# This means we need to delete the RP from placement if it exists
if provider_tree.exists(self.name):
# NOTE(gibi): If there are allocations on this RP then
@@ -194,7 +222,16 @@ class PciResourceProvider:
return
if not provider_tree.exists(self.name):
-provider_tree.new_child(self.name, parent_rp_name)
# NOTE(gibi): We need to generate UUID for the new provider in Nova
# instead of letting Placement assign one. We are potentially
# healing a missing RP along with missing allocations on that RP.
# The allocation healing happens with POST /reshape, and that API
# only takes RP UUIDs.
provider_tree.new_child(
self.name,
parent_rp_name,
uuid=uuidutils.generate_uuid(dashed=True)
)
provider_tree.update_inventory(
self.name,
@@ -214,6 +251,43 @@ class PciResourceProvider:
)
provider_tree.update_traits(self.name, self.traits)
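To make the NOTE above concrete, a minimal sketch (the consumer UUID and
resource class are placeholders, not Nova API): the reshape payload is
keyed by RP UUID, so a provider that does not exist in Placement yet
still needs its UUID decided up front by Nova.

from oslo_utils import uuidutils

rp_uuid = uuidutils.generate_uuid(dashed=True)
# the RP is created in the provider tree with this UUID, and the same
# key is then usable in the healed allocations:
reshape_allocations = {
    "instance-uuid": {
        "allocations": {rp_uuid: {"resources": {"CUSTOM_PCI_DEV": 1}}},
    },
}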
def update_allocations(
self,
allocations: dict,
provider_tree: provider_tree.ProviderTree
) -> bool:
updated = False
if self.to_be_deleted:
# the RP is going away because the device was either removed from the
# hypervisor or the compute's config was changed to ignore the device.
return updated
# we assume here that if this RP has been created in the current round
# of healing then it already has a UUID assigned.
rp_uuid = provider_tree.data(self.name).uuid
for consumer, amount in self._get_allocations().items():
current_allocs = allocations[consumer]['allocations']
current_rp_allocs = current_allocs.get(rp_uuid)
if current_rp_allocs:
# update an existing allocation if the current one differs
current_rc_allocs = current_rp_allocs["resources"].get(
self.resource_class, 0)
if current_rc_allocs != amount:
current_rp_allocs[
"resources"][self.resource_class] = amount
updated = True
else:
# insert a new allocation as it is missing
current_allocs[rp_uuid] = {
"resources": {self.resource_class: amount}
}
updated = True
return updated
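A hand-written example of the in-place update performed above (the UUIDs
and the PCI resource class are placeholders):

allocations = {
    "instance-uuid": {
        "allocations": {
            "compute-rp-uuid": {"resources": {"VCPU": 2, "MEMORY_MB": 512}},
        },
    },
}
# healing one missing PCI device allocation on the RP "pci-rp-uuid"
# mutates the dict in place to also contain:
allocations["instance-uuid"]["allocations"]["pci-rp-uuid"] = {
    "resources": {"CUSTOM_PCI_8086_1234": 1},
}
# and update_allocations returns True so the caller can trigger a reshape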
def __str__(self) -> str:
if self.devs:
return (
@@ -348,6 +422,21 @@ class PlacementView:
for rp_name, rp in self.rps.items():
rp.update_provider_tree(provider_tree, self.root_rp_name)
def update_allocations(
self,
allocations: dict,
provider_tree: provider_tree.ProviderTree
) -> bool:
"""Updates the passed in allocations dict inplace with any PCI
allocations that is inferred from the PciDevice objects already added
to the view. It returns True if the allocations dict has been changed,
False otherwise.
"""
updated = False
for rp in self.rps.values():
updated |= rp.update_allocations(allocations, provider_tree)
return updated
def ensure_no_dev_spec_with_devname(dev_specs: ty.List[devspec.PciDeviceSpec]):
for dev_spec in dev_specs:
@@ -437,7 +526,16 @@ def update_provider_tree_for_pci(
LOG.info("Placement PCI resource view: %s", pv)
pv.update_provider_tree(provider_tree)
-# FIXME(gibi): Check allocations too based on pci_dev.instance_uuid and
-# if here was any update then we have to return True to trigger a reshape.
old_alloc = copy.deepcopy(allocations)
updated = pv.update_allocations(allocations, provider_tree)
-return False
if updated:
LOG.debug(
"Placement PCI view needs allocation healing. This should only "
"happen if [scheduler]pci_in_placement is still disabled. "
"Original allocations: %s New allocations: %s",
old_alloc,
allocations,
)
return updated
@@ -62,6 +62,9 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
def _to_device_spec_conf(spec_list):
return [jsonutils.dumps(x) for x in spec_list]
class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
def test_new_compute_init_with_pci_devs(self):
"""A brand new compute is started with multiple pci devices configured
for nova.
@@ -748,3 +751,188 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
"enabled.",
str(ex)
)
class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
def setUp(self):
super().setUp()
# Pre-configure a PCI alias to consume our devs
alias_pci = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
"name": "a-pci-dev",
}
alias_pf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PF_PROD_ID,
"device_type": "type-PF",
"name": "a-pf",
}
alias_vf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.VF_PROD_ID,
"device_type": "type-VF",
"name": "a-vf",
}
self.flags(
group='pci',
alias=self._to_pci_alias_conf([alias_pci, alias_pf, alias_vf]))
@staticmethod
def _to_pci_alias_conf(alias_list):
return [jsonutils.dumps(x) for x in alias_list]
def test_heal_single_pci_allocation(self):
# The fake libvirt will emulate on the host:
# * one type-PCI in slot 0
pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=1, num_pfs=0, num_vfs=0)
# the config matches the PCI dev
device_spec = self._to_device_spec_conf(
[
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
},
]
)
self.flags(group='pci', device_spec=device_spec)
# Start a compute *without* PCI tracking in placement
self.mock_pci_report_in_placement.return_value = False
self.start_compute(hostname="compute1", pci_info=pci_info)
self.assertPCIDeviceCounts("compute1", total=1, free=1)
# Create an instance that consumes our PCI dev
extra_spec = {"pci_passthrough:alias": "a-pci-dev:1"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=1, free=0)
# Restart the compute but now with PCI tracking enabled
self.mock_pci_report_in_placement.return_value = True
self.restart_compute_service("compute1")
# Assert that the PCI allocation is healed in placement
self.assertPCIDeviceCounts("compute1", total=1, free=0)
expected_placement_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
},
"traits": {
"0000:81:00.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 1}
},
"allocations": {
server['id']: {
"0000:81:00.0": {self.PCI_RC: 1}
}
}
}
self.assert_placement_pci_view("compute1", **expected_placement_view)
# run an update_available_resources periodic and assert that the usage
# and allocations stay
self._run_periodics()
self.assert_placement_pci_view("compute1", **expected_placement_view)
def test_heal_multiple_allocations(self):
# The fake libvirt will emulate on the host:
# * two type-PCI devs (slot 0 and 1)
# * two type-PFs (slot 2 and 3) with 4 type-VFs each
pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=2, num_pfs=2, num_vfs=8)
# the config matches:
device_spec = self._to_device_spec_conf(
[
# both type-PCI
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
},
# the PF in slot 2
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PF_PROD_ID,
"address": "0000:81:02.0",
},
# the VFs in slot 3
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.VF_PROD_ID,
"address": "0000:81:03.*",
},
]
)
self.flags(group='pci', device_spec=device_spec)
# Start a compute *without* PCI tracking in placement
self.mock_pci_report_in_placement.return_value = False
self.start_compute(hostname="compute1", pci_info=pci_info)
# 2 PCI + 1 PF + 4 VFs
self.assertPCIDeviceCounts("compute1", total=7, free=7)
# Create three instances consuming devices:
# * server_2pci: two type-PCI
# * server_pf_vf: one PF and one VF
# * server_2vf: two VFs
extra_spec = {"pci_passthrough:alias": "a-pci-dev:2"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_2pci = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=5)
extra_spec = {"pci_passthrough:alias": "a-pf:1,a-vf:1"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_pf_vf = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=3)
extra_spec = {"pci_passthrough:alias": "a-vf:2"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_2vf = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=1)
# Restart the compute but now with PCI tracking enabled
self.mock_pci_report_in_placement.return_value = True
self.restart_compute_service("compute1")
# Assert that the PCI allocations are healed in placement
self.assertPCIDeviceCounts("compute1", total=7, free=1)
expected_placement_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 4},
},
"traits": {
"0000:81:00.0": [],
"0000:81:01.0": [],
"0000:81:02.0": [],
"0000:81:03.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 3},
},
"allocations": {
server_2pci['id']: {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
},
server_pf_vf['id']: {
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 1},
},
server_2vf['id']: {
"0000:81:03.0": {self.VF_RC: 2}
},
},
}
self.assert_placement_pci_view("compute1", **expected_placement_view)
# run an update_available_resources periodic and assert that the usage
# and allocations stay
self._run_periodics()
self.assert_placement_pci_view("compute1", **expected_placement_view)
@@ -80,7 +80,15 @@ class _PCIServersTestBase(base.ServersTestBase):
return rp
self.fail(f'RP {name} is not found in Placement {rps}')
-def assert_placement_pci_view(self, hostname, inventories, traits):
def assert_placement_pci_view(
self, hostname, inventories, traits, usages=None, allocations=None
):
if not usages:
usages = {}
if not allocations:
allocations = {}
compute_rp_uuid = self.compute_rp_uuids[hostname]
rps = self._get_all_rps_in_a_tree(compute_rp_uuid)
@@ -120,6 +128,47 @@ class _PCIServersTestBase(base.ServersTestBase):
f"Traits on RP {real_rp_name} does not match with expectation"
)
for rp_name, usage in usages.items():
real_rp_name = f'{hostname}_{rp_name}'
rp = self._get_rp_by_name(real_rp_name, rps)
rp_usage = self._get_provider_usages(rp['uuid'])
self.assertEqual(
usage,
rp_usage,
f"Usage on RP {real_rp_name} does not match with expectation"
)
for consumer, expected_allocations in allocations.items():
actual_allocations = self._get_allocations_by_server_uuid(consumer)
self.assertEqual(
len(expected_allocations),
# actual_allocations also contains allocations against the
# root provider for VCPU, MEMORY_MB, and DISK_GB so subtract
# one
len(actual_allocations) - 1,
f"The consumer {consumer} allocates from different number of "
f"RPs than expected. Expected: {expected_allocations}, "
f"Actual: {actual_allocations}"
)
for rp_name, expected_rp_allocs in expected_allocations.items():
real_rp_name = f'{hostname}_{rp_name}'
rp = self._get_rp_by_name(real_rp_name, rps)
self.assertIn(
rp['uuid'],
actual_allocations,
f"The consumer {consumer} expected to allocate from "
f"{rp['uuid']}. Expected: {expected_allocations}, "
f"Actual: {actual_allocations}"
)
actual_rp_allocs = actual_allocations[rp['uuid']]['resources']
self.assertEqual(
expected_rp_allocs,
actual_rp_allocs,
f"The consumer {consumer} expected to have allocation "
f"{expected_rp_allocs} on {rp_name} but it has "
f"{actual_rp_allocs} instead."
)
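For reference, the per-consumer data walked by the loop above has
roughly this shape (placeholder UUIDs); the entry for the root compute
RP is what the "- 1" in the length check accounts for:

actual_allocations = {
    "compute1-root-rp-uuid": {
        "resources": {"VCPU": 2, "MEMORY_MB": 512, "DISK_GB": 10}},
    "compute1_0000:81:00.0-rp-uuid": {
        "resources": {"CUSTOM_PCI_8086_1234": 1}},
}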
class _PCIServersWithMigrationTestBase(_PCIServersTestBase):