Heal missing simple PCI allocation in the resource tracker

There can be existing instances with PCI allocations today. When PCI
tracking in Placement is enabled, the instance allocations in Placement
need to be updated to hold the PCI allocations as well.

This patch adds support for gathering such missing allocations and
issuing a reshape to update them in Placement. This logic will be kept
in the resource tracker until the scheduler gains support for creating
PCI allocations and such scheduler logic is made mandatory.

This patch only handles the simple case where healing is needed because
an instance exists with a PCI allocation and the whole PCI allocation is
missing from Placement. More complex cases (e.g. a resize to a different
number of PCI devs) will be handled by subsequent patches.

blueprint: pci-device-tracking-in-placement
Change-Id: I6d4ebcd3a4f83d4a2f937fac0b5249654cc46bc9
Balazs Gibizer
2022-08-08 12:20:04 +02:00
parent 48229b46b4
commit d483a696a5
3 changed files with 341 additions and 6 deletions
@@ -11,11 +11,14 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
import copy
import typing as ty
import os_resource_classes
import os_traits
from oslo_log import log as logging
from oslo_utils import uuidutils
from nova.compute import provider_tree
from nova import exception
@@ -126,6 +129,10 @@ class PciResourceProvider:
def devs(self) -> ty.List[pci_device.PciDevice]:
return [self.parent_dev] if self.parent_dev else self.children_devs
@property
def to_be_deleted(self) -> bool:
return not bool(self.devs)
def add_child(self, dev, dev_spec_tags: ty.Dict[str, str]) -> None:
if self.parent_dev:
raise exception.PlacementPciDependentDeviceException(
@@ -177,13 +184,34 @@ class PciResourceProvider:
# Nothing to do here. In update_provider_tree we handle the full RP.
pass
def _get_allocations(self) -> ty.Mapping[str, int]:
"""Return a dict of used resources keyed by consumer UUID.
Note that:
1) a single consumer can consume more than one resource from a single
RP, e.g. a VM with two VFs from the same parent PF
2) multiple consumers can consume resources from a single RP, e.g. two
VMs each consuming one VF from the same PF
3) regardless of how many consumers there are on a single PCI RP, they
always consume resources from the same resource class, as
we do not support dependent devices modelled by the same RP but
different resource classes.
"""
return collections.Counter(
[
dev.instance_uuid
for dev in self.devs
if "instance_uuid" in dev and dev.instance_uuid
]
)
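A minimal standalone sketch of the counting semantics documented above,
using plain dicts in place of PciDevice objects (the instance UUIDs are
made up):

import collections

devs = [
    {"instance_uuid": "inst-1"},  # first VF of a PF, consumed by inst-1
    {"instance_uuid": "inst-1"},  # second VF, consumed by the same instance
    {"instance_uuid": "inst-2"},  # third VF, consumed by another instance
    {},                           # a free VF with no consumer
]
usage = collections.Counter(
    d["instance_uuid"] for d in devs if d.get("instance_uuid"))
print(usage)  # Counter({'inst-1': 2, 'inst-2': 1})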
def update_provider_tree(
self,
provider_tree: provider_tree.ProviderTree,
parent_rp_name: str,
) -> None:
-if not self.parent_dev and not self.children_devs:
if self.to_be_deleted:
# This means we need to delete the RP from placement if it exists
if provider_tree.exists(self.name):
# NOTE(gibi): If there are allocations on this RP then
@@ -194,7 +222,16 @@ class PciResourceProvider:
return
if not provider_tree.exists(self.name):
-provider_tree.new_child(self.name, parent_rp_name)
# NOTE(gibi): We need to generate UUID for the new provider in Nova
# instead of letting Placement assign one. We are potentially
# healing a missing RP along with missing allocations on that RP.
# The allocation healing happens with POST /reshape, and that API
# only takes RP UUIDs.
provider_tree.new_child(
self.name,
parent_rp_name,
uuid=uuidutils.generate_uuid(dashed=True)
)
provider_tree.update_inventory(
self.name,
@@ -214,6 +251,43 @@ class PciResourceProvider:
)
provider_tree.update_traits(self.name, self.traits)
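To make the NOTE above concrete, a minimal sketch (the consumer UUID and
resource class are placeholders, not Nova API): the reshape payload is
keyed by RP UUID, so a provider that does not exist in Placement yet
still needs its UUID decided up front by Nova.

from oslo_utils import uuidutils

rp_uuid = uuidutils.generate_uuid(dashed=True)
# the RP is created in the provider tree with this UUID, and the same
# key is then usable in the healed allocations:
reshape_allocations = {
    "instance-uuid": {
        "allocations": {rp_uuid: {"resources": {"CUSTOM_PCI_DEV": 1}}},
    },
}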
def update_allocations(
self,
allocations: dict,
provider_tree: provider_tree.ProviderTree
) -> bool:
updated = False
if self.to_be_deleted:
# the RP is going away because the device was either removed from the
# hypervisor or the compute's config was changed to ignore the device.
return updated
# we assume here that if this RP has been created in the current round
# of healing then it already has a UUID assigned.
rp_uuid = provider_tree.data(self.name).uuid
for consumer, amount in self._get_allocations().items():
current_allocs = allocations[consumer]['allocations']
current_rp_allocs = current_allocs.get(rp_uuid)
if current_rp_allocs:
# update an existing allocation if the current one differs
current_rc_allocs = current_rp_allocs["resources"].get(
self.resource_class, 0)
if current_rc_allocs != amount:
current_rp_allocs[
"resources"][self.resource_class] = amount
updated = True
else:
# insert a new allocation as it is missing
current_allocs[rp_uuid] = {
"resources": {self.resource_class: amount}
}
updated = True
return updated
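A hand-written example of the in-place update performed above (the UUIDs
and the PCI resource class are placeholders):

allocations = {
    "instance-uuid": {
        "allocations": {
            "compute-rp-uuid": {"resources": {"VCPU": 2, "MEMORY_MB": 512}},
        },
    },
}
# healing one missing PCI device allocation on the RP "pci-rp-uuid"
# mutates the dict in place to also contain:
allocations["instance-uuid"]["allocations"]["pci-rp-uuid"] = {
    "resources": {"CUSTOM_PCI_8086_1234": 1},
}
# and update_allocations returns True so the caller can trigger a reshape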
def __str__(self) -> str:
if self.devs:
return (
@@ -348,6 +422,21 @@ class PlacementView:
for rp_name, rp in self.rps.items():
rp.update_provider_tree(provider_tree, self.root_rp_name)
def update_allocations(
self,
allocations: dict,
provider_tree: provider_tree.ProviderTree
) -> bool:
"""Updates the passed in allocations dict inplace with any PCI
allocations that is inferred from the PciDevice objects already added
to the view. It returns True if the allocations dict has been changed,
False otherwise.
"""
updated = False
for rp in self.rps.values():
updated |= rp.update_allocations(allocations, provider_tree)
return updated
def ensure_no_dev_spec_with_devname(dev_specs: ty.List[devspec.PciDeviceSpec]):
for dev_spec in dev_specs:
@@ -437,7 +526,16 @@ def update_provider_tree_for_pci(
LOG.info("Placement PCI resource view: %s", pv)
pv.update_provider_tree(provider_tree)
-# FIXME(gibi): Check allocations too based on pci_dev.instance_uuid and
-# if here was any update then we have to return True to trigger a reshape.
old_alloc = copy.deepcopy(allocations)
updated = pv.update_allocations(allocations, provider_tree)
-return False
if updated:
LOG.debug(
"Placement PCI view needs allocation healing. This should only "
"happen if [scheduler]pci_in_placement is still disabled. "
"Original allocations: %s New allocations: %s",
old_alloc,
allocations,
)
return updated
@@ -62,6 +62,9 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
def _to_device_spec_conf(spec_list):
return [jsonutils.dumps(x) for x in spec_list]
class PlacementPCIInventoryReportingTests(PlacementPCIReportingTests):
def test_new_compute_init_with_pci_devs(self):
"""A brand new compute is started with multiple pci devices configured
for nova.
@@ -748,3 +751,188 @@ class PlacementPCIReportingTests(test_pci_sriov_servers._PCIServersTestBase):
"enabled.",
str(ex)
)
class PlacementPCIAllocationHealingTests(PlacementPCIReportingTests):
def setUp(self):
super().setUp()
# Pre-configure a PCI alias to consume our devs
alias_pci = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
"name": "a-pci-dev",
}
alias_pf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PF_PROD_ID,
"device_type": "type-PF",
"name": "a-pf",
}
alias_vf = {
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.VF_PROD_ID,
"device_type": "type-VF",
"name": "a-vf",
}
self.flags(
group='pci',
alias=self._to_pci_alias_conf([alias_pci, alias_pf, alias_vf]))
@staticmethod
def _to_pci_alias_conf(alias_list):
return [jsonutils.dumps(x) for x in alias_list]
def test_heal_single_pci_allocation(self):
# The fake libvirt will emulate on the host:
# * one type-PCI in slot 0
pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=1, num_pfs=0, num_vfs=0)
# the config matches the PCI dev
device_spec = self._to_device_spec_conf(
[
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
},
]
)
self.flags(group='pci', device_spec=device_spec)
# Start a compute *without* PCI tracking in placement
self.mock_pci_report_in_placement.return_value = False
self.start_compute(hostname="compute1", pci_info=pci_info)
self.assertPCIDeviceCounts("compute1", total=1, free=1)
# Create an instance that consumes our PCI dev
extra_spec = {"pci_passthrough:alias": "a-pci-dev:1"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=1, free=0)
# Restart the compute but now with PCI tracking enabled
self.mock_pci_report_in_placement.return_value = True
self.restart_compute_service("compute1")
# Assert that the PCI allocation is healed in placement
self.assertPCIDeviceCounts("compute1", total=1, free=0)
expected_placement_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
},
"traits": {
"0000:81:00.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 1}
},
"allocations": {
server['id']: {
"0000:81:00.0": {self.PCI_RC: 1}
}
}
}
self.assert_placement_pci_view("compute1", **expected_placement_view)
# run an update_available_resources periodic and assert that the usage
# and allocations stay
self._run_periodics()
self.assert_placement_pci_view("compute1", **expected_placement_view)
def test_heal_multiple_allocations(self):
# The fake libvirt will emulate on the host:
# * two type-PCI devs (slot 0 and 1)
# * two type-PFs (slot 2 and 3) with 4 type-VFs each
pci_info = fakelibvirt.HostPCIDevicesInfo(
num_pci=2, num_pfs=2, num_vfs=8)
# the config matches:
device_spec = self._to_device_spec_conf(
[
# both type-PCI
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PCI_PROD_ID,
},
# the PF in slot 2
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.PF_PROD_ID,
"address": "0000:81:02.0",
},
# the VFs in slot 3
{
"vendor_id": fakelibvirt.PCI_VEND_ID,
"product_id": fakelibvirt.VF_PROD_ID,
"address": "0000:81:03.*",
},
]
)
self.flags(group='pci', device_spec=device_spec)
# Start a compute *without* PCI tracking in placement
self.mock_pci_report_in_placement.return_value = False
self.start_compute(hostname="compute1", pci_info=pci_info)
# 2 PCI + 1 PF + 4 VFs
self.assertPCIDeviceCounts("compute1", total=7, free=7)
# Create three instances consuming devices:
# * server_2pci: two type-PCI
# * server_pf_vf: one PF and one VF
# * server_2vf: two VFs
extra_spec = {"pci_passthrough:alias": "a-pci-dev:2"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_2pci = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=5)
extra_spec = {"pci_passthrough:alias": "a-pf:1,a-vf:1"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_pf_vf = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=3)
extra_spec = {"pci_passthrough:alias": "a-vf:2"}
flavor_id = self._create_flavor(extra_spec=extra_spec)
server_2vf = self._create_server(flavor_id=flavor_id, networks=[])
self.assertPCIDeviceCounts("compute1", total=7, free=1)
# Restart the compute but now with PCI tracking enabled
self.mock_pci_report_in_placement.return_value = True
self.restart_compute_service("compute1")
# Assert that the PCI allocations are healed in placement
self.assertPCIDeviceCounts("compute1", total=7, free=1)
expected_placement_view = {
"inventories": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 4},
},
"traits": {
"0000:81:00.0": [],
"0000:81:01.0": [],
"0000:81:02.0": [],
"0000:81:03.0": [],
},
"usages": {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 3},
},
"allocations": {
server_2pci['id']: {
"0000:81:00.0": {self.PCI_RC: 1},
"0000:81:01.0": {self.PCI_RC: 1},
},
server_pf_vf['id']: {
"0000:81:02.0": {self.PF_RC: 1},
"0000:81:03.0": {self.VF_RC: 1},
},
server_2vf['id']: {
"0000:81:03.0": {self.VF_RC: 2}
},
},
}
self.assert_placement_pci_view("compute1", **expected_placement_view)
# run an update_available_resources periodic and assert that the usage
# and allocations stay
self._run_periodics()
self.assert_placement_pci_view("compute1", **expected_placement_view)
@@ -80,7 +80,15 @@ class _PCIServersTestBase(base.ServersTestBase):
return rp
self.fail(f'RP {name} is not found in Placement {rps}')
-def assert_placement_pci_view(self, hostname, inventories, traits):
def assert_placement_pci_view(
self, hostname, inventories, traits, usages=None, allocations=None
):
if not usages:
usages = {}
if not allocations:
allocations = {}
compute_rp_uuid = self.compute_rp_uuids[hostname]
rps = self._get_all_rps_in_a_tree(compute_rp_uuid)
@@ -120,6 +128,47 @@ class _PCIServersTestBase(base.ServersTestBase):
f"Traits on RP {real_rp_name} does not match with expectation"
)
for rp_name, usage in usages.items():
real_rp_name = f'{hostname}_{rp_name}'
rp = self._get_rp_by_name(real_rp_name, rps)
rp_usage = self._get_provider_usages(rp['uuid'])
self.assertEqual(
usage,
rp_usage,
f"Usage on RP {real_rp_name} does not match with expectation"
)
for consumer, expected_allocations in allocations.items():
actual_allocations = self._get_allocations_by_server_uuid(consumer)
self.assertEqual(
len(expected_allocations),
# actual_allocations also contains allocations against the
# root provider for VCPU, MEMORY_MB, and DISK_GB so subtract
# one
len(actual_allocations) - 1,
f"The consumer {consumer} allocates from different number of "
f"RPs than expected. Expected: {expected_allocations}, "
f"Actual: {actual_allocations}"
)
for rp_name, expected_rp_allocs in expected_allocations.items():
real_rp_name = f'{hostname}_{rp_name}'
rp = self._get_rp_by_name(real_rp_name, rps)
self.assertIn(
rp['uuid'],
actual_allocations,
f"The consumer {consumer} expected to allocate from "
f"{rp['uuid']}. Expected: {expected_allocations}, "
f"Actual: {actual_allocations}"
)
actual_rp_allocs = actual_allocations[rp['uuid']]['resources']
self.assertEqual(
expected_rp_allocs,
actual_rp_allocs,
f"The consumer {consumer} expected to have allocation "
f"{expected_rp_allocs} on {rp_name} but it has "
f"{actual_rp_allocs} instead."
)
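For reference, the per-consumer data walked by the loop above has
roughly this shape (placeholder UUIDs); the entry for the root compute
RP is what the "- 1" in the length check accounts for:

actual_allocations = {
    "compute1-root-rp-uuid": {
        "resources": {"VCPU": 2, "MEMORY_MB": 512, "DISK_GB": 10}},
    "compute1_0000:81:00.0-rp-uuid": {
        "resources": {"CUSTOM_PCI_8086_1234": 1}},
}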
class _PCIServersWithMigrationTestBase(_PCIServersTestBase):