Ignore metadata tags in pci/stats _find_pool logic

The stats module uses the _find_pool() call to find a matching pool for
a new device or a device that is being deallocated. If no existing pool
matches the dev then a new pool is created for it. The pool matching
logic was faulty as it did not remove all the metadata keys, such as
rp_uuid, from the pool before comparing. So if the dev did not have
that key but the pool did, the dev did not match.

On the other hand, the PCI allocation logic (when PCI in Placement is
enabled) assumed that devices belonging to a single rp_uuid are always
in a single pool. As this assumption was broken by the above bug, the
PCI allocation blindly tried to allocate resources for an rp_uuid from
each matching pool, causing overallocation.
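
A minimal, self-contained illustration of that overallocation (the
names are simplified stand-ins for the real consume_requests() loop):

    import collections

    # the split-pool state produced by the _find_pool() bug
    pools = [{"rp_uuid": "pf1", "count": 1}, {"rp_uuid": "pf1", "count": 1}]
    requested_per_rp = collections.Counter(["pf1"])  # one VF requested

    consumed = 0
    for pool in pools:
        count = requested_per_rp[pool["rp_uuid"]]  # 1 for *every* pool
        pool["count"] -= count
        consumed += count
    assert consumed == 2  # two devices consumed for a one-device request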

The main fix in this patch is to ignore the metadata tags in
_find_pool(). In addition, two safety nets are added to the allocation
logic. The logic now asserts that the assumption holds and, if it does
not (i.e. it found multiple pools with the same rp_uuid), it bails out.
It also never blindly allocates the same rp_uuid request from multiple
pools.
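
A sketch of the two safety nets, under the same simplified pool
representation as above (RuntimeError stands in for the real
PciDeviceRequestFailed exception):

    import collections

    def allocate(pools, rp_uuids):
        per_rp = collections.defaultdict(list)
        for pool in pools:
            if "rp_uuid" in pool:
                per_rp[pool["rp_uuid"]].append(pool)
        # safety net 1: bail out if an rp_uuid is split across pools
        if any(len(ps) > 1 for ps in per_rp.values()):
            raise RuntimeError("rp_uuid split across pools")

        requested = collections.Counter(rp_uuids)
        consumed = 0
        for pool in pools:
            # safety net 2: pop the rp_uuid once consumed so it can never
            # be allocated again from another pool
            count = requested.pop(pool.get("rp_uuid"), 0)
            pool["count"] -= count
            consumed += count
        return consumed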

Closes-Bug: #2098496
Change-Id: I9678230397fa1a3c735ee01ed756d5af3b4e1191
Balazs Gibizer
2025-03-13 16:24:27 +01:00
parent 32afd0c644
commit 229fb3513a
4 changed files with 180 additions and 55 deletions
@@ -104,6 +104,9 @@ class PciDeviceStats(object):
pool_keys = pool.copy()
del pool_keys['count']
del pool_keys['devices']
for tag in self.ignored_pool_tags:
pool_keys.pop(tag, None)
if (len(pool_keys.keys()) == len(dev_pool.keys()) and
self._equal_properties(dev_pool, pool_keys, list(dev_pool))):
return pool
@@ -309,12 +312,20 @@ class PciDeviceStats(object):
else:
# but if there is placement allocation then we have to follow
# it
if not self._assert_one_pool_per_rp_uuid(pools):
raise exception.PciDeviceRequestFailed(
requests=pci_requests)
requested_devs_per_pool_rp = collections.Counter(rp_uuids)
for pool in pools:
count = requested_devs_per_pool_rp[pool['rp_uuid']]
pool['count'] -= count
alloc_devices += self._allocate_devs(
pool, count, request.request_id)
# we consumed all the requested devices for the rp_uuid
# so we can drop that rp_uuid from the request.
requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
return alloc_devices
@@ -865,10 +876,18 @@ class PciDeviceStats(object):
break
else:
# but if there is placement allocation then we have to follow that
if not self._assert_one_pool_per_rp_uuid(pools):
return False
requested_devs_per_pool_rp = collections.Counter(rp_uuids)
for pool in filtered_pools:
count = requested_devs_per_pool_rp[pool['rp_uuid']]
pool['count'] -= count
# we consumed all the requested devices for the rp_uuid
# so we can drop that rp_uuid from the request.
requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
if pool['count'] == 0:
pools.remove(pool)
@@ -1024,3 +1043,31 @@ class PciDeviceStats(object):
if pool_rps: # now we know that it is a single RP
pool['rp_uuid'] = next(iter(pool_rps))
@staticmethod
def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool:
"""Asserts that each pool has a unique rp_uuid if any
:param pools: A list of Pool objects.
:return: True if each pool has a unique rp_uuid or no rp_uuid assigned,
False otherwise.
"""
pools_per_rp_uuid = collections.defaultdict(list)
for pool in pools:
if "rp_uuid" in pool:
pools_per_rp_uuid[pool["rp_uuid"]].append(pool)
split_rp_uuids = {
rp_uuid: pools
for rp_uuid, pools in pools_per_rp_uuid.items()
if len(pools) > 1}
if split_rp_uuids:
LOG.warning(
"The PCI allocation logic assumes that devices "
"related to the same rp_uuid are in the same pool. "
"However the following rp_uuids are split across multiple "
"pools. This should not happen. Please file a bug report. %s",
split_rp_uuids
)
return not split_rp_uuids
@@ -113,69 +113,23 @@ class PCIInPlacementCreateAfterDeleteTestCase(base.PlacementPCIReportingTests):
self.assertPCIDeviceCounts('compute1', total=2, free=2)
self.assert_placement_pci_view("compute1", **compute1_empty)
# This already shows the potential issue behind bug/2098496.
# After the server is deleted we are not back to a single pool of 2
# VFs but instead two separate pools with one device each but both
# related to the same rp_uuid. This breaks an assumption of the pool
# allocation logic later.
pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
self.ctxt, "compute1").pci_device_pools
self.assertEqual(len(pools), 2)
self.assertEqual(pools[0].count, 1)
self.assertEqual(pools[1].count, 1)
self.assertEqual(pools[0].tags['rp_uuid'], pools[1].tags['rp_uuid'])
# This is the expected state after the bug is fixed
# assert that the single pool is not broken into two during the
# de-allocation
# pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
# self.ctxt, "compute1").pci_device_pools
# self.assertEqual(len(pools), 1)
# self.assertEqual(pools[0].count, 2)
pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
self.ctxt, "compute1").pci_device_pools
self.assertEqual(len(pools), 1)
self.assertEqual(pools[0].count, 2)
server_1vf = self._create_server(flavor_id=flavor_id, networks=[])
# This is bug/2098496 as the VM now consumes 2 VFs instead of one it
# requested
self.assertPCIDeviceCounts('compute1', total=2, free=0)
self.assertPCIDeviceCounts('compute1', total=2, free=1)
compute1_expected_placement_view = copy.deepcopy(compute1_empty)
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {rc: 2}
"0000:81:00.0": {rc: 1}
}
compute1_expected_placement_view["allocations"][server_1vf["id"]] = {
"0000:81:00.0": {rc: 2},
"0000:81:00.0": {rc: 1},
}
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# prove that the placement view got corrupted only after the claim
# on the compute due to the allocation healing logic
last_healing = self.pci_healing_fixture.last_healing("compute1")
alloc_before = last_healing[0]
alloc_after = last_healing[1]
def alloc_by_rc(allocations, rc):
return [
alloc["resources"][rc]
for alloc in allocations.values()
if rc in alloc["resources"]
]
pci_alloc_before = alloc_by_rc(
alloc_before[server_1vf["id"]]["allocations"], rc)
self.assertEqual(1, sum(pci_alloc_before))
pci_alloc_after = alloc_by_rc(
alloc_after[server_1vf["id"]]["allocations"], rc)
self.assertEqual(2, sum(pci_alloc_after))
# This is the expected result after the bug is fixed
# self.assertPCIDeviceCounts('compute1', total=2, free=1)
# compute1_expected_placement_view = copy.deepcopy(compute1_empty)
# compute1_expected_placement_view["usages"] = {
# "0000:81:00.0": {rc: 1}
# }
# compute1_expected_placement_view["allocations"][server_1vf["id"]] = {
# "0000:81:00.0": {rc: 1},
# }
# self.assert_placement_pci_view(
# "compute1", **compute1_expected_placement_view)
# self.assert_no_pci_healing("compute1")
self.assert_no_pci_healing("compute1")
@@ -13,6 +13,8 @@
# License for the specific language governing permissions and limitations
# under the License.
import collections
import copy
import ddt
from unittest import mock
from oslo_config import cfg
@@ -1122,6 +1124,7 @@ class PciDeviceStatsPlacementSupportTestCase(test.NoDBTestCase):
)
@ddt.ddt
class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
def setUp(self):
super().setUp()
@@ -1608,6 +1611,115 @@ class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
},
)
@mock.patch("nova.pci.stats.LOG.warning")
def _test_split_pool_rejected(self, consume_fn, mock_warning):
self.pci_stats = stats.PciDeviceStats(
objects.NUMATopology(), dev_filter=self.dev_filter
)
for vf_index in [1, 2]:
dev = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address=f"0000:81:01.{vf_index}",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
self.pci_stats.add_device(dev)
dev.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.populate_pools_metadata_from_assigned_devices()
# NOTE(gibi): this simulates an invalid situation where VFs from the
# same PF are split across multiple pools. This should not happen,
# but we want to check that the code rejects it
self.pci_stats.pools.extend(copy.deepcopy(self.pci_stats.pools))
vf_req = objects.InstancePCIRequest(
count=1,
alias_name='a-vf',
request_id=uuids.vf_req,
spec=[
{
"vendor_id": "dead",
"product_id": "beef",
"dev_type": "type-VF",
# Simulate that the scheduler already allocated a candidate
# and the mapping is stored in the request.
"rp_uuids": ",".join([uuids.pf1])
}
],
)
self.assertRaises(
exception.PciDeviceRequestFailed,
consume_fn,
[vf_req],
)
mock_warning.assert_called_once_with(
'The PCI allocation logic assumes that devices related to the '
'same rp_uuid are in the same pool. However the following '
'rp_uuids are split across multiple pools. This should not '
'happen. Please file a bug report. %s',
{uuids.pf1: self.pci_stats.pools})
def test_consume_split_pool_rejected(self):
self._test_split_pool_rejected(
lambda reqs: self.pci_stats.consume_requests(reqs))
def test_apply_split_pool_rejected(self):
self._test_split_pool_rejected(
lambda reqs: self.pci_stats.apply_requests(reqs, {}))
@ddt.data(
# no pool no problem
[[], True],
# single pool without rp_uuid is OK
[[{}], True],
# single pool with rp_uuid is OK
[[{}, {"rp_uuid": uuids.pf1}], True],
# two pools with different rp_uuid is OK
[[{"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2}], True],
# two pools with the same rp_uuid is NOT OK
[[{}, {"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2},
{"rp_uuid": uuids.pf1}], False]
)
@ddt.unpack
def test_assert_one_pool_per_rp_uuid(self, pools, result):
self.assertEqual(
result, stats.PciDeviceStats._assert_one_pool_per_rp_uuid(pools))
def test_find_pool_ignores_metadata(self):
self.pci_stats = stats.PciDeviceStats(
objects.NUMATopology(), dev_filter=self.dev_filter
)
dev1 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:01.1",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
dev1.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.add_device(dev1)
self.pci_stats.populate_pools_metadata_from_assigned_devices()
dev2 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:01.2",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
dev2.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.add_device(dev2)
self.assertEqual(1, len(self.pci_stats.pools))
class PciDeviceVFPFStatsTestCase(test.NoDBTestCase):
@@ -0,0 +1,12 @@
---
fixes:
- |
Fixed `bug 2098496 <https://bugs.launchpad.net/nova/+bug/2098496>`__ where
nova assigned more PCI hostdevs to a VM than the flavor requested via the
``pci_passthrough:alias`` extra spec. This only affected systems where both
``[filter_scheduler]pci_in_placement`` and ``[pci]report_in_placement``
were set to True, the PCI alias requested type-VF devices, a single PF
device on the compute node supported more than one VF, and the
``[pci]device_spec`` configuration allowed nova to use multiple VFs from a
single PF.