From 229fb3513ae39fa120167107afe42568ca85ace9 Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Thu, 13 Mar 2025 16:24:27 +0100 Subject: [PATCH] Ignore metadata tags in pci/stats _find_pool logic The stats module uses the _find_pool() call to find a matching pool for a new device or a device that is being deallocated. If no existing pool matches with the dev then a new pool is created for it. The pool matching logic was faulty as it did not remove all the metadata keys from the pool like rp_uuid. So if the dev did not have that key but the pool did then the dev did not match. On the other hand the PCI allocation logic (when PCI in Placement is enabled) assumed that devices from a single rp_uuid are always in a single pool. As this assumption was broken by the above bug the PCI allocation blindly tried to allocate resources for an rp_uuid from each matching pool causing overallocation. The main fix in this patch is to ignore the metadata tags in _find_pool(). But also two safety nets are added to the allocation logic. The logic now asserts that the assumption is correct and if not (i.e. it found multiple pools with the same rp_uuid) then it bails out. It also does not ever blindly allocate the same rp_uuid request from multiple pools. 
Closes-Bug: #2098496 Change-Id: I9678230397fa1a3c735ee01ed756d5af3b4e1191 --- nova/pci/stats.py | 47 ++++++++ .../regressions/test_bug_2098496.py | 64 ++-------- nova/tests/unit/pci/test_stats.py | 112 ++++++++++++++++++ ...tdevs-than-requested-0139018213f1be96.yaml | 12 ++ 4 files changed, 180 insertions(+), 55 deletions(-) create mode 100644 releasenotes/notes/bug-2098496-move-pci-hostdevs-than-requested-0139018213f1be96.yaml diff --git a/nova/pci/stats.py b/nova/pci/stats.py index 42c58e7b0b..7d6ac92279 100644 --- a/nova/pci/stats.py +++ b/nova/pci/stats.py @@ -104,6 +104,9 @@ class PciDeviceStats(object): pool_keys = pool.copy() del pool_keys['count'] del pool_keys['devices'] + for tag in self.ignored_pool_tags: + pool_keys.pop(tag, None) + if (len(pool_keys.keys()) == len(dev_pool.keys()) and self._equal_properties(dev_pool, pool_keys, list(dev_pool))): return pool @@ -309,12 +312,20 @@ class PciDeviceStats(object): else: # but if there is placement allocation then we have to follow # it + + if not self._assert_one_pool_per_rp_uuid(pools): + raise exception.PciDeviceRequestFailed( + requests=pci_requests) + requested_devs_per_pool_rp = collections.Counter(rp_uuids) for pool in pools: count = requested_devs_per_pool_rp[pool['rp_uuid']] pool['count'] -= count alloc_devices += self._allocate_devs( pool, count, request.request_id) + # we consumed all the requested devices for the rp_uuid + # so we can drop that rp_uuid from the request. 
+ requested_devs_per_pool_rp.pop(pool['rp_uuid'], None) return alloc_devices @@ -865,10 +876,18 @@ class PciDeviceStats(object): break else: # but if there is placement allocation then we have to follow that + + if not self._assert_one_pool_per_rp_uuid(pools): + return False + requested_devs_per_pool_rp = collections.Counter(rp_uuids) for pool in filtered_pools: count = requested_devs_per_pool_rp[pool['rp_uuid']] pool['count'] -= count + # we consumed all the requested devices for the rp_uuid + # so we can drop that rp_uuid from the request. + requested_devs_per_pool_rp.pop(pool['rp_uuid'], None) + if pool['count'] == 0: pools.remove(pool) @@ -1024,3 +1043,31 @@ class PciDeviceStats(object): if pool_rps: # now we know that it is a single RP pool['rp_uuid'] = next(iter(pool_rps)) + + @staticmethod + def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool: + """Asserts that each pool has a unique rp_uuid if any + + :param pools: A list of Pool objects. + :return: True if each pool has a unique rp_uuid or no rp_uuid assigned, + False otherwise. + """ + pools_per_rp_uuid = collections.defaultdict(list) + for pool in pools: + if "rp_uuid" in pool: + pools_per_rp_uuid[pool["rp_uuid"]].append(pool) + + split_rp_uuids = { + rp_uuid: pools + for rp_uuid, pools in pools_per_rp_uuid.items() + if len(pools) > 1} + + if split_rp_uuids: + LOG.warning( + "The PCI allocation logic assumes that devices " + "related to the same rp_uuid are in the same pool. " + "However the following rp_uuids are split across multiple " + "pools. This should not happen. Please file a bug report. 
%s", + split_rp_uuids + ) + return not split_rp_uuids diff --git a/nova/tests/functional/regressions/test_bug_2098496.py b/nova/tests/functional/regressions/test_bug_2098496.py index 731d9f441a..44f59c79bf 100644 --- a/nova/tests/functional/regressions/test_bug_2098496.py +++ b/nova/tests/functional/regressions/test_bug_2098496.py @@ -113,69 +113,23 @@ class PCIInPlacementCreateAfterDeleteTestCase(base.PlacementPCIReportingTests): self.assertPCIDeviceCounts('compute1', total=2, free=2) self.assert_placement_pci_view("compute1", **compute1_empty) - # This is already shows the potential issue behind bug/2098496. - # After the server is deleted we are not back to a single pool of 2 - # VFs but instead two separate pools with one device each but both - # related to the same rp_uuid. This breaks an assumption of the pool - # allocation logic later. - pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( - self.ctxt, "compute1").pci_device_pools - self.assertEqual(len(pools), 2) - self.assertEqual(pools[0].count, 1) - self.assertEqual(pools[1].count, 1) - self.assertEqual(pools[0].tags['rp_uuid'], pools[1].tags['rp_uuid']) - - # This is the expected state after the bug is fixed # assert that the single pool is not broken into two during the # de-allocation - # pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( - # self.ctxt, "compute1").pci_device_pools - # self.assertEqual(len(pools), 1) - # self.assertEqual(pools[0].count, 2) + pools = objects.ComputeNode.get_first_node_by_host_for_old_compat( + self.ctxt, "compute1").pci_device_pools + self.assertEqual(len(pools), 1) + self.assertEqual(pools[0].count, 2) server_1vf = self._create_server(flavor_id=flavor_id, networks=[]) - # This is bug/2098496 as the VM now consumes 2 VFs instead of one it - # requested - self.assertPCIDeviceCounts('compute1', total=2, free=0) + + self.assertPCIDeviceCounts('compute1', total=2, free=1) compute1_expected_placement_view = copy.deepcopy(compute1_empty) 
compute1_expected_placement_view["usages"] = { - "0000:81:00.0": {rc: 2} + "0000:81:00.0": {rc: 1} } compute1_expected_placement_view["allocations"][server_1vf["id"]] = { - "0000:81:00.0": {rc: 2}, + "0000:81:00.0": {rc: 1}, } self.assert_placement_pci_view( "compute1", **compute1_expected_placement_view) - # prove that the placement view got corrupted only after the claim - # on the compute due the allocation healing logic - last_healing = self.pci_healing_fixture.last_healing("compute1") - alloc_before = last_healing[0] - alloc_after = last_healing[1] - - def alloc_by_rc(allocations, rc): - return [ - alloc["resources"][rc] - for alloc in allocations.values() - if rc in alloc["resources"] - ] - - pci_alloc_before = alloc_by_rc( - alloc_before[server_1vf["id"]]["allocations"], rc) - self.assertEqual(1, sum(pci_alloc_before)) - - pci_alloc_after = alloc_by_rc( - alloc_after[server_1vf["id"]]["allocations"], rc) - self.assertEqual(2, sum(pci_alloc_after)) - - # This is the expected result after the bug is fixed - # self.assertPCIDeviceCounts('compute1', total=2, free=1) - # compute1_expected_placement_view = copy.deepcopy(compute1_empty) - # compute1_expected_placement_view["usages"] = { - # "0000:81:00.0": {rc: 1} - # } - # compute1_expected_placement_view["allocations"][server_1vf["id"]] = { - # "0000:81:00.0": {rc: 1}, - # } - # self.assert_placement_pci_view( - # "compute1", **compute1_expected_placement_view) - # self.assert_no_pci_healing("compute1") + self.assert_no_pci_healing("compute1") diff --git a/nova/tests/unit/pci/test_stats.py b/nova/tests/unit/pci/test_stats.py index 20b5c8f06d..53d0a556e6 100644 --- a/nova/tests/unit/pci/test_stats.py +++ b/nova/tests/unit/pci/test_stats.py @@ -13,6 +13,8 @@ # License for the specific language governing permissions and limitations # under the License. 
import collections +import copy +import ddt from unittest import mock from oslo_config import cfg @@ -1122,6 +1124,7 @@ class PciDeviceStatsPlacementSupportTestCase(test.NoDBTestCase): ) +@ddt.ddt class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase): def setUp(self): super().setUp() @@ -1608,6 +1611,115 @@ class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase): }, ) + @mock.patch("nova.pci.stats.LOG.warning") + def _test_split_pool_rejected(self, consume_fn, mock_warning): + self.pci_stats = stats.PciDeviceStats( + objects.NUMATopology(), dev_filter=self.dev_filter + ) + for vf_index in [1, 2]: + dev = objects.PciDevice( + compute_node_id=1, + vendor_id="dead", + product_id="beef", + address=f"0000:81:01.{vf_index}", + parent_addr="0000:81:01.0", + numa_node=0, + dev_type="type-VF", + ) + self.pci_stats.add_device(dev) + dev.extra_info = {'rp_uuid': uuids.pf1} + + self.pci_stats.populate_pools_metadata_from_assigned_devices() + # NOTE(gibi): this simulates an invalid situation where the same + # VFs from the same PF are split across multiple pools. This should + # not happen, but we want to check that the code rejects it + self.pci_stats.pools.extend(copy.deepcopy(self.pci_stats.pools)) + + vf_req = objects.InstancePCIRequest( + count=1, + alias_name='a-vf', + request_id=uuids.vf_req, + spec=[ + { + "vendor_id": "dead", + "product_id": "beef", + "dev_type": "type-VF", + # Simulate that the scheduler already allocate a candidate + # and the mapping is stored in the request. + "rp_uuids": ",".join([uuids.pf1]) + } + ], + ) + + self.assertRaises( + exception.PciDeviceRequestFailed, + consume_fn, + [vf_req], + ) + mock_warning.assert_called_once_with( + 'The PCI allocation logic assumes that devices related to the ' + 'same rp_uuid are in the same pool. However the following ' + 'rp_uuids are split across multiple pools. This should not ' + 'happen. Please file a bug report. 
%s', + {uuids.pf1: self.pci_stats.pools}) + + def test_consume_split_pool_rejected(self): + self._test_split_pool_rejected( + lambda reqs: self.pci_stats.consume_requests(reqs)) + + def test_apply_split_pool_rejected(self): + self._test_split_pool_rejected( + lambda reqs: self.pci_stats.apply_requests(reqs, {})) + + @ddt.data( + # no pool no problem + [[], True], + # single pool without rp_uuid is OK + [[{}], True], + # single pool with rp_uuid is OK + [[{}, {"rp_uuid": uuids.pf1}], True], + # two pools with different rp_uuid is OK + [[{"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2}], True], + # two pools with the same rp_uuid is NOT OK + [[{}, {"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2}, + {"rp_uuid": uuids.pf1}], False] + ) + @ddt.unpack + def test_assert_one_pool_per_rp_uuid(self, pools, result): + self.assertEqual( + result, stats.PciDeviceStats._assert_one_pool_per_rp_uuid(pools)) + + def test_find_pool_ignores_metadata(self): + self.pci_stats = stats.PciDeviceStats( + objects.NUMATopology(), dev_filter=self.dev_filter + ) + dev1 = objects.PciDevice( + compute_node_id=1, + vendor_id="dead", + product_id="beef", + address="0000:81:01.1", + parent_addr="0000:81:01.0", + numa_node=0, + dev_type="type-VF", + ) + dev1.extra_info = {'rp_uuid': uuids.pf1} + self.pci_stats.add_device(dev1) + self.pci_stats.populate_pools_metadata_from_assigned_devices() + + dev2 = objects.PciDevice( + compute_node_id=1, + vendor_id="dead", + product_id="beef", + address="0000:81:01.2", + parent_addr="0000:81:01.0", + numa_node=0, + dev_type="type-VF", + ) + dev2.extra_info = {'rp_uuid': uuids.pf1} + self.pci_stats.add_device(dev2) + + self.assertEqual(1, len(self.pci_stats.pools)) + class PciDeviceVFPFStatsTestCase(test.NoDBTestCase): diff --git a/releasenotes/notes/bug-2098496-move-pci-hostdevs-than-requested-0139018213f1be96.yaml b/releasenotes/notes/bug-2098496-move-pci-hostdevs-than-requested-0139018213f1be96.yaml new file mode 100644 index 0000000000..8bd77fddc6 --- /dev/null 
+++ b/releasenotes/notes/bug-2098496-move-pci-hostdevs-than-requested-0139018213f1be96.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Fixed the issue + `bug 2098496 <https://bugs.launchpad.net/nova/+bug/2098496>`__ where nova + assigned more PCI hostdevs to a VM than the flavor requested via the + pci_passthrough:alias extra_spec. This only affected systems where both + ``[filter_scheduler]pci_in_placement`` and ``[pci]report_in_placement`` + were set to True. Additionally, the issue only occurred when the PCI alias + requested type-VF devices, a single PF device on the compute node + supported more than one VF, and the ``[pci]device_spec`` configuration + allowed nova to use multiple VFs from a single PF.