Ignore metadata tags in pci/stats _find_pool logic

The stats module uses the _find_pool() call to find a matching pool for
a new device or a device that is being deallocated. If no existing pool
matches the dev then a new pool is created for it. The pool matching
logic was faulty as it did not remove all the metadata keys, such as
rp_uuid, from the pool before comparing. So if the dev did not have
that key but the pool did, the dev did not match.

On the other hand, the PCI allocation logic (when PCI in Placement is
enabled) assumed that devices belonging to a single rp_uuid are always
in a single pool. As this assumption was broken by the above bug, the
PCI allocation blindly tried to allocate resources for an rp_uuid from
each matching pool, causing overallocation.
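
A minimal, self-contained illustration of that overallocation (the
names are simplified stand-ins for the real consume_requests() loop):

    import collections

    # the split-pool state produced by the _find_pool() bug
    pools = [{"rp_uuid": "pf1", "count": 1}, {"rp_uuid": "pf1", "count": 1}]
    requested_per_rp = collections.Counter(["pf1"])  # one VF requested

    consumed = 0
    for pool in pools:
        count = requested_per_rp[pool["rp_uuid"]]  # 1 for *every* pool
        pool["count"] -= count
        consumed += count
    assert consumed == 2  # two devices consumed for a one-device request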

The main fix in this patch is to ignore the metadata tags in
_find_pool(). In addition, two safety nets are added to the allocation
logic. The logic now asserts that the assumption holds and, if it does
not (i.e. it found multiple pools with the same rp_uuid), it bails out.
It also never blindly allocates the same rp_uuid request from multiple
pools.
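
A sketch of the two safety nets, under the same simplified pool
representation as above (RuntimeError stands in for the real
PciDeviceRequestFailed exception):

    import collections

    def allocate(pools, rp_uuids):
        per_rp = collections.defaultdict(list)
        for pool in pools:
            if "rp_uuid" in pool:
                per_rp[pool["rp_uuid"]].append(pool)
        # safety net 1: bail out if an rp_uuid is split across pools
        if any(len(ps) > 1 for ps in per_rp.values()):
            raise RuntimeError("rp_uuid split across pools")

        requested = collections.Counter(rp_uuids)
        consumed = 0
        for pool in pools:
            # safety net 2: pop the rp_uuid once consumed so it can never
            # be allocated again from another pool
            count = requested.pop(pool.get("rp_uuid"), 0)
            pool["count"] -= count
            consumed += count
        return consumed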

Closes-Bug: #2098496
Change-Id: I9678230397fa1a3c735ee01ed756d5af3b4e1191
Balazs Gibizer
2025-03-13 16:24:27 +01:00
parent 32afd0c644
commit 229fb3513a
4 changed files with 180 additions and 55 deletions
@@ -104,6 +104,9 @@ class PciDeviceStats(object):
pool_keys = pool.copy()
del pool_keys['count']
del pool_keys['devices']
for tag in self.ignored_pool_tags:
pool_keys.pop(tag, None)
if (len(pool_keys.keys()) == len(dev_pool.keys()) and
self._equal_properties(dev_pool, pool_keys, list(dev_pool))):
return pool
@@ -309,12 +312,20 @@ class PciDeviceStats(object):
else:
# but if there is placement allocation then we have to follow
# it
if not self._assert_one_pool_per_rp_uuid(pools):
raise exception.PciDeviceRequestFailed(
requests=pci_requests)
requested_devs_per_pool_rp = collections.Counter(rp_uuids)
for pool in pools:
count = requested_devs_per_pool_rp[pool['rp_uuid']]
pool['count'] -= count
alloc_devices += self._allocate_devs(
pool, count, request.request_id)
# we consumed all the requested devices for the rp_uuid
# so we can drop that rp_uuid from the request.
requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
return alloc_devices
@@ -865,10 +876,18 @@ class PciDeviceStats(object):
break
else:
# but if there is placement allocation then we have to follow that
if not self._assert_one_pool_per_rp_uuid(pools):
return False
requested_devs_per_pool_rp = collections.Counter(rp_uuids)
for pool in filtered_pools:
count = requested_devs_per_pool_rp[pool['rp_uuid']]
pool['count'] -= count
# we consumed all the requested devices for the rp_uuid
# so we can drop that rp_uuid from the request.
requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
if pool['count'] == 0:
pools.remove(pool)
@@ -1024,3 +1043,31 @@ class PciDeviceStats(object):
if pool_rps: # now we know that it is a single RP
pool['rp_uuid'] = next(iter(pool_rps))
@staticmethod
def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool:
"""Asserts that each pool has a unique rp_uuid if any
:param pools: A list of Pool objects.
:return: True if each pool has a unique rp_uuid or no rp_uuid assigned,
False otherwise.
"""
pools_per_rp_uuid = collections.defaultdict(list)
for pool in pools:
if "rp_uuid" in pool:
pools_per_rp_uuid[pool["rp_uuid"]].append(pool)
split_rp_uuids = {
rp_uuid: pools
for rp_uuid, pools in pools_per_rp_uuid.items()
if len(pools) > 1}
if split_rp_uuids:
LOG.warning(
"The PCI allocation logic assumes that devices "
"related to the same rp_uuid are in the same pool. "
"However the following rp_uuids are split across multiple "
"pools. This should not happen. Please file a bug report. %s",
split_rp_uuids
)
return not split_rp_uuids
@@ -113,69 +113,23 @@ class PCIInPlacementCreateAfterDeleteTestCase(base.PlacementPCIReportingTests):
self.assertPCIDeviceCounts('compute1', total=2, free=2)
self.assert_placement_pci_view("compute1", **compute1_empty)
# This already shows the potential issue behind bug/2098496.
# After the server is deleted we are not back to a single pool of 2
# VFs but instead two separate pools with one device each but both
# related to the same rp_uuid. This breaks an assumption of the pool
# allocation logic later.
pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
self.ctxt, "compute1").pci_device_pools
self.assertEqual(len(pools), 2)
self.assertEqual(pools[0].count, 1)
self.assertEqual(pools[1].count, 1)
self.assertEqual(pools[0].tags['rp_uuid'], pools[1].tags['rp_uuid'])
# This is the expected state after the bug is fixed
# assert that the single pool is not broken into two during the
# de-allocation
# pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
# self.ctxt, "compute1").pci_device_pools
# self.assertEqual(len(pools), 1)
# self.assertEqual(pools[0].count, 2)
pools = objects.ComputeNode.get_first_node_by_host_for_old_compat(
self.ctxt, "compute1").pci_device_pools
self.assertEqual(len(pools), 1)
self.assertEqual(pools[0].count, 2)
server_1vf = self._create_server(flavor_id=flavor_id, networks=[])
# This is bug/2098496 as the VM now consumes 2 VFs instead of one it
# requested
self.assertPCIDeviceCounts('compute1', total=2, free=0)
self.assertPCIDeviceCounts('compute1', total=2, free=1)
compute1_expected_placement_view = copy.deepcopy(compute1_empty)
compute1_expected_placement_view["usages"] = {
"0000:81:00.0": {rc: 2}
"0000:81:00.0": {rc: 1}
}
compute1_expected_placement_view["allocations"][server_1vf["id"]] = {
"0000:81:00.0": {rc: 2},
"0000:81:00.0": {rc: 1},
}
self.assert_placement_pci_view(
"compute1", **compute1_expected_placement_view)
# prove that the placement view got corrupted only after the claim
# on the compute due to the allocation healing logic
last_healing = self.pci_healing_fixture.last_healing("compute1")
alloc_before = last_healing[0]
alloc_after = last_healing[1]
def alloc_by_rc(allocations, rc):
return [
alloc["resources"][rc]
for alloc in allocations.values()
if rc in alloc["resources"]
]
pci_alloc_before = alloc_by_rc(
alloc_before[server_1vf["id"]]["allocations"], rc)
self.assertEqual(1, sum(pci_alloc_before))
pci_alloc_after = alloc_by_rc(
alloc_after[server_1vf["id"]]["allocations"], rc)
self.assertEqual(2, sum(pci_alloc_after))
# This is the expected result after the bug is fixed
# self.assertPCIDeviceCounts('compute1', total=2, free=1)
# compute1_expected_placement_view = copy.deepcopy(compute1_empty)
# compute1_expected_placement_view["usages"] = {
# "0000:81:00.0": {rc: 1}
# }
# compute1_expected_placement_view["allocations"][server_1vf["id"]] = {
# "0000:81:00.0": {rc: 1},
# }
# self.assert_placement_pci_view(
# "compute1", **compute1_expected_placement_view)
# self.assert_no_pci_healing("compute1")
self.assert_no_pci_healing("compute1")
@@ -13,6 +13,8 @@
# License for the specific language governing permissions and limitations
# under the License.
import collections
import copy
import ddt
from unittest import mock
from oslo_config import cfg
@@ -1122,6 +1124,7 @@ class PciDeviceStatsPlacementSupportTestCase(test.NoDBTestCase):
)
@ddt.ddt
class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
def setUp(self):
super().setUp()
@@ -1608,6 +1611,115 @@ class PciDeviceStatsProviderMappingTestCase(test.NoDBTestCase):
},
)
@mock.patch("nova.pci.stats.LOG.warning")
def _test_split_pool_rejected(self, consume_fn, mock_warning):
self.pci_stats = stats.PciDeviceStats(
objects.NUMATopology(), dev_filter=self.dev_filter
)
for vf_index in [1, 2]:
dev = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address=f"0000:81:01.{vf_index}",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
self.pci_stats.add_device(dev)
dev.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.populate_pools_metadata_from_assigned_devices()
# NOTE(gibi): this simulates an invalid situation where VFs from the
# same PF are split across multiple pools. This should not happen,
# but we want to check that the code rejects it
self.pci_stats.pools.extend(copy.deepcopy(self.pci_stats.pools))
vf_req = objects.InstancePCIRequest(
count=1,
alias_name='a-vf',
request_id=uuids.vf_req,
spec=[
{
"vendor_id": "dead",
"product_id": "beef",
"dev_type": "type-VF",
# Simulate that the scheduler already allocated a candidate
# and the mapping is stored in the request.
"rp_uuids": ",".join([uuids.pf1])
}
],
)
self.assertRaises(
exception.PciDeviceRequestFailed,
consume_fn,
[vf_req],
)
mock_warning.assert_called_once_with(
'The PCI allocation logic assumes that devices related to the '
'same rp_uuid are in the same pool. However the following '
'rp_uuids are split across multiple pools. This should not '
'happen. Please file a bug report. %s',
{uuids.pf1: self.pci_stats.pools})
def test_consume_split_pool_rejected(self):
self._test_split_pool_rejected(
lambda reqs: self.pci_stats.consume_requests(reqs))
def test_apply_split_pool_rejected(self):
self._test_split_pool_rejected(
lambda reqs: self.pci_stats.apply_requests(reqs, {}))
@ddt.data(
# no pool no problem
[[], True],
# single pool without rp_uuid is OK
[[{}], True],
# single pool with rp_uuid is OK
[[{}, {"rp_uuid": uuids.pf1}], True],
# two pools with different rp_uuid is OK
[[{"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2}], True],
# two pools with the same rp_uuid is NOT OK
[[{}, {"rp_uuid": uuids.pf1}, {"rp_uuid": uuids.pf2},
{"rp_uuid": uuids.pf1}], False]
)
@ddt.unpack
def test_assert_one_pool_per_rp_uuid(self, pools, result):
self.assertEqual(
result, stats.PciDeviceStats._assert_one_pool_per_rp_uuid(pools))
def test_find_pool_ignores_metadata(self):
self.pci_stats = stats.PciDeviceStats(
objects.NUMATopology(), dev_filter=self.dev_filter
)
dev1 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:01.1",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
dev1.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.add_device(dev1)
self.pci_stats.populate_pools_metadata_from_assigned_devices()
dev2 = objects.PciDevice(
compute_node_id=1,
vendor_id="dead",
product_id="beef",
address="0000:81:01.2",
parent_addr="0000:81:01.0",
numa_node=0,
dev_type="type-VF",
)
dev2.extra_info = {'rp_uuid': uuids.pf1}
self.pci_stats.add_device(dev2)
self.assertEqual(1, len(self.pci_stats.pools))
class PciDeviceVFPFStatsTestCase(test.NoDBTestCase):
@@ -0,0 +1,12 @@
---
fixes:
- |
Fixed `bug 2098496 <https://bugs.launchpad.net/nova/+bug/2098496>`__ where
nova assigned more PCI hostdevs to a VM than the flavor requested via the
``pci_passthrough:alias`` extra spec. This only affected systems where both
``[filter_scheduler]pci_in_placement`` and ``[pci]report_in_placement``
were set to True, the PCI alias requested type-VF devices, a single PF
device on the compute node supported more than one VF, and the
``[pci]device_spec`` configuration allowed nova to use multiple VFs from a
single PF.