From c5efabbd07bc081e6d3b7ceb4ca138707edfae9b Mon Sep 17 00:00:00 2001 From: Dan Smith Date: Wed, 12 Mar 2025 11:43:24 -0700 Subject: [PATCH] Invalidate PCI-in-placement cached RPs during claim This makes us invalidate our cache of the PCI-in-placement resource providers when we go to do instance_claim(). This is not technically required right now, but is setup for the next patch where we will update that inventory during claim and we need to make sure we are working with the latest version. Without this, we may consider a cached version of the inventory to be the same as the proposed one, and thus not actually update placement when we need to. Since PCI-in- placement was designed to tolerate external changes to the inventory (especially/explicitly changing the reserved count), we need to be careful not to allow our cache to prevent us from taking the action we intend. Related to blueprint one-time-use-devices Change-Id: I89039328af7a2d2e6a4128dd08dbe8e97ecb16cd --- nova/compute/resource_tracker.py | 30 ++++++++++++ .../unit/compute/test_resource_tracker.py | 47 ++++++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py index 8b845ef5f6..d832302ed9 100644 --- a/nova/compute/resource_tracker.py +++ b/nova/compute/resource_tracker.py @@ -138,6 +138,26 @@ class ResourceTracker(object): 'not match host') self.service_ref = service_ref + def _invalidate_pci_in_placement_cached_rps(self, allocs): + """Invalidate cache for PCI-in-placement providers. + + This invalidates the local cached copy of any provider for which an + allocation of a PCI-in-placement device exists. We do this in case the + reserved count has been modified externally to make sure we see it. + """ + if not allocs: + return + for rp, rp_allocs in allocs.items(): + try: + p_data = self.provider_tree.data(rp) + except ValueError: + # Not all allocations for an instance are necessarily against + # a provider in our tree + continue + if os_traits.COMPUTE_MANAGED_PCI_DEVICE in p_data.traits: + self.reportclient.invalidate_resource_provider( + rp, cacheonly=True) + @utils.synchronized(COMPUTE_RESOURCE_SEMAPHORE, fair=True) def instance_claim(self, context, instance, nodename, allocations, limits=None): @@ -207,6 +227,11 @@ class ResourceTracker(object): claimed_resources = self._claim_resources(allocations) instance.resources = claimed_resources + # In case we have any allocations for PCI-in-placement devices, be + # sure to invalidate our cache of those providers before we run + # _update() below (which does the PCI-in-placement sync). + self._invalidate_pci_in_placement_cached_rps(allocations) + # Mark resources in-use and update stats self._update_usage_from_instance(context, instance, nodename) @@ -370,6 +395,11 @@ class ResourceTracker(object): instance.migration_context = mig_context instance.save() + # In case we have any allocations for PCI-in-placement devices, be + # sure to invalidate our cache of those providers before we run + # _update() below (which does the PCI-in-placement sync). + self._invalidate_pci_in_placement_cached_rps(allocations) + # Mark the resources in-use for the resize landing on this # compute host: self._update_usage_from_migration(context, instance, migration, diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py index 3af6eb2be8..322a84ba04 100644 --- a/nova/tests/unit/compute/test_resource_tracker.py +++ b/nova/tests/unit/compute/test_resource_tracker.py @@ -3424,9 +3424,12 @@ class TestLiveMigration(BaseTestCase): mock.patch.object(objects.Instance, 'save'), mock.patch.object(self.rt, '_update'), mock.patch.object(self.rt.pci_tracker, 'claim_instance'), - mock.patch.object(self.rt, '_update_usage_from_migration') + mock.patch.object(self.rt, '_update_usage_from_migration'), + mock.patch.object(self.rt, + '_invalidate_pci_in_placement_cached_rps') ) as (mock_from_instance, mock_migration_save, mock_instance_save, - mock_update, mock_pci_claim_instance, mock_update_usage): + mock_update, mock_pci_claim_instance, mock_update_usage, + mock_invalidate): claim = self.rt.live_migration_claim(ctxt, instance, _NODENAME, migration, limits=None, allocs=None) @@ -3441,6 +3444,7 @@ class TestLiveMigration(BaseTestCase): mock_pci_claim_instance.assert_not_called() mock_update_usage.assert_called_with(ctxt, instance, migration, _NODENAME) + mock_invalidate.assert_called() class TestUpdateUsageFromMigration(test.NoDBTestCase): @@ -4350,6 +4354,45 @@ class ResourceTrackerTestCase(test.NoDBTestCase): self.assertRaises(AssertionError, _test_explict_unfair) self.assertRaises(AssertionError, _test_implicit_unfair) + def test_invalidate_pci_in_placement(self): + client = mock.MagicMock() + rt = resource_tracker.ResourceTracker( + _HOSTNAME, mock.sentinel.driver, client) + # NOTE(danms): We want to cover the case where we have allocations + # against a compute node, a PCI-in-placement child, and another + # provider outside that tree. Further, we use a slightly-unrealistic + # PCI-in-placement allocation with multiple RCs just to make sure + # that we call invalidate once per *provider* and not once per + # RC. + fake_allocs = { + str(uuids.compute): { + 'resources': { + 'VCPU': 1, + 'MEMORY_MB': 512, + }, + }, + str(uuids.othercompute): { + 'resources': { + 'VCPU': 1, + } + }, + str(uuids.pci): { + 'resources': { + 'CUSTOM_PCI_DEAD_BEEF': 1, + 'SOME_OTHER_TRAIT': 1, + } + } + } + with mock.patch.object(rt, 'provider_tree') as mock_pt: + mock_pt.data.side_effect = [ + mock.MagicMock(traits=[]), + ValueError, + mock.MagicMock(traits=['COMPUTE_MANAGED_PCI_DEVICE']), + ] + rt._invalidate_pci_in_placement_cached_rps(fake_allocs) + client.invalidate_resource_provider.assert_called_once_with( + uuids.pci, cacheonly=True) + def test_set_service_ref(self): rt = resource_tracker.ResourceTracker( _HOSTNAME, mock.sentinel.driver, mock.sentinel.reportclient)