diff --git a/doc/source/admin/troubleshooting/orphaned-allocations.rst b/doc/source/admin/troubleshooting/orphaned-allocations.rst index 5fefa10d3b..ca49aa4aab 100644 --- a/doc/source/admin/troubleshooting/orphaned-allocations.rst +++ b/doc/source/admin/troubleshooting/orphaned-allocations.rst @@ -180,8 +180,13 @@ things: * `Migration-based allocations`_ would be lost if manually deleted during a resize. These are allocations tracked by the migration resource record on the source compute service during a migration. -* Healing allocations does not supported nested resource allocations before the - 20.0.0 (Train) release. +* Healing allocations only partially supports nested allocations. Nested + allocations due to Neutron ports having QoS policies are supported since + the 20.0.0 (Train) release. But nested allocations due to vGPU or Cyborg + device profile requests in the flavor are not supported. Also, if you use + provider.yaml files on compute hosts to define additional resources and + those resources are defined on child resource providers, then instances + using such resources are not supported. If you do use the ``heal_allocations`` command to cleanup allocations for a specific trouble instance, it is recommended to take note of what the diff --git a/doc/source/cli/nova-manage.rst b/doc/source/cli/nova-manage.rst index a30c649b9c..7d1d0d2311 100644 --- a/doc/source/cli/nova-manage.rst +++ b/doc/source/cli/nova-manage.rst @@ -1144,6 +1144,15 @@ state transition. For each instance found, allocations are created against the compute node resource provider for that instance based on the flavor associated with the instance. +.. note:: + Nested allocations are only partially supported. Nested allocations due to + Neutron ports having QoS policies are supported since the 20.0.0 (Train) + release. But nested allocations due to vGPU or Cyborg device profile + requests in the flavor are not supported. 
Also, if you use + provider.yaml files on compute hosts to define additional resources and + those resources are defined on child resource providers, then instances + using such resources are not supported. + Also if the instance has any port attached that has resource request (e.g. :neutron-doc:`Quality of Service (QoS): Guaranteed Bandwidth `) but the corresponding diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py index 070bdf558c..b96fa63910 100644 --- a/nova/cmd/manage.py +++ b/nova/cmd/manage.py @@ -1715,6 +1715,18 @@ class PlacementCommands(object): allocations['user_id'] = instance.user_id return allocations + @staticmethod + def ensure_instance_has_no_vgpu_request(instance): + if instance.flavor.extra_specs.get("resources:VGPU"): + raise exception.HealvGPUAllocationNotSupported( + instance_uuid=instance.uuid) + + @staticmethod + def ensure_instance_has_no_cyborg_device_profile_request(instance): + if instance.flavor.extra_specs.get("accel:device_profile"): + raise exception.HealDeviceProfileAllocationNotSupported( + instance_uuid=instance.uuid) + def _heal_allocations_for_instance(self, ctxt, instance, node_cache, output, placement, dry_run, heal_port_allocations, neutron, @@ -1771,6 +1783,9 @@ class PlacementCommands(object): output(_('Instance %s is not on a host.') % instance.uuid) return + self.ensure_instance_has_no_vgpu_request(instance) + self.ensure_instance_has_no_cyborg_device_profile_request(instance) + try: allocations = placement.get_allocs_for_consumer( ctxt, instance.uuid) @@ -1887,7 +1902,7 @@ class PlacementCommands(object): :param max_count: batch size (limit per instance query) :param unlimited: True if all instances in the cell should be processed, else False to just process $max_count instances - :param outout: function that takes a single message for verbose output + :param output: function that takes a single message for verbose output :param placement: nova.scheduler.client.report.SchedulerReportClient to communicate with the 
Placement service API. :param dry_run: Process instances and print output but do not commit @@ -2027,6 +2042,7 @@ class PlacementCommands(object): * 5: Unable to query ports from neutron * 6: Unable to update ports in neutron * 7: Cannot roll back neutron port updates. Manual steps needed. + * 8: Cannot heal instance with vGPU or Cyborg resource request * 127: Invalid input. """ # NOTE(mriedem): Thoughts on ways to expand this: @@ -2155,6 +2171,12 @@ class PlacementCommands(object): except exception.UnableToRollbackPortUpdates as e: print(e.format_message()) return 7 + except ( + exception.HealvGPUAllocationNotSupported, + exception.HealDeviceProfileAllocationNotSupported, + ) as e: + print(e.format_message()) + return 8 # Make sure we don't go over the max count. Note that we # don't include instances that already have allocations in the diff --git a/nova/exception.py b/nova/exception.py index f5e393e5a6..734e2b7f38 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -2235,6 +2235,24 @@ class MissingDomainCapabilityFeatureException(NovaException): "including <%(feature)s> feature.") +class HealAllocationException(NovaException): + msg_fmt = _("Healing instance allocation failed.") + + +class HealvGPUAllocationNotSupported(HealAllocationException): + msg_fmt = _( + "Healing allocation for instance %(instance_uuid)s with vGPU resource " + "request is not supported." + ) + + +class HealDeviceProfileAllocationNotSupported(HealAllocationException): + msg_fmt = _( + "Healing allocation for instance %(instance_uuid)s with Cyborg device " + "profile request is not supported." 
+ ) + + class HealPortAllocationException(NovaException): msg_fmt = _("Healing port allocation failed.") diff --git a/nova/tests/functional/test_nova_manage.py b/nova/tests/functional/test_nova_manage.py index 6e7e192364..2535b32914 100644 --- a/nova/tests/functional/test_nova_manage.py +++ b/nova/tests/functional/test_nova_manage.py @@ -780,6 +780,50 @@ class TestNovaManagePlacementHealAllocations( ) self.assertEqual(4, result, self.output.getvalue()) + def test_instance_with_vgpu_is_blocked(self): + # we cannot boot with VGPU in these tests so manipulate the + # instance.flavor directly after the boot to simulate an instance with + # VGPU request + server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1') + instance = objects.Instance.get_by_uuid( + context.get_admin_context(), server['id']) + instance.flavor.extra_specs["resources:VGPU"] = 1 + instance.save() + + result = self.cli.heal_allocations( + verbose=True, instance_uuid=server['id'], + force=True + ) + + self.assertIn( + f"Healing allocation for instance {server['id']} with vGPU " + f"resource request is not supported.", + self.output.getvalue() + ) + self.assertEqual(8, result, self.output.getvalue()) + + def test_instance_with_cyborg_dev_profile_is_blocked(self): + # we cannot boot with cyborg device in these tests so manipulate the + # instance.flavor directly after the boot to simulate an instance with + # cyborg request + server, _ = self._boot_and_remove_allocations(self.flavor, 'cell1') + instance = objects.Instance.get_by_uuid( + context.get_admin_context(), server['id']) + instance.flavor.extra_specs["accel:device_profile"] = "foo" + instance.save() + + result = self.cli.heal_allocations( + verbose=True, instance_uuid=server['id'], + force=True + ) + + self.assertIn( + f"Healing allocation for instance {server['id']} with Cyborg " + f"device profile request is not supported.", + self.output.getvalue() + ) + self.assertEqual(8, result, self.output.getvalue()) + class 
TestNovaManagePlacementHealPortAllocations( test_servers.PortResourceRequestBasedSchedulingTestBase): diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py index 5b41ce7930..cb4b4e13b4 100644 --- a/nova/tests/unit/cmd/test_manage.py +++ b/nova/tests/unit/cmd/test_manage.py @@ -2399,7 +2399,8 @@ class TestNovaManagePlacement(test.NoDBTestCase): return_value=objects.InstanceList(objects=[ objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', - task_state=None)])) + task_state=None, + flavor=objects.Flavor(extra_specs={}))])) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'get_allocs_for_consumer', return_value={}) @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename', @@ -2419,7 +2420,7 @@ class TestNovaManagePlacement(test.NoDBTestCase): return_value=objects.InstanceList(objects=[ objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', - task_state=None, flavor=objects.Flavor(), + task_state=None, flavor=objects.Flavor(extra_specs={}), project_id='fake-project', user_id='fake-user')])) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 
'get_allocs_for_consumer', return_value={}) @@ -2463,7 +2464,7 @@ class TestNovaManagePlacement(test.NoDBTestCase): new=mock.Mock(return_value=objects.InstanceList(objects=[ objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', - task_state=None, flavor=objects.Flavor(), + task_state=None, flavor=objects.Flavor(extra_specs={}), project_id='fake-project', user_id='fake-user')]))) def test_heal_allocations_get_allocs_placement_fails(self): self.assertEqual(3, self.cli.heal_allocations()) @@ -2480,7 +2481,7 @@ class TestNovaManagePlacement(test.NoDBTestCase): side_effect=[ objects.InstanceList(objects=[objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', - task_state=None, flavor=objects.Flavor(), + task_state=None, flavor=objects.Flavor(extra_specs={}), project_id='fake-project', user_id='fake-user')]), objects.InstanceList(objects=[])]) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' @@ -2505,7 +2506,8 @@ class TestNovaManagePlacement(test.NoDBTestCase): objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', task_state=None, - project_id='fake-project', user_id='fake-user')]), + project_id='fake-project', user_id='fake-user', + flavor=objects.Flavor(extra_specs={}))]), objects.InstanceList())) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'get_allocs_for_consumer') @@ -2559,7 +2561,8 @@ class TestNovaManagePlacement(test.NoDBTestCase): objects.Instance( uuid=uuidsentinel.instance, host='fake', node='fake', task_state=None, project_id='fake-project', - user_id='fake-user')])) + user_id='fake-user', + flavor=objects.Flavor(extra_specs={}))])) @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' 'get_allocs_for_consumer') @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put',