diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index e525bad66d..5443023bba 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -941,6 +941,12 @@ class ComputeTaskManager(base.Base): # is not forced to be the original host request_spec.reset_forced_destinations() try: + # if this is a rebuild of instance on the same host with + # new image. + if not recreate and orig_image_ref != image_ref: + self._validate_image_traits_for_rebuild(context, + instance, + image_ref) request_spec.ensure_project_and_user_id(instance) host_lists = self._schedule_instances(context, request_spec, [instance.uuid], @@ -989,6 +995,74 @@ class ComputeTaskManager(base.Base): host=host, node=node, limits=limits, request_spec=request_spec) + def _validate_image_traits_for_rebuild(self, context, instance, image_ref): + """Validates that the traits specified in the image can be satisfied + by the providers of the current allocations for the instance during + rebuild of the instance. If the traits cannot be + satisfied, fails the action by raising a NoValidHost exception. + + :raises: NoValidHost exception in case the traits on the providers + of the allocated resources for the instance do not match + the required traits on the image. + """ + image_meta = objects.ImageMeta.from_image_ref( + context, self.image_api, image_ref) + if ('properties' not in image_meta or + 'traits_required' not in image_meta.properties or not + image_meta.properties.traits_required): + return + + image_traits = set(image_meta.properties.traits_required) + + # check any of the image traits are forbidden in flavor traits. + # if so raise an exception + extra_specs = instance.flavor.extra_specs + forbidden_flavor_traits = set() + for key, val in extra_specs.items(): + if key.startswith('trait'): + # get the actual key. 
+ prefix, parsed_key = key.split(':', 1) + if val == 'forbidden': + forbidden_flavor_traits.add(parsed_key) + + forbidden_traits = image_traits & forbidden_flavor_traits + + if forbidden_traits: + raise exception.NoValidHost( + reason=_("Image traits are part of forbidden " + "traits in flavor associated with the Image. " + "Please relaunch the instance.")) + return + + # If image traits are present, then validate against allocations. + allocations = self.report_client.get_allocations_for_consumer( + context, instance.uuid) + instance_rp_uuids = list(allocations) + + # Get provider tree for the instance. We use the uuid of the host + # on which the instance is rebuilding to get the provider tree. + compute_node = objects.ComputeNode.get_by_host_and_nodename( + context, instance.host, instance.node) + + # TODO(karimull): Call with a read-only version, when available. + instance_rp_tree = ( + self.report_client.get_provider_tree_and_ensure_root( + context, compute_node.uuid)) + + traits_in_instance_rps = set() + + for rp_uuid in instance_rp_uuids: + traits_in_instance_rps.update( + instance_rp_tree.data(rp_uuid).traits) + + missing_traits = image_traits - traits_in_instance_rps + + if missing_traits: + raise exception.NoValidHost( + reason=_("Image traits cannot be " + "satisfied by the current resource providers. 
" + "Please relaunch the instance.")) + # TODO(avolkov): move method to bdm @staticmethod def _volume_size(instance_type, bdm): diff --git a/nova/tests/functional/libvirt/test_shared_resource_provider.py b/nova/tests/functional/libvirt/test_shared_resource_provider.py index 635791edd7..bdef8609b3 100644 --- a/nova/tests/functional/libvirt/test_shared_resource_provider.py +++ b/nova/tests/functional/libvirt/test_shared_resource_provider.py @@ -15,8 +15,10 @@ import fixtures +from nova.compute import instance_actions from nova import conf from nova.tests.functional import integrated_helpers +import nova.tests.unit.image.fake from nova.tests.unit.virt.libvirt import fakelibvirt from nova.tests import uuidsentinel as uuids @@ -100,3 +102,137 @@ class SharedStorageProviderUsageTestCase( self.assertEqual({'DISK_GB': 1}, shared_rp_usages) self.assertEqual({'MEMORY_MB': 512, 'VCPU': 1}, cn_rp_usages) + + def create_shared_storage_rp(self): + # shared storage resource provider + shared_RP = self._post_resource_provider( + rp_name='shared_resource_provider1') + + # created inventory for shared storage RP + inv = {"resource_class": "DISK_GB", + "total": 78, "reserved": 0, "min_unit": 1, "max_unit": 78, + "step_size": 1, "allocation_ratio": 1.0} + self._set_inventory(shared_RP['uuid'], inv) + + # Added traits to shared storage resource provider + self._set_provider_traits(shared_RP['uuid'], + ['MISC_SHARES_VIA_AGGREGATE', + 'STORAGE_DISK_SSD']) + return shared_RP['uuid'] + + def test_rebuild_instance_with_image_traits_on_shared_rp(self): + + shared_rp_uuid = self.create_shared_storage_rp() + # add both cn_rp and shared_rp under one aggregate + self._set_aggregate(shared_rp_uuid, uuids.shr_disk_agg) + self._set_aggregate(self.host_uuid, uuids.shr_disk_agg) + + self.assertIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + # run update_available_resource periodic task after configuring shared + # resource provider to update compute node resources + 
self._run_periodics() + + # we expect that the virt driver stops reporting DISK_GB on the compute + # RP as soon as a shared RP with DISK_GB is created in the compute tree + self.assertNotIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + server_req_body = { + 'server': { + 'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6', + 'flavorRef': '1', + 'name': 'test_shared_storage_rp_configuration_with_cn_rp', + 'networks': 'none' + } + } + # create server + server = self.api.post_server(server_req_body) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + rebuild_image_ref = ( + nova.tests.unit.image.fake.AUTO_DISK_CONFIG_ENABLED_IMAGE_UUID) + + with nova.utils.temporary_mutation(self.api, microversion='2.35'): + self.api.api_put('/images/%s/metadata' % rebuild_image_ref, + {'metadata': { + 'trait:STORAGE_DISK_SSD': 'required'}}) + rebuild_req_body = { + 'rebuild': { + 'imageRef': rebuild_image_ref + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + # get shared_rp and cn_rp usages + shared_rp_usages = self._get_provider_usages(shared_rp_uuid) + cn_rp_usages = self._get_provider_usages(self.host_uuid) + # Check if DISK_GB resource is allocated from shared_RP and the + # remaining resources are allocated from host_uuid. 
+ self.assertEqual({'DISK_GB': 1}, shared_rp_usages) + self.assertEqual({'MEMORY_MB': 512, 'VCPU': 1}, + cn_rp_usages) + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(self.host_uuid, allocs) + server = self.api.get_server(server['id']) + self.assertEqual(rebuild_image_ref, server['image']['id']) + + def test_rebuild_instance_with_image_traits_on_shared_rp_no_valid_host( + self): + shared_rp_uuid = self.create_shared_storage_rp() + # add both cn_rp and shared_rp under one aggregate + self._set_aggregate(shared_rp_uuid, uuids.shr_disk_agg) + self._set_aggregate(self.host_uuid, uuids.shr_disk_agg) + + self.assertIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + # run update_available_resource periodic task after configuring shared + # resource provider to update compute node resources + self._run_periodics() + + # we expect that the virt driver stops reporting DISK_GB on the compute + # RP as soon as a shared RP with DISK_GB is created in the compute tree + self.assertNotIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + org_image_id = '155d900f-4e14-4e4c-a73d-069cbf4541e6' + server_req_body = { + 'server': { + 'imageRef': org_image_id, + 'flavorRef': '1', + 'name': 'test_shared_storage_rp_configuration_with_cn_rp', + 'networks': 'none' + } + } + # create server + server = self.api.post_server(server_req_body) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + rebuild_image_ref = ( + nova.tests.unit.image.fake.AUTO_DISK_CONFIG_ENABLED_IMAGE_UUID) + + with nova.utils.temporary_mutation(self.api, microversion='2.35'): + self.api.api_put('/images/%s/metadata' % rebuild_image_ref, + {'metadata': { + 'trait:CUSTOM_FOO': 'required'}}) + rebuild_req_body = { + 'rebuild': { + 'imageRef': rebuild_image_ref + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. 
+ self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(org_image_id, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertIn('No valid host', server['fault']['message']) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index 5a857f363e..0f27674dba 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -3595,6 +3595,10 @@ class TraitsBasedSchedulingTest(integrated_helpers.ProviderUsageBaseTestCase): self.flavor_with_trait['id'], {'extra_specs': {'trait:HW_CPU_X86_VMX': 'required'}}) self.flavor_without_trait = flavors[1] + self.flavor_with_forbidden_trait = flavors[2] + self.admin_api.post_extra_spec( + self.flavor_with_forbidden_trait['id'], + {'extra_specs': {'trait:HW_CPU_X86_SGX': 'forbidden'}}) # Note that we're using v2.35 explicitly as the api returns 404 # starting with 2.36 @@ -3822,6 +3826,175 @@ class TraitsBasedSchedulingTest(integrated_helpers.ProviderUsageBaseTestCase): self.assertIn('fault', server) self.assertIn('No valid host', server['fault']['message']) + def test_rebuild_instance_with_image_traits(self): + """Rebuilds a server with a different image which has traits + associated with it and which will run it through the scheduler to + validate the image is still OK with the compute host that the + instance is running on. + """ + # Decorate compute2 resource provider with both flavor and image trait. 
+ rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX', + 'HW_CPU_X86_SGX']) + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server without traits on image and with traits on flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # make the compute node full and ensure rebuild still succeed + inv = {"resource_class": "VCPU", + "total": 1} + self._set_inventory(rp_uuid, inv) + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(rp_uuid, allocs) + + # Assert the server ended up on the expected compute host that has + # the required trait. + self.assertEqual(self.compute2.host, server['OS-EXT-SRV-ATTR:host']) + + def test_rebuild_instance_with_image_traits_no_host(self): + """Rebuilding a server with a different image which has required + traits on the image fails to validate the host that this server is + currently running on, because the compute host resource provider is not + associated with a similar trait. 
+ """ + # Decorate compute2 resource provider with traits on flavor + rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX']) + + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server without traits on image and with traits on flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. + self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(self.image_id_without_trait, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertEqual("No valid host was found. Image traits cannot be " + "satisfied by the current resource providers. " + "Please relaunch the instance.", + server['fault']['message']) + + def test_rebuild_instance_with_image_traits_no_image_change(self): + """Rebuilds a server with a same image which has traits + associated with it and which will run it through the scheduler to + validate the image is still OK with the compute host that the + instance is running on. + """ + # Decorate compute2 resource provider with both flavor and image trait. 
+ rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX', + 'HW_CPU_X86_SGX']) + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, + rp_usages) + + # create a server with traits in both image and flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_with_trait) + server = self._wait_for_state_change(self.admin_api, server, + 'ACTIVE') + + # Now rebuild the server with the same image which has traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(rp_uuid, allocs) + + # Assert the server ended up on the expected compute host that has + # the required trait. 
+ self.assertEqual(self.compute2.host, + server['OS-EXT-SRV-ATTR:host']) + + def test_rebuild_instance_with_image_traits_and_forbidden_flavor_traits( + self): + """Rebuilding a server with a different image which has required + traits on the image fails to validate image traits because flavor + associated with the current instance has a similar trait that is + forbidden + """ + # Decorate compute2 resource provider with traits on flavor + rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX']) + + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server with forbidden traits on flavor and no traits on + # image + server = self._create_server_with_traits( + self.flavor_with_forbidden_trait['id'], + self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. + self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(self.image_id_without_trait, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertEqual("No valid host was found. Image traits are part of " + "forbidden traits in flavor associated with the " + "Image. Please relaunch the instance.", + server['fault']['message']) + class ServerTestV256Common(ServersTestBase): api_major_version = 'v2.1'