From 012ea7439bad72d34bb851c18d9390826752ff05 Mon Sep 17 00:00:00 2001 From: Author Name Date: Tue, 8 May 2018 12:20:04 -0700 Subject: [PATCH] Handle rebuild of instances with image traits Update the conductor to handle rebuilding of an instance. If a rebuild is requested with an image with traits, we check the current allocations of the instance against the image required traits and allow the action only if the current allocations can satisfy the image requirements. Change-Id: Ie49d605c66062d2548241d7e04f5a2a6b98c011e Implements: blueprint glance-image-traits --- nova/conductor/manager.py | 74 ++++++++ .../libvirt/test_shared_resource_provider.py | 136 ++++++++++++++ nova/tests/functional/test_servers.py | 173 ++++++++++++++++++ 3 files changed, 383 insertions(+) diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py index e525bad66d..5443023bba 100644 --- a/nova/conductor/manager.py +++ b/nova/conductor/manager.py @@ -941,6 +941,12 @@ class ComputeTaskManager(base.Base): # is not forced to be the original host request_spec.reset_forced_destinations() try: + # if this is a rebuild of instance on the same host with + # new image. + if not recreate and orig_image_ref != image_ref: + self._validate_image_traits_for_rebuild(context, + instance, + image_ref) request_spec.ensure_project_and_user_id(instance) host_lists = self._schedule_instances(context, request_spec, [instance.uuid], @@ -989,6 +995,74 @@ class ComputeTaskManager(base.Base): host=host, node=node, limits=limits, request_spec=request_spec) + def _validate_image_traits_for_rebuild(self, context, instance, image_ref): + """Validates that the traits specified in the image can be satisfied + by the providers of the current allocations for the instance during + rebuild of the instance. If the traits cannot be + satisfied, fails the action by raising a NoValidHost exception. 
+ + :raises: NoValidHost exception in case the traits on the providers + of the allocated resources for the instance do not match + the required traits on the image. + """ + image_meta = objects.ImageMeta.from_image_ref( + context, self.image_api, image_ref) + if ('properties' not in image_meta or + 'traits_required' not in image_meta.properties or not + image_meta.properties.traits_required): + return + + image_traits = set(image_meta.properties.traits_required) + + # check any of the image traits are forbidden in flavor traits. + # if so raise an exception + extra_specs = instance.flavor.extra_specs + forbidden_flavor_traits = set() + for key, val in extra_specs.items(): + if key.startswith('trait'): + # get the actual key. + prefix, parsed_key = key.split(':', 1) + if val == 'forbidden': + forbidden_flavor_traits.add(parsed_key) + + forbidden_traits = image_traits & forbidden_flavor_traits + + if forbidden_traits: + raise exception.NoValidHost( + reason=_("Image traits are part of forbidden " + "traits in flavor associated with the Image. " + "Please relaunch the instance.")) + return + + # If image traits are present, then validate against allocations. + allocations = self.report_client.get_allocations_for_consumer( + context, instance.uuid) + instance_rp_uuids = list(allocations) + + # Get provider tree for the instance. We use the uuid of the host + # on which the instance is rebuilding to get the provider tree. + compute_node = objects.ComputeNode.get_by_host_and_nodename( + context, instance.host, instance.node) + + # TODO(karimull): Call with a read-only version, when available. 
+ instance_rp_tree = ( + self.report_client.get_provider_tree_and_ensure_root( + context, compute_node.uuid)) + + traits_in_instance_rps = set() + + for rp_uuid in instance_rp_uuids: + traits_in_instance_rps.update( + instance_rp_tree.data(rp_uuid).traits) + + missing_traits = image_traits - traits_in_instance_rps + + if missing_traits: + raise exception.NoValidHost( + reason=_("Image traits cannot be " + "satisfied by the current resource providers. " + "Please relaunch the instance.")) + # TODO(avolkov): move method to bdm @staticmethod def _volume_size(instance_type, bdm): diff --git a/nova/tests/functional/libvirt/test_shared_resource_provider.py b/nova/tests/functional/libvirt/test_shared_resource_provider.py index 635791edd7..bdef8609b3 100644 --- a/nova/tests/functional/libvirt/test_shared_resource_provider.py +++ b/nova/tests/functional/libvirt/test_shared_resource_provider.py @@ -15,8 +15,10 @@ import fixtures +from nova.compute import instance_actions from nova import conf from nova.tests.functional import integrated_helpers +import nova.tests.unit.image.fake from nova.tests.unit.virt.libvirt import fakelibvirt from nova.tests import uuidsentinel as uuids @@ -100,3 +102,137 @@ class SharedStorageProviderUsageTestCase( self.assertEqual({'DISK_GB': 1}, shared_rp_usages) self.assertEqual({'MEMORY_MB': 512, 'VCPU': 1}, cn_rp_usages) + + def create_shared_storage_rp(self): + # shared storage resource provider + shared_RP = self._post_resource_provider( + rp_name='shared_resource_provider1') + + # created inventory for shared storage RP + inv = {"resource_class": "DISK_GB", + "total": 78, "reserved": 0, "min_unit": 1, "max_unit": 78, + "step_size": 1, "allocation_ratio": 1.0} + self._set_inventory(shared_RP['uuid'], inv) + + # Added traits to shared storage resource provider + self._set_provider_traits(shared_RP['uuid'], + ['MISC_SHARES_VIA_AGGREGATE', + 'STORAGE_DISK_SSD']) + return shared_RP['uuid'] + + def 
test_rebuild_instance_with_image_traits_on_shared_rp(self): + + shared_rp_uuid = self.create_shared_storage_rp() + # add both cn_rp and shared_rp under one aggregate + self._set_aggregate(shared_rp_uuid, uuids.shr_disk_agg) + self._set_aggregate(self.host_uuid, uuids.shr_disk_agg) + + self.assertIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + # run update_available_resource periodic task after configuring shared + # resource provider to update compute node resources + self._run_periodics() + + # we expect that the virt driver stops reporting DISK_GB on the compute + # RP as soon as a shared RP with DISK_GB is created in the compute tree + self.assertNotIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + server_req_body = { + 'server': { + 'imageRef': '155d900f-4e14-4e4c-a73d-069cbf4541e6', + 'flavorRef': '1', + 'name': 'test_shared_storage_rp_configuration_with_cn_rp', + 'networks': 'none' + } + } + # create server + server = self.api.post_server(server_req_body) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + rebuild_image_ref = ( + nova.tests.unit.image.fake.AUTO_DISK_CONFIG_ENABLED_IMAGE_UUID) + + with nova.utils.temporary_mutation(self.api, microversion='2.35'): + self.api.api_put('/images/%s/metadata' % rebuild_image_ref, + {'metadata': { + 'trait:STORAGE_DISK_SSD': 'required'}}) + rebuild_req_body = { + 'rebuild': { + 'imageRef': rebuild_image_ref + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + # get shared_rp and cn_rp usages + shared_rp_usages = self._get_provider_usages(shared_rp_uuid) + cn_rp_usages = self._get_provider_usages(self.host_uuid) + # Check if DISK_GB resource is allocated from shared_RP and the + # remaining resources are allocated from host_uuid. 
+ self.assertEqual({'DISK_GB': 1}, shared_rp_usages) + self.assertEqual({'MEMORY_MB': 512, 'VCPU': 1}, + cn_rp_usages) + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(self.host_uuid, allocs) + server = self.api.get_server(server['id']) + self.assertEqual(rebuild_image_ref, server['image']['id']) + + def test_rebuild_instance_with_image_traits_on_shared_rp_no_valid_host( + self): + shared_rp_uuid = self.create_shared_storage_rp() + # add both cn_rp and shared_rp under one aggregate + self._set_aggregate(shared_rp_uuid, uuids.shr_disk_agg) + self._set_aggregate(self.host_uuid, uuids.shr_disk_agg) + + self.assertIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + + # run update_available_resource periodic task after configuring shared + # resource provider to update compute node resources + self._run_periodics() + + # we expect that the virt driver stops reporting DISK_GB on the compute + # RP as soon as a shared RP with DISK_GB is created in the compute tree + self.assertNotIn("DISK_GB", + self._get_provider_inventory(self.host_uuid)) + org_image_id = '155d900f-4e14-4e4c-a73d-069cbf4541e6' + server_req_body = { + 'server': { + 'imageRef': org_image_id, + 'flavorRef': '1', + 'name': 'test_shared_storage_rp_configuration_with_cn_rp', + 'networks': 'none' + } + } + # create server + server = self.api.post_server(server_req_body) + self._wait_for_state_change(self.api, server, 'ACTIVE') + + rebuild_image_ref = ( + nova.tests.unit.image.fake.AUTO_DISK_CONFIG_ENABLED_IMAGE_UUID) + + with nova.utils.temporary_mutation(self.api, microversion='2.35'): + self.api.api_put('/images/%s/metadata' % rebuild_image_ref, + {'metadata': { + 'trait:CUSTOM_FOO': 'required'}}) + rebuild_req_body = { + 'rebuild': { + 'imageRef': rebuild_image_ref + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. 
+ self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(org_image_id, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertIn('No valid host', server['fault']['message']) diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index d49fbaf201..422343668e 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -3506,6 +3506,10 @@ class TraitsBasedSchedulingTest(integrated_helpers.ProviderUsageBaseTestCase): self.flavor_with_trait['id'], {'extra_specs': {'trait:HW_CPU_X86_VMX': 'required'}}) self.flavor_without_trait = flavors[1] + self.flavor_with_forbidden_trait = flavors[2] + self.admin_api.post_extra_spec( + self.flavor_with_forbidden_trait['id'], + {'extra_specs': {'trait:HW_CPU_X86_SGX': 'forbidden'}}) # Note that we're using v2.35 explicitly as the api returns 404 # starting with 2.36 @@ -3733,6 +3737,175 @@ class TraitsBasedSchedulingTest(integrated_helpers.ProviderUsageBaseTestCase): self.assertIn('fault', server) self.assertIn('No valid host', server['fault']['message']) + def test_rebuild_instance_with_image_traits(self): + """Rebuilds a server with a different image which has traits + associated with it and which will run it through the scheduler to + validate the image is still OK with the compute host that the + instance is running on. + """ + # Decorate compute2 resource provider with both flavor and image trait. 
+        rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX', + 'HW_CPU_X86_SGX']) + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server without traits on image and with traits on flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # make the compute node full and ensure rebuild still succeeds + inv = {"resource_class": "VCPU", + "total": 1} + self._set_inventory(rp_uuid, inv) + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(rp_uuid, allocs) + + # Assert the server ended up on the expected compute host that has + # the required trait. + self.assertEqual(self.compute2.host, server['OS-EXT-SRV-ATTR:host']) + + def test_rebuild_instance_with_image_traits_no_host(self): + """Rebuilding a server with a different image which has required + traits on the image fails to validate the host that this server is + currently running on, because the compute host resource provider is not + associated with a similar trait. 
+ """ + # Decorate compute2 resource provider with traits on flavor + rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX']) + + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server without traits on image and with traits on flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. + self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(self.image_id_without_trait, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertEqual("No valid host was found. Image traits cannot be " + "satisfied by the current resource providers. " + "Please relaunch the instance.", + server['fault']['message']) + + def test_rebuild_instance_with_image_traits_no_image_change(self): + """Rebuilds a server with a same image which has traits + associated with it and which will run it through the scheduler to + validate the image is still OK with the compute host that the + instance is running on. + """ + # Decorate compute2 resource provider with both flavor and image trait. 
+ rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX', + 'HW_CPU_X86_SGX']) + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, + rp_usages) + + # create a server with traits in both image and flavour + server = self._create_server_with_traits( + self.flavor_with_trait['id'], self.image_id_with_trait) + server = self._wait_for_state_change(self.admin_api, server, + 'ACTIVE') + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + self._wait_for_server_parameter( + self.api, server, {'OS-EXT-STS:task_state': None}) + + allocs = self._get_allocations_by_server_uuid(server['id']) + self.assertIn(rp_uuid, allocs) + + # Assert the server ended up on the expected compute host that has + # the required trait. 
+        self.assertEqual(self.compute2.host, + server['OS-EXT-SRV-ATTR:host']) + + def test_rebuild_instance_with_image_traits_and_forbidden_flavor_traits( + self): + """Rebuilding a server with a different image which has required + traits on the image fails to validate image traits because flavor + associated with the current instance has a similar trait that is + forbidden + """ + # Decorate compute2 resource provider with traits on flavor + rp_uuid = self._get_provider_uuid_by_host(self.compute2.host) + self._set_provider_traits(rp_uuid, ['HW_CPU_X86_VMX']) + + # make sure we start with no usage on the compute node + rp_usages = self._get_provider_usages(rp_uuid) + self.assertEqual({'VCPU': 0, 'MEMORY_MB': 0, 'DISK_GB': 0}, rp_usages) + + # create a server with forbidden traits on flavor and no traits on + # image + server = self._create_server_with_traits( + self.flavor_with_forbidden_trait['id'], + self.image_id_without_trait) + server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE') + + # Now rebuild the server with a different image with traits + rebuild_req_body = { + 'rebuild': { + 'imageRef': self.image_id_with_trait + } + } + + self.api.api_post('/servers/%s/action' % server['id'], + rebuild_req_body) + # Look for the failed rebuild action. + self._wait_for_action_fail_completion( + server, instance_actions.REBUILD, 'rebuild_server', self.admin_api) + # Assert the server image_ref was rolled back on failure. + server = self.api.get_server(server['id']) + self.assertEqual(self.image_id_without_trait, server['image']['id']) + + # The server should be in ERROR state + self.assertEqual('ERROR', server['status']) + self.assertEqual("No valid host was found. Image traits are part of " + "forbidden traits in flavor associated with the " + "Image. Please relaunch the instance.", + server['fault']['message']) + class ServerTestV256Common(ServersTestBase): api_major_version = 'v2.1'