Support live migration with qos ports

This patch enhances the live_migration task in the conductor to support
qos ports during live migration. The high-level sequence of events is
as follows:
* when the request spec is gathered before the scheduler call, the
  resource requests are collected from the neutron ports and the request
  spec is updated
* after successful scheduling, the request group to resource provider
  mapping is calculated
* the instance PCI requests are updated to drive the PCI claim on the
  target host to allocate VFs from the same PCI PF the bandwidth is
  allocated from
* the inactive port binding on the target host is updated to carry the RP
  UUID in the allocation key, according to the resource allocation on the
  destination host (a condensed sketch of this flow is shown below)
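
A condensed sketch of this conductor-side flow (simplified from
LiveMigrationTask in nova/conductor/tasks/live_migrate.py; the method
name _execute_sketch is illustrative, and `selection` stands for the
host chosen by the scheduler):

    def _execute_sketch(self):
        # 1) collect the resource requests of the neutron ports and put
        #    them on the request spec before calling the scheduler
        self.request_spec.requested_resources = (
            self.network_api.get_requested_resource_for_instance(
                self.context, self.instance.uuid))

        # 2) after successful scheduling, calculate the request group to
        #    resource provider mapping from the scheduler's selection
        scheduler_utils.fill_provider_mapping(self.request_spec, selection)
        provider_mapping = self.request_spec.get_request_group_mapping()

        # 3) update the instance PCI requests so the PCI claim on the
        #    target host allocates VFs from the PF the bandwidth is
        #    allocated from
        if provider_mapping:
            compute_utils.\
                update_pci_request_spec_with_allocated_interface_name(
                    self.context, self.report_client, self.instance,
                    provider_mapping)

        # 4) update the inactive port binding on the target host so its
        #    binding profile carries the RP UUID in the 'allocation' key
        self._bind_ports_on_destination(self.destination, provider_mapping)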

As the port binding is already updated in the conductor, the late check
for the allocation key in the binding profile is turned off for live
migration in the neutronv2 API.
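
For illustration, after the conductor updates the inactive binding, the
binding profile of a port with a resource request would carry the
allocation key (the values below are made-up placeholders; pci_slot
appears only for SR-IOV ports):

    binding_profile = {
        'pci_slot': '0000:04:10.1',
        # UUID of the resource provider the port bandwidth is allocated
        # from on the destination host
        'allocation': '<destination-rp-uuid>',
    }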

Note that this patch only handles live migration without a target host;
subsequent patches will add support for migration with a target host and
other edge cases like reschedule.

blueprint: support-move-ops-with-qos-ports-ussuri

Change-Id: Ibb84ea210795634f02997d4627e0beec5fea36ee
Author: Balazs Gibizer
Date:   2019-11-25 15:03:10 +01:00
parent b96bc9d675
commit bd8e2fe9c8

6 changed files with 273 additions and 29 deletions
@@ -8508,7 +8508,6 @@ class ComputeManager(manager.Manager):
         migration = objects.Migration(source_compute=instance.host,
                                       dest_compute=self.host,
                                       migration_type='live-migration')
-        # TODO(gibi): calculate and pass resource_provider_mapping
         self.network_api.migrate_instance_finish(
             context, instance, migration, provider_mappings=None)
@@ -257,7 +257,12 @@ class LiveMigrationTask(base.TaskBase):
         self._check_destination_has_enough_memory()
         source_node, dest_node = self._check_compatible_with_source_hypervisor(
             self.destination)
-        self._call_livem_checks_on_host(self.destination)
+        # NOTE(gibi): This code path is used when the live migration is
+        # forced to a target host, skipping the scheduler. Such an operation
+        # is rejected for servers with nested resource allocations since
+        # I7cbd5d9fb875ebf72995362e0b6693492ce32051. So here we can safely
+        # assume that the provider mapping is empty.
+        self._call_livem_checks_on_host(self.destination, {})
         # Make sure the forced destination host is in the same cell that the
         # instance currently lives in.
         # NOTE(mriedem): This can go away if/when the forced destination host
@@ -316,7 +321,7 @@ class LiveMigrationTask(base.TaskBase):
             raise exception.DestinationHypervisorTooOld()
         return source_info, destination_info
 
-    def _call_livem_checks_on_host(self, destination):
+    def _call_livem_checks_on_host(self, destination, provider_mapping):
         self._check_can_migrate_pci(self.source, destination)
         try:
             self.migrate_data = self.compute_rpcapi.\
@@ -338,11 +343,25 @@ class LiveMigrationTask(base.TaskBase):
             self.migrate_data.vifs = migrate_data_obj.VIFMigrateData.\
                 create_skeleton_migrate_vifs(
                     self.instance.get_network_info())
-            bindings = self._bind_ports_on_destination(destination)
+            bindings = self._bind_ports_on_destination(
+                destination, provider_mapping)
             self._update_migrate_vifs_from_bindings(self.migrate_data.vifs,
                                                     bindings)
 
-    def _bind_ports_on_destination(self, destination):
+    @staticmethod
+    def _get_port_profile_from_provider_mapping(port_id, provider_mappings):
+        if port_id in provider_mappings:
+            # NOTE(gibi): In the resource provider mapping there can be
+            # more than one RP fulfilling a request group. But the resource
+            # request of a Neutron port is always mapped to a numbered
+            # request group that is always fulfilled by one resource
+            # provider. So we only pass that single RP UUID here.
+            return {'allocation': provider_mappings[port_id][0]}
+        else:
+            return {}
+
+    def _bind_ports_on_destination(self, destination, provider_mappings):
         LOG.debug('Start binding ports on destination host: %s', destination,
                   instance=self.instance)
         # Bind ports on the destination host; returns a dict, keyed by
@@ -355,15 +374,18 @@ class LiveMigrationTask(base.TaskBase):
         # if that is the case, it may have updated the required port
         # profile for the destination node (e.g new PCI address if SR-IOV)
         # perform port binding against the requested profile
-        migrate_vifs_with_profile = [mig_vif for mig_vif in
-                                     self.migrate_data.vifs
-                                     if 'profile_json' in mig_vif]
-
-        ports_profile = None
-        if migrate_vifs_with_profile:
-            # Update to the port profile is required
-            ports_profile = {mig_vif.port_id: mig_vif.profile
-                             for mig_vif in migrate_vifs_with_profile}
+        ports_profile = {}
+        for mig_vif in self.migrate_data.vifs:
+            profile = mig_vif.profile if 'profile_json' in mig_vif else {}
+
+            # NOTE(gibi): provider_mappings also contribute to the
+            # binding profile of the ports if the port has a resource
+            # request. So we need to merge the profile information from
+            # both sources.
+            profile.update(
+                self._get_port_profile_from_provider_mapping(
+                    mig_vif.port_id, provider_mappings))
+
+            if profile:
+                ports_profile[mig_vif.port_id] = profile
 
         bindings = self.network_api.bind_ports_to_host(
             context=self.context, instance=self.instance, host=destination,
@@ -425,8 +447,15 @@ class LiveMigrationTask(base.TaskBase):
         # is not forced to be the original host
         request_spec.reset_forced_destinations()
-        # TODO(gibi): We need to make sure that the requested_resources field
-        # is re calculated based on neutron ports.
+        port_res_req = (
+            self.network_api.get_requested_resource_for_instance(
+                self.context, self.instance.uuid))
+        # NOTE(gibi): When cyborg or another module wants to handle
+        # similar non-nova resources, we have to collect all the
+        # external resource requests in a single list here and add
+        # them to the RequestSpec.
+        request_spec.requested_resources = port_res_req
 
         scheduler_utils.setup_instance_group(self.context, request_spec)
         # We currently only support live migrating to hosts in the same
@@ -476,9 +505,19 @@ class LiveMigrationTask(base.TaskBase):
                 # ex.exc_type.
                 raise exception.MigrationSchedulerRPCError(
                     reason=six.text_type(ex))
+
+            scheduler_utils.fill_provider_mapping(request_spec, selection)
+
+            provider_mapping = request_spec.get_request_group_mapping()
+
+            if provider_mapping:
+                compute_utils.\
+                    update_pci_request_spec_with_allocated_interface_name(
+                        self.context, self.report_client, self.instance,
+                        provider_mapping)
+
             try:
                 self._check_compatible_with_source_hypervisor(host)
-                self._call_livem_checks_on_host(host)
+                self._call_livem_checks_on_host(host, provider_mapping)
             except (exception.Invalid, exception.MigrationPreCheckError) as e:
                 LOG.debug("Skipping host: %(host)s because: %(e)s",
                           {"host": host, "e": e})
@@ -1260,10 +1260,6 @@ class API(base_api.NetworkAPI):
         client = _get_ksa_client(context, admin=True)
 
-        # TODO(gibi): To support ports with resource request during server
-        # live migrate operation we need to take care of 'allocation' key in
-        # the binding profile per binding.
-
         bindings_by_port_id = {}
         for vif in network_info:
             # Now bind each port to the destination host and keep track of each
@@ -3399,7 +3395,10 @@ class API(base_api.NetworkAPI):
                     reason=_("Unable to correlate PCI slot %s") %
                            pci_slot)
 
-            if p.get('resource_request'):
+            # NOTE(gibi): during live migration the conductor already sets
+            # the allocation key in the port binding
+            if (p.get('resource_request') and
+                    migration['migration_type'] != constants.LIVE_MIGRATION):
                 if not provider_mappings:
                     # TODO(gibi): Remove this check when compute RPC API is
                     # bumped to 6.0
@@ -7237,6 +7237,63 @@ class ServerMoveWithPortResourceRequestTest(
             server, self.compute1_rp_uuid, non_qos_port, qos_port,
             qos_sriov_port, self.flavor_with_group_policy)
 
+    def _turn_off_api_check(self):
+        # The API actively rejects move operations for servers with a
+        # resource request, so we have to turn off that check.
+        # TODO(gibi): Remove this when the move operations are supported and
+        # the API check is removed.
+        patcher = mock.patch(
+            'nova.api.openstack.common.'
+            'supports_port_resource_request_during_move',
+            return_value=True)
+        self.addCleanup(patcher.stop)
+        patcher.start()
+
+    def test_live_migrate_with_qos_port(self):
+        # TODO(gibi): remove this when live migration is fully supported and
+        # therefore the check is removed from the api
+        self._turn_off_api_check()
+
+        non_qos_normal_port = self.neutron.port_1
+        qos_normal_port = self.neutron.port_with_resource_request
+        qos_sriov_port = self.neutron.port_with_sriov_resource_request
+
+        server = self._create_server_with_ports(
+            non_qos_normal_port, qos_normal_port, qos_sriov_port)
+
+        # check that the server allocates from the current host properly
+        self._check_allocation(
+            server, self.compute1_rp_uuid, non_qos_normal_port,
+            qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)
+
+        self.api.post_server_action(
+            server['id'],
+            {
+                'os-migrateLive': {
+                    'host': None,
+                    'block_migration': 'auto'
+                }
+            }
+        )
+
+        self._wait_for_server_parameter(
+            server,
+            {'OS-EXT-SRV-ATTR:host': 'host2',
+             'status': 'ACTIVE'})
+
+        self._check_allocation(
+            server, self.compute2_rp_uuid, non_qos_normal_port,
+            qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)
+
+        self._delete_server_and_check_allocations(
+            server, qos_normal_port, qos_sriov_port)
+
+    # TODO(gibi): add tests for live migration cases:
+    # * with target host
+    # * re-schedule success
+    # * re-schedule fail -> pci request cleanup?
+    # * abort / cancel -> dest / pci request cleanup?
+
 
 class PortResourceRequestReSchedulingTest(
         PortResourceRequestBasedSchedulingTestBase):
@@ -12,6 +12,7 @@
 import mock
 import oslo_messaging as messaging
+from oslo_serialization import jsonutils
 from oslo_utils.fixture import uuidsentinel as uuids
 import six
@@ -74,6 +75,13 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         self.ensure_network_metadata_mock = _p.start()
         self.addCleanup(_p.stop)
 
+        _p = mock.patch(
+            'nova.network.neutronv2.api.API.'
+            'get_requested_resource_for_instance',
+            return_value=[])
+        self.mock_get_res_req = _p.start()
+        self.addCleanup(_p.stop)
+
     def _generate_task(self):
         self.task = live_migrate.LiveMigrationTask(self.context,
             self.instance, self.destination, self.block_migration,
@@ -438,7 +446,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         mock_select.assert_called_once_with(self.context, self.fake_spec,
             [self.instance.uuid], return_objects=True, return_alternates=False)
         mock_check.assert_called_once_with('host1')
-        mock_call.assert_called_once_with('host1')
+        mock_call.assert_called_once_with('host1', {})
 
     @mock.patch.object(live_migrate.LiveMigrationTask,
                        '_call_livem_checks_on_host')
@@ -459,7 +467,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
             self.context, self.fake_spec, [self.instance.uuid],
             return_objects=True, return_alternates=False)
         mock_check.assert_called_once_with('host1')
-        mock_call.assert_called_once_with('host1')
+        mock_call.assert_called_once_with('host1', {})
 
     @mock.patch.object(live_migrate.LiveMigrationTask,
                        '_remove_host_allocations')
@@ -486,7 +494,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
             mock.call(self.context, self.fake_spec, [self.instance.uuid],
                       return_objects=True, return_alternates=False)])
         mock_check.assert_has_calls([mock.call('host1'), mock.call('host2')])
-        mock_call.assert_called_once_with('host2')
+        mock_call.assert_called_once_with('host2', {})
 
     def test_find_destination_retry_with_old_hypervisor(self):
         self._test_find_destination_retry_hypervisor_raises(
@@ -521,7 +529,8 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
             mock.call(self.context, self.fake_spec, [self.instance.uuid],
                       return_objects=True, return_alternates=False)])
         mock_check.assert_has_calls([mock.call('host1'), mock.call('host2')])
-        mock_call.assert_has_calls([mock.call('host1'), mock.call('host2')])
+        mock_call.assert_has_calls(
+            [mock.call('host1', {}), mock.call('host2', {})])
 
     @mock.patch.object(live_migrate.LiveMigrationTask,
                        '_remove_host_allocations')
@@ -549,7 +558,8 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
             mock.call(self.context, self.fake_spec, [self.instance.uuid],
                       return_objects=True, return_alternates=False)])
         mock_check.assert_has_calls([mock.call('host1'), mock.call('host2')])
-        mock_call.assert_has_calls([mock.call('host1'), mock.call('host2')])
+        mock_call.assert_has_calls(
+            [mock.call('host1', {}), mock.call('host2', {})])
 
     @mock.patch.object(objects.Migration, 'save')
     @mock.patch.object(live_migrate.LiveMigrationTask,
@@ -612,7 +622,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
                                   side_effect=messaging.MessagingTimeout),
                 mock.patch.object(self.task, '_check_can_migrate_pci')):
             self.assertRaises(exception.MigrationPreCheckError,
-                              self.task._call_livem_checks_on_host, {})
+                              self.task._call_livem_checks_on_host, {}, {})
 
     def test_call_livem_checks_on_host_bind_ports(self):
         data = objects.LibvirtLiveMigrateData()
@@ -639,7 +649,7 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
                 network_model.VIF(uuids.port2)])
         self.instance.info_cache = objects.InstanceInfoCache(
             network_info=nwinfo)
-        self.task._call_livem_checks_on_host('dest-host')
+        self.task._call_livem_checks_on_host('dest-host', {})
         # Assert the migrate_data set on the task based on the port
         # bindings created.
         self.assertIn('vifs', data)
@@ -651,6 +661,118 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         _test()
 
+    @mock.patch('nova.network.neutronv2.api.API.bind_ports_to_host')
+    def test_bind_ports_on_destination_merges_profiles(self, mock_bind_ports):
+        """Assert that if both the migration_data and the provider mapping
+        contain binding profile related information then such information is
+        merged in the resulting profile.
+        """
+        self.task.migrate_data = objects.LibvirtLiveMigrateData(
+            vifs=[
+                objects.VIFMigrateData(
+                    port_id=uuids.port1,
+                    profile_json=jsonutils.dumps(
+                        {'some-key': 'value'}))
+            ])
+        provider_mappings = {uuids.port1: [uuids.dest_bw_rp]}
+
+        self.task._bind_ports_on_destination('dest-host', provider_mappings)
+
+        mock_bind_ports.assert_called_once_with(
+            context=self.context, instance=self.instance, host='dest-host',
+            vnic_types=None,
+            port_profiles={uuids.port1: {'allocation': uuids.dest_bw_rp,
+                                         'some-key': 'value'}})
+
+    @mock.patch('nova.network.neutronv2.api.API.bind_ports_to_host')
+    def test_bind_ports_on_destination_migration_data(self, mock_bind_ports):
+        """Assert that if only the migration_data contains binding profile
+        related information then that is sent to neutron.
+        """
+        self.task.migrate_data = objects.LibvirtLiveMigrateData(
+            vifs=[
+                objects.VIFMigrateData(
+                    port_id=uuids.port1,
+                    profile_json=jsonutils.dumps(
+                        {'some-key': 'value'}))
+            ])
+        provider_mappings = {}
+
+        self.task._bind_ports_on_destination('dest-host', provider_mappings)
+
+        mock_bind_ports.assert_called_once_with(
+            context=self.context, instance=self.instance, host='dest-host',
+            vnic_types=None,
+            port_profiles={uuids.port1: {'some-key': 'value'}})
+
+    @mock.patch('nova.network.neutronv2.api.API.bind_ports_to_host')
+    def test_bind_ports_on_destination_provider_mapping(
+            self, mock_bind_ports):
+        """Assert that if only the provider mapping contains binding
+        profile related information then that is sent to neutron.
+        """
+        self.task.migrate_data = objects.LibvirtLiveMigrateData(
+            vifs=[
+                objects.VIFMigrateData(
+                    port_id=uuids.port1)
+            ])
+        provider_mappings = {uuids.port1: [uuids.dest_bw_rp]}
+
+        self.task._bind_ports_on_destination('dest-host', provider_mappings)
+
+        mock_bind_ports.assert_called_once_with(
+            context=self.context, instance=self.instance, host='dest-host',
+            vnic_types=None,
+            port_profiles={uuids.port1: {'allocation': uuids.dest_bw_rp}})
+
+    @mock.patch(
+        'nova.compute.utils.'
+        'update_pci_request_spec_with_allocated_interface_name')
+    @mock.patch('nova.scheduler.utils.fill_provider_mapping')
+    @mock.patch.object(live_migrate.LiveMigrationTask,
+                       '_call_livem_checks_on_host')
+    @mock.patch.object(live_migrate.LiveMigrationTask,
+                       '_check_compatible_with_source_hypervisor')
+    @mock.patch.object(query.SchedulerQueryClient, 'select_destinations',
+                       return_value=[[fake_selection1]])
+    @mock.patch.object(objects.RequestSpec, 'reset_forced_destinations')
+    @mock.patch.object(scheduler_utils, 'setup_instance_group')
+    def test_find_destination_with_resource_request(
+            self, mock_setup, mock_reset, mock_select, mock_check, mock_call,
+            mock_fill_provider_mapping, mock_update_pci_req):
+        resource_req = [objects.RequestGroup(requester_id=uuids.port_id)]
+        self.mock_get_res_req.return_value = resource_req
+
+        self.assertEqual(("host1", "node1", fake_limits1),
+                         self.task._find_destination())
+
+        # Make sure the request_spec was updated to include the cell
+        # mapping.
+        self.assertIsNotNone(self.fake_spec.requested_destination.cell)
+        # Make sure the spec was updated to include the project_id.
+        self.assertEqual(self.fake_spec.project_id, self.instance.project_id)
+        # Make sure that requested_resources are added to the request spec
+        self.assertEqual(
+            resource_req, self.task.request_spec.requested_resources)
+
+        mock_setup.assert_called_once_with(self.context, self.fake_spec)
+        mock_reset.assert_called_once_with()
+        self.ensure_network_metadata_mock.assert_called_once_with(
+            self.instance)
+        self.heal_reqspec_is_bfv_mock.assert_called_once_with(
+            self.context, self.fake_spec, self.instance)
+        mock_select.assert_called_once_with(self.context, self.fake_spec,
+            [self.instance.uuid], return_objects=True, return_alternates=False)
+        mock_check.assert_called_once_with('host1')
+        mock_call.assert_called_once_with('host1', {uuids.port_id: []})
+        mock_fill_provider_mapping.assert_called_once_with(
+            self.task.request_spec, fake_selection1)
+        mock_update_pci_req.assert_called_once_with(
+            self.context, self.task.report_client, self.instance,
+            {uuids.port_id: []})
+
     @mock.patch.object(objects.InstanceMapping, 'get_by_instance_uuid',
                        side_effect=exception.InstanceMappingNotFound(
                            uuid=uuids.instance))
@@ -4658,6 +4658,34 @@ class TestNeutronv2(TestNeutronv2Base):
             "are required for ports with a resource request.",
             six.text_type(ex))
 
+    @mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
+    def test_update_port_bindings_for_instance_with_resource_req_live_mig(
+            self, get_client_mock):
+        instance = fake_instance.fake_instance_obj(self.context)
+        fake_ports = {'ports': [
+            {'id': 'fake-port-1',
+             'binding:vnic_type': 'normal',
+             constants.BINDING_HOST_ID: 'old-host',
+             constants.BINDING_PROFILE:
+                 {'allocation': uuids.dest_compute_rp},
+             'resource_request': mock.sentinel.resource_request}]}
+        migration = objects.Migration(
+            status='confirmed', migration_type='live-migration')
+        list_ports_mock = mock.Mock(return_value=fake_ports)
+        get_client_mock.return_value.list_ports = list_ports_mock
+
+        # No mapping is passed in as during live migration the conductor
+        # already created the binding and added the allocation key
+        self.api._update_port_binding_for_instance(
+            self.context, instance, 'new-host', migration, {})
+
+        # Note that binding:profile is not updated
+        get_client_mock.return_value.update_port.assert_called_once_with(
+            'fake-port-1',
+            {'port': {'device_owner': 'compute:None',
+                      'binding:host_id': 'new-host'}})
+
     def test_get_pci_mapping_for_migration(self):
         instance = fake_instance.fake_instance_obj(self.context)
         instance.migration_context = objects.MigrationContext()