diff --git a/nova/compute/api.py b/nova/compute/api.py
index d67158c95f..5b91add65b 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -3867,7 +3867,9 @@ class API(base.Base):
         # NOTE(sbauza): Force is a boolean by the new related API version
         if force is False and host_name:
             nodes = objects.ComputeNodeList.get_all_by_host(context, host_name)
-            # NOTE(sbauza): Unset the host to make sure we call the scheduler
+            # Unset the host to make sure we call the scheduler
+            # from the conductor LiveMigrationTask. Yes this is tightly-coupled
+            # to behavior in conductor and not great.
             host_name = None
             # FIXME(sbauza): Since only Ironic driver uses more than one
             # compute per service but doesn't support live migrations,
@@ -3883,6 +3885,8 @@ class API(base.Base):
                 host=target.host,
                 node=target.hypervisor_hostname
             )
+            # This is essentially a hint to the scheduler to only consider
+            # the specified host but still run it through the filters.
             request_spec.requested_destination = destination

         try:
diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py
index 54763b49f6..16de56102f 100644
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@@ -49,11 +49,29 @@ class LiveMigrationTask(base.TaskBase):
         self._check_host_is_up(self.source)

         if not self.destination:
+            # Either no host was specified in the API request and the user
+            # wants the scheduler to pick a destination host, or a host was
+            # specified but is not forcing it, so they want the scheduler
+            # filters to run on the specified host, like a scheduler hint.
            self.destination = self._find_destination()
            self.migration.dest_compute = self.destination
            self.migration.save()
        else:
-            self._check_requested_destination()
+            # This is the case that the user specified the 'force' flag when
+            # live migrating with a specific destination host so the scheduler
+            # is bypassed. There are still some minimal checks performed here
+            # though.
+            source_node, dest_node = self._check_requested_destination()
+            # Now that we're semi-confident in the force specified host, we
+            # need to copy the source compute node allocations in Placement
+            # to the destination compute node. Normally select_destinations()
+            # in the scheduler would do this for us, but when forcing the
+            # target host we don't call the scheduler.
+            # TODO(mriedem): In Queens, call select_destinations() with a
+            # skip_filters=True flag so the scheduler does the work of claiming
+            # resources on the destination in Placement but still bypass the
+            # scheduler filters, which honors the 'force' flag in the API.
+            self._claim_resources_on_destination(source_node, dest_node)

         # TODO(johngarbutt) need to move complexity out of compute manager
         # TODO(johngarbutt) disk_over_commit?
@@ -89,11 +107,83 @@ class LiveMigrationTask(base.TaskBase):
             raise exception.ComputeServiceUnavailable(host=host)

     def _check_requested_destination(self):
+        """Performs basic pre-live migration checks for the forced host.
+
+        :returns: tuple of (source ComputeNode, destination ComputeNode)
+        """
         self._check_destination_is_not_source()
         self._check_host_is_up(self.destination)
         self._check_destination_has_enough_memory()
-        self._check_compatible_with_source_hypervisor(self.destination)
+        source_node, dest_node = self._check_compatible_with_source_hypervisor(
+            self.destination)
         self._call_livem_checks_on_host(self.destination)
+        return source_node, dest_node
+
+    def _claim_resources_on_destination(self, source_node, dest_node):
+        """Copies allocations from source node to dest node in Placement
+
+        :param source_node: source ComputeNode where the instance currently
+                            lives
+        :param dest_node: destination ComputeNode where the instance is being
+                          forced to live migrate.
+        """
+        reportclient = self.scheduler_client.reportclient
+        # Get the current allocations for the source node and the instance.
+        source_node_allocations = reportclient.get_allocations_for_instance(
+            source_node.uuid, self.instance)
+        if source_node_allocations:
+            # Generate an allocation request for the destination node.
+            alloc_request = {
+                'allocations': [
+                    {
+                        'resource_provider': {
+                            'uuid': dest_node.uuid
+                        },
+                        'resources': source_node_allocations
+                    }
+                ]
+            }
+            # The claim_resources method will check for existing allocations
+            # for the instance and effectively "double up" the allocations for
+            # both the source and destination node. That's why when requesting
+            # allocations for resources on the destination node before we live
+            # migrate, we use the existing resource allocations from the
+            # source node.
+            if reportclient.claim_resources(
+                    self.instance.uuid, alloc_request,
+                    self.instance.project_id, self.instance.user_id):
+                LOG.debug('Instance allocations successfully created on '
+                          'destination node %(dest)s: %(alloc_request)s',
+                          {'dest': dest_node.uuid,
+                           'alloc_request': alloc_request},
+                          instance=self.instance)
+            else:
+                # We have to fail even though the user requested that we force
+                # the host. This is because we need Placement to have an
+                # accurate reflection of what's allocated on all nodes so the
+                # scheduler can make accurate decisions about which nodes have
+                # capacity for building an instance. We also cannot rely on the
+                # resource tracker in the compute service automatically healing
+                # the allocations since that code is going away in Queens.
+                reason = (_('Unable to migrate instance %(instance_uuid)s to '
+                            'host %(host)s. There is not enough capacity on '
+                            'the host for the instance.') %
+                          {'instance_uuid': self.instance.uuid,
+                           'host': self.destination})
+                raise exception.MigrationPreCheckError(reason=reason)
+        else:
+            # This shouldn't happen, but it could be a case where there are
+            # older (Ocata) computes still so the existing allocations are
+            # getting overwritten by the update_available_resource periodic
+            # task in the compute service.
+            # TODO(mriedem): Make this an error when the auto-heal
+            # compatibility code in the resource tracker is removed.
+            LOG.warning('No instance allocations found for source node '
+                        '%(source)s in Placement. Not creating allocations '
+                        'for destination node %(dest)s and assuming the '
+                        'compute service will heal the allocations.',
+                        {'source': source_node.uuid, 'dest': dest_node.uuid},
+                        instance=self.instance)

     def _check_destination_is_not_source(self):
         if self.destination == self.source:
@@ -101,6 +191,11 @@ class LiveMigrationTask(base.TaskBase):
                 instance_id=self.instance.uuid, host=self.destination)

     def _check_destination_has_enough_memory(self):
+        # TODO(mriedem): This method can be removed when the forced host
+        # scenario is calling select_destinations() in the scheduler because
+        # Placement will be used to filter allocation candidates by MEMORY_MB.
+        # We likely can't remove it until the CachingScheduler is gone though
+        # since the CachingScheduler does not use Placement.
         compute = self._get_compute_info(self.destination)
         free_ram_mb = compute.free_ram_mb
         total_ram_mb = compute.memory_mb
@@ -139,6 +234,7 @@ class LiveMigrationTask(base.TaskBase):
         destination_version = destination_info.hypervisor_version
         if source_version > destination_version:
             raise exception.DestinationHypervisorTooOld()
+        return source_info, destination_info

     def _call_livem_checks_on_host(self, destination):
         try:
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index b0487625df..963f625423 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -911,7 +911,7 @@ class SchedulerReportClient(object):
             self._delete_inventory(compute_node.uuid)

     @safe_connect
-    def _get_allocations_for_instance(self, rp_uuid, instance):
+    def get_allocations_for_instance(self, rp_uuid, instance):
         url = '/allocations/%s' % instance.uuid
         resp = self.get(url)
         if not resp:
@@ -925,8 +925,8 @@ class SchedulerReportClient(object):
     @safe_connect
     def _allocate_for_instance(self, rp_uuid, instance):
         my_allocations = _instance_to_allocations_dict(instance)
-        current_allocations = self._get_allocations_for_instance(rp_uuid,
-                                                                 instance)
+        current_allocations = self.get_allocations_for_instance(rp_uuid,
+                                                                instance)
         if current_allocations == my_allocations:
             allocstr = ','.join(['%s=%s' % (k, v)
                                  for k, v in my_allocations.items()])
@@ -943,9 +943,14 @@ class SchedulerReportClient(object):
         LOG.info(_LI('Submitted allocation for instance'),
                  instance=instance)

-    # NOTE(jaypipes): Currently, this method is ONLY used by the scheduler to
-    # allocate resources on the selected destination hosts. This method should
-    # not be called by the resource tracker; instead, the
+    # NOTE(jaypipes): Currently, this method is ONLY used in two places:
+    # 1. By the scheduler to allocate resources on the selected destination
+    #    hosts.
+    # 2. By the conductor LiveMigrationTask to allocate resources on a forced
+    #    destination host. This is a short-term fix for Pike which should be
+    #    replaced in Queens by conductor calling the scheduler in the force
+    #    host case.
+    # This method should not be called by the resource tracker; instead, the
     # _allocate_for_instance() method is used which does not perform any
     # checking that a move operation is in place.
     @safe_connect
diff --git a/nova/tests/functional/notification_sample_tests/test_instance.py b/nova/tests/functional/notification_sample_tests/test_instance.py
index f84a6045c5..55fe90d739 100644
--- a/nova/tests/functional/notification_sample_tests/test_instance.py
+++ b/nova/tests/functional/notification_sample_tests/test_instance.py
@@ -20,6 +20,7 @@ from nova.tests import fixtures
 from nova.tests.functional.notification_sample_tests \
     import notification_sample_base
 from nova.tests.unit import fake_notifier
+from nova.virt import fake


 class TestInstanceNotificationSampleWithMultipleCompute(
@@ -33,6 +34,7 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         self.useFixture(self.neutron)
         self.cinder = fixtures.CinderFixture(self)
         self.useFixture(self.cinder)
+        self.useFixture(fixtures.AllServicesCurrent())

     def test_live_migration_actions(self):
         server = self._boot_a_server(
@@ -40,6 +42,8 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         self._wait_for_notification('instance.create.end')
         self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
         # server will boot on host1
+        fake.set_nodes(['host2'])
+        self.addCleanup(fake.restore_nodes)
         self.useFixture(fixtures.ConfPatcher(host='host2'))
         self.compute2 = self.start_service('compute', host='host2')
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index f8ae3dc8d3..51b8f589f3 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1831,19 +1831,11 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)

         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # NOTE(lajos katona): When bug 1712008 is solved the dest host
-        # expected to have resource allocation:
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-
-        # NOTE(lajos katona): the allocation on the destination host is empty,
-        # but when bug 1712008 is solved expected to be equal described by the
-        # flavor:
-        self.assertFlavorMatchesAllocation(
-            {'ram': 0, 'disk': 0, 'vcpus': 0}, dest_usages)
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)

         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # the server has just 1 allocation:
-        self.assertEqual(1, len(allocations))
+        # the server has an allocation on the source and dest nodes
+        self.assertEqual(2, len(allocations))

         # NOTE(lajos katona): When bug 1712045 is solved the server has
         # no allocation on the source:
@@ -1853,13 +1845,8 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)

-        # NOTE(lajos katona): When bug 1712008 solved the server should have
-        # an allocation for the destination:
-        # dest_allocation = allocations[dest_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
-
-        # Instead the server has no allocation for the destination host:
-        self.assertNotIn(dest_rp_uuid, allocations)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)

         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
diff --git a/nova/tests/unit/conductor/tasks/test_live_migrate.py b/nova/tests/unit/conductor/tasks/test_live_migrate.py
index 4082454611..4c633c5905 100644
--- a/nova/tests/unit/conductor/tasks/test_live_migrate.py
+++ b/nova/tests/unit/conductor/tasks/test_live_migrate.py
@@ -62,14 +62,19 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
     def test_execute_with_destination(self):
         with test.nested(
             mock.patch.object(self.task, '_check_host_is_up'),
-            mock.patch.object(self.task, '_check_requested_destination'),
+            mock.patch.object(self.task, '_check_requested_destination',
+                              return_value=(mock.sentinel.source_node,
+                                            mock.sentinel.dest_node)),
+            mock.patch.object(self.task, '_claim_resources_on_destination'),
             mock.patch.object(self.task.compute_rpcapi, 'live_migration'),
-        ) as (mock_check_up, mock_check_dest, mock_mig):
+        ) as (mock_check_up, mock_check_dest, mock_claim, mock_mig):
             mock_mig.return_value = "bob"

             self.assertEqual("bob", self.task.execute())
             mock_check_up.assert_called_once_with(self.instance_host)
             mock_check_dest.assert_called_once_with()
+            mock_claim.assert_called_once_with(
+                mock.sentinel.source_node, mock.sentinel.dest_node)
             mock_mig.assert_called_once_with(
                 self.context,
                 host=self.instance_host,
@@ -161,7 +166,8 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         mock_get_info.return_value = hypervisor_details
         mock_check.return_value = "migrate_data"

-        self.task._check_requested_destination()
+        self.assertEqual((hypervisor_details, hypervisor_details),
+                         self.task._check_requested_destination())
         self.assertEqual("migrate_data", self.task.migrate_data)
         mock_get_host.assert_called_once_with(self.context, self.destination)
         mock_is_up.assert_called_once_with("service")
@@ -475,3 +481,99 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
                          side_effect=messaging.MessagingTimeout):
             self.assertRaises(exception.MigrationPreCheckError,
                               self.task._call_livem_checks_on_host, {})
+
+    def test_claim_resources_on_destination_no_source_allocations(self):
+        """Tests the negative scenario where the instance does not have
+        allocations in Placement on the source compute node so no claim is
+        attempted on the destination compute node.
+        """
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance', return_value={})
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources',
+                           new_callable=mock.NonCallableMock)
+        def test(mock_claim, mock_get_allocs):
+            self.task._claim_resources_on_destination(source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+
+        test()
+
+    def test_claim_resources_on_destination_claim_fails(self):
+        """Tests the negative scenario where the resource allocation claim
+        on the destination compute node fails, resulting in an error.
+        """
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+        source_res_allocs = {
+            'VCPU': self.instance.vcpus,
+            'MEMORY_MB': self.instance.memory_mb,
+            # This would really include ephemeral and swap too but we're lazy.
+            'DISK_GB': self.instance.root_gb
+        }
+        dest_alloc_request = {
+            'allocations': [
+                {
+                    'resource_provider': {
+                        'uuid': uuids.dest_node
+                    },
+                    'resources': source_res_allocs
+                }
+            ]
+        }
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance',
+                           return_value=source_res_allocs)
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources', return_value=False)
+        def test(mock_claim, mock_get_allocs):
+            self.assertRaises(exception.MigrationPreCheckError,
+                              self.task._claim_resources_on_destination,
+                              source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+            mock_claim.assert_called_once_with(
+                self.instance.uuid, dest_alloc_request,
+                self.instance.project_id, self.instance.user_id)
+
+        test()
+
+    def test_claim_resources_on_destination(self):
+        """Happy path test where everything is successful."""
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+        source_res_allocs = {
+            'VCPU': self.instance.vcpus,
+            'MEMORY_MB': self.instance.memory_mb,
+            # This would really include ephemeral and swap too but we're lazy.
+            'DISK_GB': self.instance.root_gb
+        }
+        dest_alloc_request = {
+            'allocations': [
+                {
+                    'resource_provider': {
+                        'uuid': uuids.dest_node
+                    },
+                    'resources': source_res_allocs
+                }
+            ]
+        }
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance',
+                           return_value=source_res_allocs)
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources', return_value=True)
+        def test(mock_claim, mock_get_allocs):
+            self.task._claim_resources_on_destination(source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+            mock_claim.assert_called_once_with(
+                self.instance.uuid, dest_alloc_request,
+                self.instance.project_id, self.instance.user_id)
+
+        test()