Merge "Allocate resources on forced dest host during live migration"

Jenkins
2017-08-23 14:57:05 +00:00
committed by Gerrit Code Review
6 changed files with 228 additions and 30 deletions
+5 -1
@@ -3867,7 +3867,9 @@ class API(base.Base):
# NOTE(sbauza): Force is a boolean by the new related API version
if force is False and host_name:
nodes = objects.ComputeNodeList.get_all_by_host(context, host_name)
-# NOTE(sbauza): Unset the host to make sure we call the scheduler
+# Unset the host to make sure we call the scheduler
+# from the conductor LiveMigrationTask. Yes this is tightly-coupled
+# to behavior in conductor and not great.
host_name = None
# FIXME(sbauza): Since only Ironic driver uses more than one
# compute per service but doesn't support live migrations,
@@ -3883,6 +3885,8 @@ class API(base.Base):
host=target.host,
node=target.hypervisor_hostname
)
# This is essentially a hint to the scheduler to only consider
# the specified host but still run it through the filters.
request_spec.requested_destination = destination
try:
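For context, the branch above is driven by the 'force' flag on the live-migrate server action: with force=False the requested host becomes a requested_destination hint that the scheduler filters still vet, while force=True bypasses the scheduler entirely. Below is a minimal sketch of the two request bodies, assuming compute API microversion 2.30 or later (where the 'force' parameter was introduced); 'compute-2' is a placeholder host name, not taken from this change.

# Hedged sketch of the os-migrateLive action bodies that reach API.live_migrate()
# above; 'compute-2' is a placeholder host and microversion >= 2.30 is assumed.
hint_body = {
    "os-migrateLive": {
        "host": "compute-2",        # requested destination, still run through filters
        "block_migration": "auto",
        "force": False,
    }
}

forced_body = {
    "os-migrateLive": {
        "host": "compute-2",        # scheduler is bypassed for this host
        "block_migration": "auto",
        "force": True,
    }
}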
+98 -2
@@ -49,11 +49,29 @@ class LiveMigrationTask(base.TaskBase):
self._check_host_is_up(self.source)
if not self.destination:
# Either no host was specified in the API request and the user
# wants the scheduler to pick a destination host, or a host was
# specified but is not forcing it, so they want the scheduler
# filters to run on the specified host, like a scheduler hint.
self.destination = self._find_destination()
self.migration.dest_compute = self.destination
self.migration.save()
else:
-self._check_requested_destination()
+# This is the case that the user specified the 'force' flag when
+# live migrating with a specific destination host so the scheduler
+# is bypassed. There are still some minimal checks performed here
+# though.
+source_node, dest_node = self._check_requested_destination()
+# Now that we're semi-confident in the force specified host, we
+# need to copy the source compute node allocations in Placement
+# to the destination compute node. Normally select_destinations()
+# in the scheduler would do this for us, but when forcing the
+# target host we don't call the scheduler.
+# TODO(mriedem): In Queens, call select_destinations() with a
+# skip_filters=True flag so the scheduler does the work of claiming
+# resources on the destination in Placement but still bypass the
+# scheduler filters, which honors the 'force' flag in the API.
+self._claim_resources_on_destination(source_node, dest_node)
# TODO(johngarbutt) need to move complexity out of compute manager
# TODO(johngarbutt) disk_over_commit?
@@ -89,11 +107,83 @@ class LiveMigrationTask(base.TaskBase):
raise exception.ComputeServiceUnavailable(host=host)
def _check_requested_destination(self):
"""Performs basic pre-live migration checks for the forced host.
:returns: tuple of (source ComputeNode, destination ComputeNode)
"""
self._check_destination_is_not_source()
self._check_host_is_up(self.destination)
self._check_destination_has_enough_memory()
-self._check_compatible_with_source_hypervisor(self.destination)
+source_node, dest_node = self._check_compatible_with_source_hypervisor(
+self.destination)
self._call_livem_checks_on_host(self.destination)
return source_node, dest_node
def _claim_resources_on_destination(self, source_node, dest_node):
"""Copies allocations from source node to dest node in Placement
:param source_node: source ComputeNode where the instance currently
lives
:param dest_node: destination ComputeNode where the instance is being
forced to live migrate.
"""
reportclient = self.scheduler_client.reportclient
# Get the current allocations for the source node and the instance.
source_node_allocations = reportclient.get_allocations_for_instance(
source_node.uuid, self.instance)
if source_node_allocations:
# Generate an allocation request for the destination node.
alloc_request = {
'allocations': [
{
'resource_provider': {
'uuid': dest_node.uuid
},
'resources': source_node_allocations
}
]
}
# The claim_resources method will check for existing allocations
# for the instance and effectively "double up" the allocations for
# both the source and destination node. That's why when requesting
# allocations for resources on the destination node before we live
# migrate, we use the existing resource allocations from the
# source node.
if reportclient.claim_resources(
self.instance.uuid, alloc_request,
self.instance.project_id, self.instance.user_id):
LOG.debug('Instance allocations successfully created on '
'destination node %(dest)s: %(alloc_request)s',
{'dest': dest_node.uuid,
'alloc_request': alloc_request},
instance=self.instance)
else:
# We have to fail even though the user requested that we force
# the host. This is because we need Placement to have an
# accurate reflection of what's allocated on all nodes so the
# scheduler can make accurate decisions about which nodes have
# capacity for building an instance. We also cannot rely on the
# resource tracker in the compute service automatically healing
# the allocations since that code is going away in Queens.
reason = (_('Unable to migrate instance %(instance_uuid)s to '
'host %(host)s. There is not enough capacity on '
'the host for the instance.') %
{'instance_uuid': self.instance.uuid,
'host': self.destination})
raise exception.MigrationPreCheckError(reason=reason)
else:
# This shouldn't happen, but it could be a case where there are
# older (Ocata) computes still so the existing allocations are
# getting overwritten by the update_available_resource periodic
# task in the compute service.
# TODO(mriedem): Make this an error when the auto-heal
# compatibility code in the resource tracker is removed.
LOG.warning('No instance allocations found for source node '
'%(source)s in Placement. Not creating allocations '
'for destination node %(dest)s and assuming the '
'compute service will heal the allocations.',
{'source': source_node.uuid, 'dest': dest_node.uuid},
instance=self.instance)
def _check_destination_is_not_source(self):
if self.destination == self.source:
@@ -101,6 +191,11 @@ class LiveMigrationTask(base.TaskBase):
instance_id=self.instance.uuid, host=self.destination)
def _check_destination_has_enough_memory(self):
# TODO(mriedem): This method can be removed when the forced host
# scenario is calling select_destinations() in the scheduler because
# Placement will be used to filter allocation candidates by MEMORY_MB.
# We likely can't remove it until the CachingScheduler is gone though
# since the CachingScheduler does not use Placement.
compute = self._get_compute_info(self.destination)
free_ram_mb = compute.free_ram_mb
total_ram_mb = compute.memory_mb
@@ -139,6 +234,7 @@ class LiveMigrationTask(base.TaskBase):
destination_version = destination_info.hypervisor_version
if source_version > destination_version:
raise exception.DestinationHypervisorTooOld()
return source_info, destination_info
def _call_livem_checks_on_host(self, destination):
try:
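To make the "double up" behavior described in _claim_resources_on_destination() concrete, here is a sketch of the allocation payload that claim_resources() presumably ends up submitting to Placement while the forced migration is in flight, assuming a flavor with 1 VCPU, 512 MB of RAM and 1 GB of disk. Provider UUIDs and project/user IDs are placeholders; this illustrates the shape of the data, not an exact Placement payload.

# Hedged illustration only: the instance consumes resources on both providers
# until the migration finishes and the source side is cleaned up.
flavor_resources = {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}

doubled_up_allocations = {
    'allocations': [
        # The existing allocation against the source compute node is kept...
        {'resource_provider': {'uuid': 'SOURCE-NODE-UUID'},
         'resources': flavor_resources},
        # ...and the alloc_request built above adds the destination node.
        {'resource_provider': {'uuid': 'DEST-NODE-UUID'},
         'resources': flavor_resources},
    ],
    # claim_resources() is also given the instance's project and user so the
    # allocation is recorded against the right consumer.
    'project_id': 'PROJECT-ID',
    'user_id': 'USER-ID',
}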
+11 -6
@@ -911,7 +911,7 @@ class SchedulerReportClient(object):
self._delete_inventory(compute_node.uuid)
@safe_connect
-def _get_allocations_for_instance(self, rp_uuid, instance):
+def get_allocations_for_instance(self, rp_uuid, instance):
url = '/allocations/%s' % instance.uuid
resp = self.get(url)
if not resp:
@@ -925,8 +925,8 @@ class SchedulerReportClient(object):
def _allocate_for_instance(self, rp_uuid, instance):
my_allocations = _instance_to_allocations_dict(instance)
-current_allocations = self._get_allocations_for_instance(rp_uuid,
-instance)
+current_allocations = self.get_allocations_for_instance(rp_uuid,
+instance)
if current_allocations == my_allocations:
allocstr = ','.join(['%s=%s' % (k, v)
for k, v in my_allocations.items()])
@@ -943,9 +943,14 @@ class SchedulerReportClient(object):
LOG.info(_LI('Submitted allocation for instance'),
instance=instance)
-# NOTE(jaypipes): Currently, this method is ONLY used by the scheduler to
-# allocate resources on the selected destination hosts. This method should
-# not be called by the resource tracker; instead, the
+# NOTE(jaypipes): Currently, this method is ONLY used in two places:
+# 1. By the scheduler to allocate resources on the selected destination
+# hosts.
+# 2. By the conductor LiveMigrationTask to allocate resources on a forced
+# destination host. This is a short-term fix for Pike which should be
+# replaced in Queens by conductor calling the scheduler in the force
+# host case.
+# This method should not be called by the resource tracker; instead, the
# _allocate_for_instance() method is used which does not perform any
# checking that a move operation is in place.
@safe_connect
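The renamed get_allocations_for_instance() helper above reads the consumer's allocations from Placement and trims them to the provider it was asked about; the conductor then reuses that dict as the resources for the destination claim. Below is a sketch of the presumed trimming, with placeholder UUIDs and flavor values; the exact parsing lives in the elided body of the method.

# Hedged sketch: GET /allocations/{instance_uuid} returns the consumer's
# allocations keyed by resource provider UUID.
response_json = {
    'allocations': {
        'SOURCE-NODE-UUID': {'resources': {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}},
    }
}

# Trimming to the requested provider yields the flat resources dict that
# _claim_resources_on_destination() re-targets at the destination node.
rp_uuid = 'SOURCE-NODE-UUID'
resources = response_json['allocations'].get(rp_uuid, {}).get('resources', {})
assert resources == {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}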
@@ -20,6 +20,7 @@ from nova.tests import fixtures
from nova.tests.functional.notification_sample_tests \
import notification_sample_base
from nova.tests.unit import fake_notifier
from nova.virt import fake
class TestInstanceNotificationSampleWithMultipleCompute(
@@ -33,6 +34,7 @@ class TestInstanceNotificationSampleWithMultipleCompute(
self.useFixture(self.neutron)
self.cinder = fixtures.CinderFixture(self)
self.useFixture(self.cinder)
self.useFixture(fixtures.AllServicesCurrent())
def test_live_migration_actions(self):
server = self._boot_a_server(
@@ -40,6 +42,8 @@ class TestInstanceNotificationSampleWithMultipleCompute(
self._wait_for_notification('instance.create.end')
self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
# server will boot on host1
fake.set_nodes(['host2'])
self.addCleanup(fake.restore_nodes)
self.useFixture(fixtures.ConfPatcher(host='host2'))
self.compute2 = self.start_service('compute', host='host2')
+5 -18
@@ -1831,19 +1831,11 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
dest_usages = self._get_provider_usages(dest_rp_uuid)
-# NOTE(lajos katona): When bug 1712008 is solved the dest host
-# expected to have resource allocation:
-# self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-# NOTE(lajos katona): the allocation on the destination host is empty,
-# but when bug 1712008 is solved expected to be equal described by the
-# flavor:
-self.assertFlavorMatchesAllocation(
-{'ram': 0, 'disk': 0, 'vcpus': 0}, dest_usages)
+self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
allocations = self._get_allocations_by_server_uuid(server['id'])
-# the server has just 1 allocation:
-self.assertEqual(1, len(allocations))
+# the server has an allocation on the source and dest nodes
+self.assertEqual(2, len(allocations))
# NOTE(lajos katona): When bug 1712045 is solved the server has
# no allocation on the source:
@@ -1853,13 +1845,8 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
source_allocation = allocations[source_rp_uuid]['resources']
self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)
-# NOTE(lajos katona): When bug 1712008 solved the server should have
-# an allocation for the destination:
-# dest_allocation = allocations[dest_rp_uuid]['resources']
-# self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
-# Instead the server has no allocation for the destination host:
-self.assertNotIn(dest_rp_uuid, allocations)
+dest_allocation = allocations[dest_rp_uuid]['resources']
+self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
self._delete_and_check_allocations(
server, source_rp_uuid, dest_rp_uuid)
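The updated functional test now expects the forced live migration to leave usage on both compute nodes. A short sketch of what the rewritten assertions amount to, assuming the test flavor requests 1 VCPU, 512 MB of RAM and 1 GB of disk (values and UUIDs are placeholders; the helper names come from the test above):

# Hedged sketch of the post-migration state the assertions above describe.
expected_usage = {'VCPU': 1, 'MEMORY_MB': 512, 'DISK_GB': 1}

# Source usage remains until the source-side cleanup tracked by bug 1712045,
# and the destination usage now exists because the conductor claimed it.
source_usages = expected_usage
dest_usages = expected_usage

# The server's allocation record therefore spans two resource providers.
allocations = {
    'SOURCE-NODE-UUID': {'resources': expected_usage},
    'DEST-NODE-UUID': {'resources': expected_usage},
}
assert len(allocations) == 2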
@@ -62,14 +62,19 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
def test_execute_with_destination(self):
with test.nested(
mock.patch.object(self.task, '_check_host_is_up'),
-mock.patch.object(self.task, '_check_requested_destination'),
+mock.patch.object(self.task, '_check_requested_destination',
+return_value=(mock.sentinel.source_node,
+mock.sentinel.dest_node)),
+mock.patch.object(self.task, '_claim_resources_on_destination'),
mock.patch.object(self.task.compute_rpcapi, 'live_migration'),
-) as (mock_check_up, mock_check_dest, mock_mig):
+) as (mock_check_up, mock_check_dest, mock_claim, mock_mig):
mock_mig.return_value = "bob"
self.assertEqual("bob", self.task.execute())
mock_check_up.assert_called_once_with(self.instance_host)
mock_check_dest.assert_called_once_with()
mock_claim.assert_called_once_with(
mock.sentinel.source_node, mock.sentinel.dest_node)
mock_mig.assert_called_once_with(
self.context,
host=self.instance_host,
@@ -161,7 +166,8 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
mock_get_info.return_value = hypervisor_details
mock_check.return_value = "migrate_data"
-self.task._check_requested_destination()
+self.assertEqual((hypervisor_details, hypervisor_details),
+self.task._check_requested_destination())
self.assertEqual("migrate_data", self.task.migrate_data)
mock_get_host.assert_called_once_with(self.context, self.destination)
mock_is_up.assert_called_once_with("service")
@@ -475,3 +481,99 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
side_effect=messaging.MessagingTimeout):
self.assertRaises(exception.MigrationPreCheckError,
self.task._call_livem_checks_on_host, {})
def test_claim_resources_on_destination_no_source_allocations(self):
"""Tests the negative scenario where the instance does not have
allocations in Placement on the source compute node so no claim is
attempted on the destination compute node.
"""
source_node = objects.ComputeNode(uuid=uuids.source_node)
dest_node = objects.ComputeNode(uuid=uuids.dest_node)
@mock.patch.object(self.task.scheduler_client.reportclient,
'get_allocations_for_instance', return_value={})
@mock.patch.object(self.task.scheduler_client.reportclient,
'claim_resources',
new_callable=mock.NonCallableMock)
def test(mock_claim, mock_get_allocs):
self.task._claim_resources_on_destination(source_node, dest_node)
mock_get_allocs.assert_called_once_with(
uuids.source_node, self.instance)
test()
def test_claim_resources_on_destination_claim_fails(self):
"""Tests the negative scenario where the resource allocation claim
on the destination compute node fails, resulting in an error.
"""
source_node = objects.ComputeNode(uuid=uuids.source_node)
dest_node = objects.ComputeNode(uuid=uuids.dest_node)
source_res_allocs = {
'VCPU': self.instance.vcpus,
'MEMORY_MB': self.instance.memory_mb,
# This would really include ephemeral and swap too but we're lazy.
'DISK_GB': self.instance.root_gb
}
dest_alloc_request = {
'allocations': [
{
'resource_provider': {
'uuid': uuids.dest_node
},
'resources': source_res_allocs
}
]
}
@mock.patch.object(self.task.scheduler_client.reportclient,
'get_allocations_for_instance',
return_value=source_res_allocs)
@mock.patch.object(self.task.scheduler_client.reportclient,
'claim_resources', return_value=False)
def test(mock_claim, mock_get_allocs):
self.assertRaises(exception.MigrationPreCheckError,
self.task._claim_resources_on_destination,
source_node, dest_node)
mock_get_allocs.assert_called_once_with(
uuids.source_node, self.instance)
mock_claim.assert_called_once_with(
self.instance.uuid, dest_alloc_request,
self.instance.project_id, self.instance.user_id)
test()
def test_claim_resources_on_destination(self):
"""Happy path test where everything is successful."""
source_node = objects.ComputeNode(uuid=uuids.source_node)
dest_node = objects.ComputeNode(uuid=uuids.dest_node)
source_res_allocs = {
'VCPU': self.instance.vcpus,
'MEMORY_MB': self.instance.memory_mb,
# This would really include ephemeral and swap too but we're lazy.
'DISK_GB': self.instance.root_gb
}
dest_alloc_request = {
'allocations': [
{
'resource_provider': {
'uuid': uuids.dest_node
},
'resources': source_res_allocs
}
]
}
@mock.patch.object(self.task.scheduler_client.reportclient,
'get_allocations_for_instance',
return_value=source_res_allocs)
@mock.patch.object(self.task.scheduler_client.reportclient,
'claim_resources', return_value=True)
def test(mock_claim, mock_get_allocs):
self.task._claim_resources_on_destination(source_node, dest_node)
mock_get_allocs.assert_called_once_with(
uuids.source_node, self.instance)
mock_claim.assert_called_once_with(
self.instance.uuid, dest_alloc_request,
self.instance.project_id, self.instance.user_id)
test()
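One note on the test pattern used above: the stacked mock.patch.object decorators hand mocks to the inner test() in bottom-up order, which is why mock_claim (from the decorator nearest the function) is the first argument. Below is a self-contained sketch of that ordering using a placeholder class, not the real SchedulerReportClient.

# Hedged, standalone illustration of mock decorator ordering.
import mock  # Pike-era nova used the external mock library


class Report(object):
    def get_allocations_for_instance(self):
        return 'real-get'

    def claim_resources(self):
        return 'real-claim'


report = Report()


@mock.patch.object(report, 'get_allocations_for_instance', return_value={})
@mock.patch.object(report, 'claim_resources', return_value=True)
def demo(mock_claim, mock_get_allocs):
    # The decorator closest to the function supplies the first argument, so
    # claim_resources arrives before get_allocations_for_instance, mirroring
    # the signatures of the nested test() functions above.
    assert report.claim_resources is mock_claim
    assert report.get_allocations_for_instance is mock_get_allocs


demo()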