Merge "Allocate resources on forced dest host during live migration"
@@ -3867,7 +3867,9 @@ class API(base.Base):
         # NOTE(sbauza): Force is a boolean by the new related API version
         if force is False and host_name:
             nodes = objects.ComputeNodeList.get_all_by_host(context, host_name)
-            # NOTE(sbauza): Unset the host to make sure we call the scheduler
+            # Unset the host to make sure we call the scheduler
+            # from the conductor LiveMigrationTask. Yes this is tightly-coupled
+            # to behavior in conductor and not great.
             host_name = None
             # FIXME(sbauza): Since only Ironic driver uses more than one
             # compute per service but doesn't support live migrations,
@@ -3883,6 +3885,8 @@ class API(base.Base):
                 host=target.host,
                 node=target.hypervisor_hostname
             )
+            # This is essentially a hint to the scheduler to only consider
+            # the specified host but still run it through the filters.
             request_spec.requested_destination = destination

         try:
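For readers without the full file, the non-forced path that these two API hunks annotate roughly reduces to the sketch below. It is a condensed approximation of the surrounding live_migrate() logic, not a verbatim excerpt; `context`, `force`, `host_name`, `request_spec` and `target` all come from that method, and the single-node assumption is flagged in the existing FIXME.

# Condensed sketch (not verbatim) of the non-forced host handling in the
# compute API live_migrate() method.
if force is False and host_name:
    # The caller named a host but did not force it, so let the scheduler
    # validate it: look up the compute node backing that host...
    nodes = objects.ComputeNodeList.get_all_by_host(context, host_name)
    target = nodes[0]  # assumption: one node per host (see the FIXME above)
    # ...unset host_name so conductor's LiveMigrationTask calls the scheduler...
    host_name = None
    # ...and record the named host as a "requested destination" hint, which
    # restricts the candidate hosts but still runs the scheduler filters.
    request_spec.requested_destination = objects.Destination(
        host=target.host,
        node=target.hypervisor_hostname)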
@@ -49,11 +49,29 @@ class LiveMigrationTask(base.TaskBase):
         self._check_host_is_up(self.source)

         if not self.destination:
+            # Either no host was specified in the API request and the user
+            # wants the scheduler to pick a destination host, or a host was
+            # specified but is not forcing it, so they want the scheduler
+            # filters to run on the specified host, like a scheduler hint.
             self.destination = self._find_destination()
             self.migration.dest_compute = self.destination
             self.migration.save()
         else:
-            self._check_requested_destination()
+            # This is the case that the user specified the 'force' flag when
+            # live migrating with a specific destination host so the scheduler
+            # is bypassed. There are still some minimal checks performed here
+            # though.
+            source_node, dest_node = self._check_requested_destination()
+            # Now that we're semi-confident in the force specified host, we
+            # need to copy the source compute node allocations in Placement
+            # to the destination compute node. Normally select_destinations()
+            # in the scheduler would do this for us, but when forcing the
+            # target host we don't call the scheduler.
+            # TODO(mriedem): In Queens, call select_destinations() with a
+            # skip_filters=True flag so the scheduler does the work of claiming
+            # resources on the destination in Placement but still bypass the
+            # scheduler filters, which honors the 'force' flag in the API.
+            self._claim_resources_on_destination(source_node, dest_node)

         # TODO(johngarbutt) need to move complexity out of compute manager
         # TODO(johngarbutt) disk_over_commit?
@@ -89,11 +107,83 @@ class LiveMigrationTask(base.TaskBase):
             raise exception.ComputeServiceUnavailable(host=host)

     def _check_requested_destination(self):
+        """Performs basic pre-live migration checks for the forced host.
+
+        :returns: tuple of (source ComputeNode, destination ComputeNode)
+        """
         self._check_destination_is_not_source()
         self._check_host_is_up(self.destination)
         self._check_destination_has_enough_memory()
-        self._check_compatible_with_source_hypervisor(self.destination)
+        source_node, dest_node = self._check_compatible_with_source_hypervisor(
+            self.destination)
         self._call_livem_checks_on_host(self.destination)
+        return source_node, dest_node
+
+    def _claim_resources_on_destination(self, source_node, dest_node):
+        """Copies allocations from source node to dest node in Placement
+
+        :param source_node: source ComputeNode where the instance currently
+                            lives
+        :param dest_node: destination ComputeNode where the instance is being
+                          forced to live migrate.
+        """
+        reportclient = self.scheduler_client.reportclient
+        # Get the current allocations for the source node and the instance.
+        source_node_allocations = reportclient.get_allocations_for_instance(
+            source_node.uuid, self.instance)
+        if source_node_allocations:
+            # Generate an allocation request for the destination node.
+            alloc_request = {
+                'allocations': [
+                    {
+                        'resource_provider': {
+                            'uuid': dest_node.uuid
+                        },
+                        'resources': source_node_allocations
+                    }
+                ]
+            }
+            # The claim_resources method will check for existing allocations
+            # for the instance and effectively "double up" the allocations for
+            # both the source and destination node. That's why when requesting
+            # allocations for resources on the destination node before we live
+            # migrate, we use the existing resource allocations from the
+            # source node.
+            if reportclient.claim_resources(
+                    self.instance.uuid, alloc_request,
+                    self.instance.project_id, self.instance.user_id):
+                LOG.debug('Instance allocations successfully created on '
+                          'destination node %(dest)s: %(alloc_request)s',
+                          {'dest': dest_node.uuid,
+                           'alloc_request': alloc_request},
+                          instance=self.instance)
+            else:
+                # We have to fail even though the user requested that we force
+                # the host. This is because we need Placement to have an
+                # accurate reflection of what's allocated on all nodes so the
+                # scheduler can make accurate decisions about which nodes have
+                # capacity for building an instance. We also cannot rely on the
+                # resource tracker in the compute service automatically healing
+                # the allocations since that code is going away in Queens.
+                reason = (_('Unable to migrate instance %(instance_uuid)s to '
+                            'host %(host)s. There is not enough capacity on '
+                            'the host for the instance.') %
+                          {'instance_uuid': self.instance.uuid,
+                           'host': self.destination})
+                raise exception.MigrationPreCheckError(reason=reason)
+        else:
+            # This shouldn't happen, but it could be a case where there are
+            # older (Ocata) computes still so the existing allocations are
+            # getting overwritten by the update_available_resource periodic
+            # task in the compute service.
+            # TODO(mriedem): Make this an error when the auto-heal
+            # compatibility code in the resource tracker is removed.
+            LOG.warning('No instance allocations found for source node '
+                        '%(source)s in Placement. Not creating allocations '
+                        'for destination node %(dest)s and assuming the '
+                        'compute service will heal the allocations.',
+                        {'source': source_node.uuid, 'dest': dest_node.uuid},
+                        instance=self.instance)

     def _check_destination_is_not_source(self):
         if self.destination == self.source:
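To make the "double up" behavior described in the comments concrete, here is an illustration of the instance's consumer allocations in Placement, keyed by resource provider, before and after the claim on the forced destination succeeds. The UUID placeholders and resource amounts are made up for the example; they are not output from a real deployment.

# Before the claim: the instance only consumes resources on the source node.
allocations_before = {
    '<source-node-rp-uuid>': {
        'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20},
    },
}

# After claim_resources(): the same amounts are held against both providers,
# so the scheduler sees the destination capacity as consumed for the whole
# duration of the live migration.
allocations_after = {
    '<source-node-rp-uuid>': {
        'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20},
    },
    '<dest-node-rp-uuid>': {
        'resources': {'VCPU': 2, 'MEMORY_MB': 2048, 'DISK_GB': 20},
    },
}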
@@ -101,6 +191,11 @@ class LiveMigrationTask(base.TaskBase):
             instance_id=self.instance.uuid, host=self.destination)

     def _check_destination_has_enough_memory(self):
+        # TODO(mriedem): This method can be removed when the forced host
+        # scenario is calling select_destinations() in the scheduler because
+        # Placement will be used to filter allocation candidates by MEMORY_MB.
+        # We likely can't remove it until the CachingScheduler is gone though
+        # since the CachingScheduler does not use Placement.
         compute = self._get_compute_info(self.destination)
         free_ram_mb = compute.free_ram_mb
         total_ram_mb = compute.memory_mb
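The memory check that the new TODO refers to continues to work off the compute node record rather than Placement. In rough outline it does something like the sketch below; this is a simplified approximation of the existing pre-check, with the error message shortened.

# Sketch of the existing host-memory pre-check (simplified; the real check
# mimics the RAMFilter by factoring in the node's RAM allocation ratio).
compute = self._get_compute_info(self.destination)
free_ram_mb = compute.free_ram_mb
total_ram_mb = compute.memory_mb
mem_inst = self.instance.memory_mb
ram_ratio = compute.ram_allocation_ratio
# Available RAM with overcommit, minus what is already consumed on the host.
avail = total_ram_mb * ram_ratio - (total_ram_mb - free_ram_mb)
if not mem_inst or avail <= mem_inst:
    reason = (_('Unable to migrate %(instance_uuid)s to %(dest)s: '
                'Lack of memory') %
              {'instance_uuid': self.instance.uuid,
               'dest': self.destination})
    raise exception.MigrationPreCheckError(reason=reason)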
@@ -139,6 +234,7 @@ class LiveMigrationTask(base.TaskBase):
         destination_version = destination_info.hypervisor_version
         if source_version > destination_version:
             raise exception.DestinationHypervisorTooOld()
+        return source_info, destination_info

     def _call_livem_checks_on_host(self, destination):
         try:
@@ -911,7 +911,7 @@ class SchedulerReportClient(object):
             self._delete_inventory(compute_node.uuid)

     @safe_connect
-    def _get_allocations_for_instance(self, rp_uuid, instance):
+    def get_allocations_for_instance(self, rp_uuid, instance):
         url = '/allocations/%s' % instance.uuid
         resp = self.get(url)
         if not resp:
@@ -925,8 +925,8 @@ class SchedulerReportClient(object):

     def _allocate_for_instance(self, rp_uuid, instance):
         my_allocations = _instance_to_allocations_dict(instance)
-        current_allocations = self._get_allocations_for_instance(rp_uuid,
-                                                                 instance)
+        current_allocations = self.get_allocations_for_instance(rp_uuid,
+                                                                instance)
         if current_allocations == my_allocations:
             allocstr = ','.join(['%s=%s' % (k, v)
                                  for k, v in my_allocations.items()])
@@ -943,9 +943,14 @@ class SchedulerReportClient(object):
             LOG.info(_LI('Submitted allocation for instance'),
                      instance=instance)

-    # NOTE(jaypipes): Currently, this method is ONLY used by the scheduler to
-    # allocate resources on the selected destination hosts. This method should
-    # not be called by the resource tracker; instead, the
+    # NOTE(jaypipes): Currently, this method is ONLY used in two places:
+    # 1. By the scheduler to allocate resources on the selected destination
+    #    hosts.
+    # 2. By the conductor LiveMigrationTask to allocate resources on a forced
+    #    destination host. This is a short-term fix for Pike which should be
+    #    replaced in Queens by conductor calling the scheduler in the force
+    #    host case.
+    # This method should not be called by the resource tracker; instead, the
     # _allocate_for_instance() method is used which does not perform any
     # checking that a move operation is in place.
     @safe_connect
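The _instance_to_allocations_dict() helper referenced above is what normally derives the resource amounts from an instance. Conceptually it does something like the following hedged approximation (not the exact helper); the unit tests added below deliberately skip the ephemeral and swap terms.

import math

# Approximation of how an instance's flavor maps to Placement resource
# amounts (the real helper lives in the scheduler report client module).
def instance_to_resources(instance):
    return {
        'VCPU': instance.vcpus,
        'MEMORY_MB': instance.memory_mb,
        # Root plus ephemeral disk, plus swap converted from MB to GB.
        'DISK_GB': (instance.root_gb + instance.ephemeral_gb +
                    int(math.ceil(instance.swap / 1024.0))),
    }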
@@ -20,6 +20,7 @@ from nova.tests import fixtures
 from nova.tests.functional.notification_sample_tests \
     import notification_sample_base
 from nova.tests.unit import fake_notifier
+from nova.virt import fake


 class TestInstanceNotificationSampleWithMultipleCompute(
@@ -33,6 +34,7 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         self.useFixture(self.neutron)
         self.cinder = fixtures.CinderFixture(self)
         self.useFixture(self.cinder)
+        self.useFixture(fixtures.AllServicesCurrent())

     def test_live_migration_actions(self):
         server = self._boot_a_server(
@@ -40,6 +42,8 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         self._wait_for_notification('instance.create.end')
         self._attach_volume_to_server(server, self.cinder.SWAP_OLD_VOL)
         # server will boot on host1
+        fake.set_nodes(['host2'])
+        self.addCleanup(fake.restore_nodes)
         self.useFixture(fixtures.ConfPatcher(host='host2'))
         self.compute2 = self.start_service('compute', host='host2')
@@ -1831,19 +1831,11 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         self.assertFlavorMatchesAllocation(self.flavor1, source_usages)

         dest_usages = self._get_provider_usages(dest_rp_uuid)
-        # NOTE(lajos katona): When bug 1712008 is solved the dest host
-        # expected to have resource allocation:
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)
-
-        # NOTE(lajos katona): the allocation on the destination host is empty,
-        # but when bug 1712008 is solved expected to be equal described by the
-        # flavor:
-        self.assertFlavorMatchesAllocation(
-            {'ram': 0, 'disk': 0, 'vcpus': 0}, dest_usages)
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_usages)

         allocations = self._get_allocations_by_server_uuid(server['id'])
-        # the server has just 1 allocation:
-        self.assertEqual(1, len(allocations))
+        # the server has an allocation on the source and dest nodes
+        self.assertEqual(2, len(allocations))

         # NOTE(lajos katona): When bug 1712045 is solved the server has
         # no allocation on the source:
@@ -1853,13 +1845,8 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(self.flavor1, source_allocation)

-        # NOTE(lajos katona): When bug 1712008 solved the server should have
-        # an allocation for the destination:
-        # dest_allocation = allocations[dest_rp_uuid]['resources']
-        # self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)
-
-        # Instead the server has no allocation for the destination host:
-        self.assertNotIn(dest_rp_uuid, allocations)
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor1, dest_allocation)

         self._delete_and_check_allocations(
             server, source_rp_uuid, dest_rp_uuid)
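For context on what the functional assertions above verify, assertFlavorMatchesAllocation is a small helper in the functional test base; its behavior is roughly the following approximation (written here for readability, not copied from the helper).

# Rough equivalent of the functional test helper: the usages or allocation
# reported for a resource provider should match the flavor the server uses.
def assertFlavorMatchesAllocation(self, flavor, allocation):
    self.assertEqual(flavor['vcpus'], allocation['VCPU'])
    self.assertEqual(flavor['ram'], allocation['MEMORY_MB'])
    self.assertEqual(flavor['disk'], allocation['DISK_GB'])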
@@ -62,14 +62,19 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
     def test_execute_with_destination(self):
         with test.nested(
             mock.patch.object(self.task, '_check_host_is_up'),
-            mock.patch.object(self.task, '_check_requested_destination'),
+            mock.patch.object(self.task, '_check_requested_destination',
+                              return_value=(mock.sentinel.source_node,
+                                            mock.sentinel.dest_node)),
+            mock.patch.object(self.task, '_claim_resources_on_destination'),
             mock.patch.object(self.task.compute_rpcapi, 'live_migration'),
-        ) as (mock_check_up, mock_check_dest, mock_mig):
+        ) as (mock_check_up, mock_check_dest, mock_claim, mock_mig):
             mock_mig.return_value = "bob"

             self.assertEqual("bob", self.task.execute())
             mock_check_up.assert_called_once_with(self.instance_host)
             mock_check_dest.assert_called_once_with()
+            mock_claim.assert_called_once_with(
+                mock.sentinel.source_node, mock.sentinel.dest_node)
             mock_mig.assert_called_once_with(
                 self.context,
                 host=self.instance_host,
@@ -161,7 +166,8 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
         mock_get_info.return_value = hypervisor_details
         mock_check.return_value = "migrate_data"

-        self.task._check_requested_destination()
+        self.assertEqual((hypervisor_details, hypervisor_details),
+                         self.task._check_requested_destination())
         self.assertEqual("migrate_data", self.task.migrate_data)
         mock_get_host.assert_called_once_with(self.context, self.destination)
         mock_is_up.assert_called_once_with("service")
@@ -475,3 +481,99 @@ class LiveMigrationTaskTestCase(test.NoDBTestCase):
                                side_effect=messaging.MessagingTimeout):
             self.assertRaises(exception.MigrationPreCheckError,
                               self.task._call_livem_checks_on_host, {})
+
+    def test_claim_resources_on_destination_no_source_allocations(self):
+        """Tests the negative scenario where the instance does not have
+        allocations in Placement on the source compute node so no claim is
+        attempted on the destination compute node.
+        """
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance', return_value={})
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources',
+                           new_callable=mock.NonCallableMock)
+        def test(mock_claim, mock_get_allocs):
+            self.task._claim_resources_on_destination(source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+
+        test()
+
+    def test_claim_resources_on_destination_claim_fails(self):
+        """Tests the negative scenario where the resource allocation claim
+        on the destination compute node fails, resulting in an error.
+        """
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+        source_res_allocs = {
+            'VCPU': self.instance.vcpus,
+            'MEMORY_MB': self.instance.memory_mb,
+            # This would really include ephemeral and swap too but we're lazy.
+            'DISK_GB': self.instance.root_gb
+        }
+        dest_alloc_request = {
+            'allocations': [
+                {
+                    'resource_provider': {
+                        'uuid': uuids.dest_node
+                    },
+                    'resources': source_res_allocs
+                }
+            ]
+        }
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance',
+                           return_value=source_res_allocs)
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources', return_value=False)
+        def test(mock_claim, mock_get_allocs):
+            self.assertRaises(exception.MigrationPreCheckError,
+                              self.task._claim_resources_on_destination,
+                              source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+            mock_claim.assert_called_once_with(
+                self.instance.uuid, dest_alloc_request,
+                self.instance.project_id, self.instance.user_id)
+
+        test()
+
+    def test_claim_resources_on_destination(self):
+        """Happy path test where everything is successful."""
+        source_node = objects.ComputeNode(uuid=uuids.source_node)
+        dest_node = objects.ComputeNode(uuid=uuids.dest_node)
+        source_res_allocs = {
+            'VCPU': self.instance.vcpus,
+            'MEMORY_MB': self.instance.memory_mb,
+            # This would really include ephemeral and swap too but we're lazy.
+            'DISK_GB': self.instance.root_gb
+        }
+        dest_alloc_request = {
+            'allocations': [
+                {
+                    'resource_provider': {
+                        'uuid': uuids.dest_node
+                    },
+                    'resources': source_res_allocs
+                }
+            ]
+        }
+
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'get_allocations_for_instance',
+                           return_value=source_res_allocs)
+        @mock.patch.object(self.task.scheduler_client.reportclient,
+                           'claim_resources', return_value=True)
+        def test(mock_claim, mock_get_allocs):
+            self.task._claim_resources_on_destination(source_node, dest_node)
+            mock_get_allocs.assert_called_once_with(
+                uuids.source_node, self.instance)
+            mock_claim.assert_called_once_with(
+                self.instance.uuid, dest_alloc_request,
+                self.instance.project_id, self.instance.user_id)
+
+        test()