Merge "Use 2nd RPC server in compute operations"
This commit is contained in:
+41
@@ -138,6 +138,44 @@
|
||||
block_migrate_cinder_iscsi: true
|
||||
post-run: playbooks/nova-live-migration/post-run.yaml
|
||||
|
||||
- job:
|
||||
name: nova-graceful-shutdown
|
||||
parent: devstack-multinode
|
||||
description: |
|
||||
Run Nova graceful shutdown tests.
|
||||
run: playbooks/nova-graceful-shutdown/run.yaml
|
||||
timeout: 10800
|
||||
vars:
|
||||
devstack_services:
|
||||
neutron-trunk: true
|
||||
openstack-cli-server: true
|
||||
s-account: false
|
||||
s-container: false
|
||||
s-object: false
|
||||
s-proxy: false
|
||||
c-bak: false
|
||||
tempest_test_regex: ''
|
||||
devstack_localrc:
|
||||
<<: *uec_image_vars
|
||||
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
|
||||
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
|
||||
LIVE_MIGRATION_AVAILABLE: true
|
||||
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
|
||||
group-vars:
|
||||
subnode:
|
||||
devstack_services:
|
||||
openstack-cli-server: true
|
||||
s-account: false
|
||||
s-container: false
|
||||
s-object: false
|
||||
s-proxy: false
|
||||
c-bak: false
|
||||
devstack_localrc:
|
||||
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
|
||||
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
|
||||
LIVE_MIGRATION_AVAILABLE: true
|
||||
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
|
||||
|
||||
- job:
|
||||
name: nova-alt-configurations
|
||||
parent: tempest-multinode-full-py3
|
||||
@@ -829,6 +867,9 @@
|
||||
- ^nova/network/.*$
|
||||
- nova/virt/libvirt/vif.py
|
||||
- nova-live-migration
|
||||
# NOTE(gmaan): We will be running the graceful shutdown testing in
|
||||
# check pipeline only and not required to test in gate as such.
|
||||
- nova-graceful-shutdown
|
||||
- nova-live-migration-ceph
|
||||
- nova-lvm
|
||||
- nova-multi-cell
|
||||
|
||||
@@ -642,7 +642,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
|
||||
class ComputeManager(manager.Manager):
|
||||
"""Manages the running instances from creation to destruction."""
|
||||
|
||||
target = messaging.Target(version='6.4')
|
||||
target = messaging.Target(version='6.5')
|
||||
|
||||
def __init__(self, compute_driver=None, *args, **kwargs):
|
||||
"""Load configuration options and connect to the hypervisor."""
|
||||
|
||||
+92
-13
@@ -414,6 +414,7 @@ class ComputeAPI(object):
|
||||
* 6.2 - Add target_state parameter to rebuild_instance()
|
||||
* 6.3 - Add delete_attachment parameter to remove_volume_connection
|
||||
* 6.4 - Add allow_share() and deny_share()
|
||||
* 6.5 - Add 2nd RPC server with new topic 'compute-alt'
|
||||
'''
|
||||
|
||||
VERSION_ALIASES = {
|
||||
@@ -572,6 +573,33 @@ class ComputeAPI(object):
|
||||
serializer=serializer,
|
||||
call_monitor_timeout=cmt)
|
||||
|
||||
def prepare_for_alt_rpcserver(
        self, client, server, version, **kwargs):
    """Prepare an RPC call context targeting the alternate RPC server.

    The compute service runs a 2nd RPC server on RPC_TOPIC_ALT which
    stays active during graceful shutdown. This helper prepares the
    given client so the message is routed to that server, falling back
    to the original topic when the remote compute is too old to have
    the 2nd server.

    :param client: RPC client obtained from the client router.
    :param server: Target compute host.
    :param version: RPC API version to use for the call.
    :param kwargs: Extra keyword arguments passed through to
        client.prepare() (e.g. timeout, call_monitor_timeout).
    :returns: The prepared RPC call context.
    """
    # NOTE(gmaan): By overriding 'topic' in the prepare() call, we make
    # this RPC client send the message to a different RPC server, the
    # one listening on RPC_TOPIC_ALT (the RPC server which stays active
    # during compute service graceful shutdown).
    topic = RPC_TOPIC_ALT

    # NOTE(gmann): An old compute will not have the new 2nd RPC server,
    # so we need to handle it with RPC versioning. For an old compute,
    # fall back to sending the message to the original RPC server,
    # which listens on RPC_TOPIC.
    if not client.can_send_version('6.5'):
        topic = RPC_TOPIC
        # NOTE: log messages are not translated (oslo.i18n guidelines);
        # formatting is deferred to the logger via lazy % args.
        LOG.debug('Fallback to send the message to original topic: %s '
                  'as RPC version is too old.', topic)
    else:
        LOG.debug('RPC: Sending the message to topic: %s', topic)

    params = {
        'server': server,
        'version': version,
        'topic': topic}
    params.update(kwargs)
    return client.prepare(**params)
|
||||
|
||||
def add_fixed_ip_to_instance(self, ctxt, instance, network_id):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
@@ -612,6 +640,12 @@ class ComputeAPI(object):
|
||||
kwargs.pop('migration')
|
||||
kwargs.pop('limits')
|
||||
version = '5.0'
|
||||
# NOTE(gmaan): Most of the live migration RPC methods use the
|
||||
# 'compute-alt' topic, but this RPC method should use the 'compute'
|
||||
# topic. If a shutdown is initiated on the destination compute, the
|
||||
# RPC server for the 'compute' topic will be stopped. If a live
|
||||
# migration request arrives after that, the destination compute node
|
||||
# should not take it.
|
||||
cctxt = client.prepare(server=destination, version=version,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout,
|
||||
timeout=CONF.long_rpc_timeout)
|
||||
@@ -621,6 +655,10 @@ class ComputeAPI(object):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
source = _compute_host(None, instance)
|
||||
# NOTE(gmaan): Like check_can_live_migrate_destination, this RPC
|
||||
# method should use topic 'compute'. If a shutdown is initiated
|
||||
# on the source compute and, after that, a live migration request
|
||||
# arrives, the source compute should not take it.
|
||||
cctxt = client.prepare(server=source, version=version)
|
||||
return cctxt.call(ctxt, 'check_can_live_migrate_source',
|
||||
instance=instance,
|
||||
@@ -867,8 +905,14 @@ class ComputeAPI(object):
|
||||
|
||||
def validate_console_port(self, ctxt, instance, port, console_type):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
server=_compute_host(None, instance), version=version)
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. This is
|
||||
# called when the console is already requested. If shutdown is
|
||||
# requested after that, compute should finish the port validation
|
||||
# so that users can get their requested console.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=_compute_host(None, instance), version=version)
|
||||
return cctxt.call(ctxt, 'validate_console_port',
|
||||
instance=instance, port=port,
|
||||
console_type=console_type)
|
||||
@@ -904,7 +948,13 @@ class ComputeAPI(object):
|
||||
migration, migrate_data=None):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. At this
|
||||
# stage, both the source and destination compute have already confirmed
|
||||
# that live migration can proceed. If the shutdown is initiated after
|
||||
# that, the compute should finish the live migration using the
|
||||
# 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.cast(ctxt, 'live_migration', instance=instance,
|
||||
dest=dest, block_migration=block_migration,
|
||||
migrate_data=migrate_data, migration=migration)
|
||||
@@ -933,7 +983,12 @@ class ComputeAPI(object):
|
||||
def post_live_migration_at_destination(self, ctxt, instance,
|
||||
block_migration, host):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. If the
|
||||
# shutdown is initiated during live migration, the compute should
|
||||
# finish the live migration using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=host, version=version,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout,
|
||||
timeout=CONF.long_rpc_timeout)
|
||||
@@ -951,9 +1006,14 @@ class ComputeAPI(object):
|
||||
version = '5.0'
|
||||
# We just need to honor the argument in the v5.0 RPC API method
|
||||
msg_args['block_migration'] = None
|
||||
cctxt = client.prepare(server=host, version=version,
|
||||
timeout=CONF.long_rpc_timeout,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. If the
|
||||
# shutdown is initiated during live migration, the compute should
|
||||
# finish the live migration using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=host, version=version,
|
||||
timeout=CONF.long_rpc_timeout,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout)
|
||||
return cctxt.call(ctxt, 'pre_live_migration',
|
||||
instance=instance,
|
||||
disk=disk, migrate_data=migrate_data,
|
||||
@@ -1161,8 +1221,12 @@ class ComputeAPI(object):
|
||||
if not client.can_send_version(version):
|
||||
kwargs.pop('delete_attachment')
|
||||
version = self._ver(ctxt, '5.0')
|
||||
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it
|
||||
# using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
return cctxt.call(ctxt, 'remove_volume_connection', **kwargs)
|
||||
|
||||
def rescue_instance(self, ctxt, instance, rescue_password,
|
||||
@@ -1262,7 +1326,12 @@ class ComputeAPI(object):
|
||||
migrate_data):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it using
|
||||
# the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.cast(ctxt, 'rollback_live_migration_at_destination',
|
||||
instance=instance, destroy_disks=destroy_disks,
|
||||
migrate_data=migrate_data)
|
||||
@@ -1286,7 +1355,12 @@ class ComputeAPI(object):
|
||||
"""
|
||||
version = self._ver(ctxt, '5.3')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it using
|
||||
# the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.call(ctxt, 'drop_move_claim_at_destination', instance=instance)
|
||||
|
||||
def set_admin_password(self, ctxt, instance, new_pass):
|
||||
@@ -1523,8 +1597,13 @@ class ComputeAPI(object):
|
||||
def external_instance_event(self, ctxt, instances, events, host=None):
|
||||
instance = instances[0]
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
server=_compute_host(host, instance),
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): This is initiated by the external services (for
|
||||
# example, neutron send event for network change) and let's not block
|
||||
# them during shutdown. Make this RPC request to 'compute-alt' topic.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
_compute_host(host, instance),
|
||||
version=version)
|
||||
cctxt.cast(ctxt, 'external_instance_event', instances=instances,
|
||||
events=events)
|
||||
|
||||
@@ -37,7 +37,7 @@ __all__ = [
|
||||
|
||||
|
||||
# NOTE(danms): This is the global service version counter
|
||||
SERVICE_VERSION = 70
|
||||
SERVICE_VERSION = 71
|
||||
|
||||
|
||||
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
|
||||
@@ -249,6 +249,9 @@ SERVICE_VERSION_HISTORY = (
|
||||
# Version 70: Compute RPC v6.4:
|
||||
# Compute manager supports USB controller model traits
|
||||
{'compute_rpc': '6.4'},
|
||||
# Version 71: Compute RPC v6.5:
|
||||
# Add 2nd RPC server for compute service
|
||||
{'compute_rpc': '6.5'},
|
||||
)
|
||||
|
||||
# This is the version after which we can rely on having a persistent
|
||||
|
||||
@@ -461,6 +461,12 @@ class TestCase(base.BaseTestCase):
|
||||
if host is not None:
|
||||
# Make sure that CONF.host is relevant to the right hostname
|
||||
self.useFixture(nova_fixtures.ConfPatcher(host=host))
|
||||
# By default, service creates a RPC server for auto populated
|
||||
# 'topic' from service binary name. For compute service, we need
|
||||
# to create the 2nd RPC server which will be done by pass the
|
||||
# 'topic_alt' explicitly.
|
||||
if name == 'compute' and 'topic_alt' not in kwargs:
|
||||
kwargs['topic_alt'] = compute_rpcapi.RPC_TOPIC_ALT
|
||||
|
||||
if name == 'compute' and self.USES_DB:
|
||||
# NOTE(danms): We need to create the HostMapping first, because
|
||||
|
||||
@@ -131,10 +131,13 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
prepare_extra_kwargs = {}
|
||||
cm_timeout = kwargs.pop('call_monitor_timeout', None)
|
||||
timeout = kwargs.pop('timeout', None)
|
||||
topic_alt = kwargs.pop('topic_alt', None)
|
||||
if cm_timeout:
|
||||
prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout
|
||||
if timeout:
|
||||
prepare_extra_kwargs['timeout'] = timeout
|
||||
if topic_alt:
|
||||
prepare_extra_kwargs['topic'] = topic_alt
|
||||
|
||||
# NOTE(sbauza): If expected args are provided, we need to use them
|
||||
# for the expected kwargs and just add the needed _return_value that
|
||||
@@ -368,7 +371,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
def test_validate_console_port(self):
|
||||
self._test_compute_api('validate_console_port', 'call',
|
||||
instance=self.fake_instance_obj, port="5900",
|
||||
console_type="novnc", version='6.0')
|
||||
console_type="novnc", version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_host_maintenance_mode(self):
|
||||
self._test_compute_api('host_maintenance_mode', 'call',
|
||||
@@ -387,7 +391,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj, dest='dest',
|
||||
block_migration='blockity_block', host='tsoh',
|
||||
migration='migration',
|
||||
migrate_data={}, version='6.0')
|
||||
migrate_data={}, version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_live_migration_force_complete(self):
|
||||
migration = migration_obj.Migration()
|
||||
@@ -420,7 +425,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
self._test_compute_api('post_live_migration_at_destination', 'call',
|
||||
instance=self.fake_instance_obj,
|
||||
block_migration='block_migration', host='host', version='6.0',
|
||||
timeout=1234, call_monitor_timeout=60)
|
||||
timeout=1234, call_monitor_timeout=60,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_pause_instance(self):
|
||||
self._test_compute_api('pause_instance', 'cast',
|
||||
@@ -448,7 +454,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
expected_args, instance=self.fake_instance_obj,
|
||||
block_migration='block_migration', disk='disk', host='host',
|
||||
migrate_data=None, version='6.0',
|
||||
call_monitor_timeout=60, timeout=1234)
|
||||
call_monitor_timeout=60, timeout=1234,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_supports_numa_live_migration(self):
|
||||
mock_client = mock.MagicMock()
|
||||
@@ -506,10 +513,19 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj, block_migration=False,
|
||||
disk_over_commit=False)
|
||||
|
||||
def test_rollback_live_migration_at_destination(self):
|
||||
self._test_compute_api('rollback_live_migration_at_destination',
|
||||
'cast', instance=self.fake_instance_obj,
|
||||
host='host', destroy_disks=True,
|
||||
migrate_data=None, version='6.0',
|
||||
_return_value=None,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_drop_move_claim_at_destination(self):
|
||||
self._test_compute_api('drop_move_claim_at_destination', 'call',
|
||||
instance=self.fake_instance_obj, host='host',
|
||||
version='6.0', _return_value=None)
|
||||
version='6.0', _return_value=None,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_prep_resize(self):
|
||||
self._test_compute_api('prep_resize', 'cast',
|
||||
@@ -965,7 +981,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
def test_remove_volume_connection(self):
|
||||
self._test_compute_api('remove_volume_connection', 'call',
|
||||
instance=self.fake_instance_obj, volume_id='id', host='host',
|
||||
delete_attachment=True, version='6.3')
|
||||
delete_attachment=True, version='6.3',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_rescue_instance(self):
|
||||
self._test_compute_api('rescue_instance', 'cast',
|
||||
@@ -1218,7 +1235,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
self._test_compute_api('external_instance_event', 'cast',
|
||||
instances=[self.fake_instance_obj],
|
||||
events=['event'],
|
||||
version='6.0')
|
||||
version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_build_and_run_instance(self):
|
||||
# With rpcapi 5.11, when a list of accel_uuids is passed as a param,
|
||||
@@ -1354,3 +1372,45 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj,
|
||||
share_mapping=self.get_fake_share_mapping(),
|
||||
version='6.4')
|
||||
|
||||
def test_prepare_for_alt_rpcserver_select_topic_alt(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = True
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client, server='fake_host', version='6.5')
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
topic=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_prepare_for_alt_rpcserver_fallback_topic_for_old_compute(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = False
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client, server='fake_host', version='6.0')
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.0',
|
||||
topic=compute_rpcapi.RPC_TOPIC)
|
||||
|
||||
def test_prepare_for_alt_rpcserver_with_extra_kwargs(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = True
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client,
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
call_monitor_timeout=60,
|
||||
timeout=120)
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
topic=compute_rpcapi.RPC_TOPIC_ALT,
|
||||
call_monitor_timeout=60,
|
||||
timeout=120)
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
- hosts: all
|
||||
roles:
|
||||
- orchestrate-devstack
|
||||
- hosts: controller
|
||||
roles:
|
||||
- run-graceful-shutdown-tests
|
||||
@@ -0,0 +1,55 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Nova services now support graceful shutdown on ``SIGTERM``. When a service
|
||||
receives ``SIGTERM``, it will stop accepting new RPC requests and wait for
|
||||
in-progress tasks to reach a safe termination point.
|
||||
|
||||
The compute service creates a second RPC server on a ``compute-alt`` topic
|
||||
which remains active during graceful shutdown, allowing the compute service to
|
||||
finish the in-progress tasks.
|
||||
|
||||
Currently, the below operations use the second RPC server:
|
||||
|
||||
* Live migration
|
||||
* Server external Event
|
||||
* Get Console output
|
||||
|
||||
Nova added two new configuration options which will control this behavior:
|
||||
|
||||
* ``[DEFAULT]/graceful_shutdown_timeout`` - The overall time the service
|
||||
waits before forcefully exiting. This defaults to 180 seconds for each
|
||||
Nova services.
|
||||
* ``[DEFAULT]/manager_shutdown_timeout`` - The time the service manager
|
||||
waits for in-progress tasks to complete during graceful shutdown. This
|
||||
defaults to 160 seconds for each service manager. This must be less
|
||||
than ``graceful_shutdown_timeout``.
|
||||
|
||||
You can increase these timeouts based on the traffic and how long the
|
||||
long-running (e.g. live migrations) tasks take in your deployment.
|
||||
|
||||
We plan to improve the graceful shutdown in future releases by task
|
||||
tracking and transitioning resources to a recoverable state. Until then,
|
||||
this feature is experimental.
|
||||
upgrade:
|
||||
- |
|
||||
The default value of ``[DEFAULT]/graceful_shutdown_timeout`` has been
|
||||
changed from 60 to 180 seconds for all Nova services. This means that
|
||||
when a Nova service receives ``SIGTERM``, it will now wait up to 180
|
||||
seconds for a graceful shutdown before being forcefully terminated.
|
||||
Operators using external system (e.g. k8s, systemd) to manage the
|
||||
Nova services should ensure that their service stop timeouts are set
|
||||
to at least ``graceful_shutdown_timeout`` to avoid forcefully killing
|
||||
the service before Nova finishes its graceful shutdown. For example, the
|
||||
systemd ``TimeoutStopSec`` should be set to at least 180 seconds (or
|
||||
greater) for Nova services.
|
||||
- |
|
||||
A new configuration option ``[DEFAULT]/manager_shutdown_timeout`` has been
|
||||
added with a default value of 160 seconds. This controls how long the
|
||||
service manager waits for in-progress tasks to finish during graceful
|
||||
shutdown. Operators may want to tune this value based on how long their
|
||||
typical long-running operations (e.g. live migrations) take to complete.
|
||||
- |
|
||||
The compute service now creates a second RPC server on the ``compute-alt``
|
||||
topic. This means each compute worker will create an additional RabbitMQ
|
||||
queue.
|
||||
@@ -0,0 +1 @@
|
||||
Run Nova graceful shutdown tests and verify the operations.
|
||||
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Clean up the test servers created by the graceful shutdown job.
# For every server name given on the command line: confirm a pending
# resize if there is one, then delete the server once it has settled
# in ACTIVE or ERROR state.
source /opt/stack/devstack/openrc admin
set -x
set -e

# Print the server's status, or NOT_FOUND when it does not exist.
server_status() {
    openstack server show "$1" -f value -c status 2>/dev/null || echo "NOT_FOUND"
}

confirm_resize() {
    local server=$1

    echo "Confirming resize on ${server}"
    openstack server resize confirm "${server}"

    # Poll (up to 10 polls, 5s apart) until the server settles.
    local tries=0
    local status
    status=$(server_status "${server}")
    until [ "${status}" == "ACTIVE" ] || [ "${status}" == "ERROR" ]; do
        sleep 5
        tries=$((tries+1))
        if [ ${tries} -eq 10 ]; then
            echo "Timed out waiting for ${server} to be ACTIVE or Error after confirm resize"
            break
        fi
        status=$(server_status "${server}")
    done
}

cleanup_server() {
    local server=$1
    local status

    status=$(server_status "${server}")

    # A server in VERIFY_RESIZE must have the resize confirmed before
    # it can be deleted cleanly.
    if [ "${status}" == "VERIFY_RESIZE" ]; then
        confirm_resize "${server}"
    fi

    status=$(server_status "${server}")
    case "${status}" in
        ACTIVE|ERROR)
            echo "Deleting ${server} (status: ${status})"
            openstack server delete --wait "${server}"
            ;;
        *)
            echo "Skipping ${server} deletion (status: ${status})"
            ;;
    esac
}

for name in "$@"; do
    cleanup_server "${name}"
done
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Ensure the devstack compute service (devstack@n-cpu) on the given
# host is in the expected systemd state, starting it first if needed.
#   $1 - compute host (reached via ssh)
#   $2 - expected systemd state (default: active)
set -x
set -e

COMPUTE_HOST=$1
EXPECTED_STATE=${2:-active}

# Echo the systemd state of devstack@n-cpu on the remote host.
# 'systemctl is-active' exits non-zero for inactive units, so swallow
# the failure to keep 'set -e' from aborting the script.
get_service_status() {
    ssh "$1" systemctl is-active devstack@n-cpu || true
}

# Poll every 5s until the service on $1 reports state $2; give up and
# exit 5 after $3 polls (default 30).
wait_for_service_state() {
    local host=$1
    local wanted=$2
    local max_polls=${3:-30}
    local polls=0
    local current

    current=$(get_service_status "${host}")
    until [ "${current}" == "${wanted}" ]; do
        sleep 5
        polls=$((polls+1))
        if [ ${polls} -eq ${max_polls} ]; then
            echo "Timed out waiting for compute service on ${host} to be ${wanted} (current: ${current})"
            exit 5
        fi
        current=$(get_service_status "${host}")
    done
    echo "Compute service on ${host} is ${wanted}"
}

# When we expect the service to be running but it is not, start it.
if [ "${EXPECTED_STATE}" == "active" ] && [ "$(get_service_status "${COMPUTE_HOST}")" != "active" ]; then
    ssh "${COMPUTE_HOST}" sudo systemctl start devstack@n-cpu
fi

wait_for_service_state "${COMPUTE_HOST}" "${EXPECTED_STATE}"
|
||||
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Create a test server on the subnode and start a live migration of it
# to the controller, returning once the migration is observed to be in
# progress (so the caller can SIGTERM the source compute mid-migration).
#
#   $1 - name of the server to create and live migrate
# Requires SUBNODE_HOSTNAME and CONTROLLER_HOSTNAME in the environment.
#
# Exit codes:
#   0 - migration is in progress
#   2 - migration already completed, or timed out waiting for it to start
source /opt/stack/devstack/openrc admin
set -x
set -e

# Maximum number of polls while waiting for the migration to start.
timeout=196

server_lm=$1

# Pick the first available image/flavor/non-shared network.
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')

echo "Creating test server on subnode for graceful shutdown live migration test"
# Microversion 2.74 allows requesting a specific host on server create.
openstack --os-compute-api-version 2.74 server create --image "${image_id}" \
    --flavor "${flavor_id}" --nic net-id="${network_id}" \
    --host "${SUBNODE_HOSTNAME}" --wait "${server_lm}"

echo "Starting live migration of ${server_lm} to ${CONTROLLER_HOSTNAME}"
openstack server migrate --live-migration \
    --host "${CONTROLLER_HOSTNAME}" "${server_lm}"

# Wait for the migration to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
    # The '|| true' guards keep 'set -e' from aborting the script on a
    # transient CLI failure (the migration list is already guarded by
    # the pipeline's exit status being head's).
    migration_status=$(openstack server migration list "${server_lm}" \
        -f value -c Status 2>/dev/null | head -1)
    server_status=$(openstack server show "${server_lm}" \
        -f value -c status 2>/dev/null || true)
    task_state=$(openstack server show "${server_lm}" \
        -f value -c OS-EXT-STS:task_state 2>/dev/null || true)
    if [ "${migration_status}" == "preparing" ] || \
       [ "${migration_status}" == "running" ] || \
       [ "${task_state}" == "migrating" ]; then
        echo "Live migration is in progress (status: ${migration_status}, task_state: ${task_state})"
        break
    elif [ "${migration_status}" == "completed" ] || \
         { [ "${server_status}" == "ACTIVE" ] && \
           { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; }; then
        echo "Live migration has already completed"
        exit 2
    fi

    # Pause between polls instead of busy-looping the API; combined
    # with the poll count above this bounds the wait to roughly
    # ${timeout} seconds plus CLI latency.
    sleep 1
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for migrations to start"
        exit 2
    fi
done
|
||||
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# Verify that the live migration of the given server completes and that
# the server lands on the controller.
#
#   $1 - name of the server being live migrated
# Requires CONTROLLER_HOSTNAME in the environment.
#
# Exit codes:
#   0 - migration completed and server is on the controller
#   3 - server went to ERROR during migration
#   4 - server ended up on an unexpected host
#   5 - timed out waiting for the migration to complete
source /opt/stack/devstack/openrc admin
set -x
set -e

server=$1

# Wait for the server to finish live migration and become ACTIVE with
# no task_state, which indicates the migration has completed.
timeout=360
count=0
migration_start=$(date +%s)
while true; do
    status=$(openstack server show "${server}" -f value -c status)
    task_state=$(openstack server show "${server}" -f value -c OS-EXT-STS:task_state)

    if [ "${status}" == "ACTIVE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then
        migration_end=$(date +%s)
        migration_duration=$((migration_end - migration_start))
        echo "Migration is completed in ${migration_duration} seconds."
        break
    fi

    if [ "${status}" == "ERROR" ]; then
        echo "Server went to ERROR status during live migration"
        exit 3
    fi

    sleep 5
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for live migration to complete"
        exit 5
    fi
done

# Make sure the server moved to the controller. Quote the right-hand
# side so it is compared literally rather than as a glob pattern.
host=$(openstack server show "${server}" -f value -c OS-EXT-SRV-ATTR:host)
if [[ ${host} != "${CONTROLLER_HOSTNAME}" ]]; then
    echo "Unexpected host ${host} for server after live migration during graceful shutdown."
    exit 4
fi

echo "Live migration during graceful shutdown completed successfully"
echo "Server ${server} is ACTIVE on ${host}"
|
||||
@@ -0,0 +1,56 @@
|
||||
- name: Graceful shutdown source compute live migration
|
||||
block:
|
||||
- name: Start live migrations of test servers
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_live_migration.sh server-lm1"
|
||||
environment:
|
||||
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
|
||||
register: start_live_migrations_result
|
||||
failed_when: start_live_migrations_result.rc not in [0, 2]
|
||||
|
||||
- name: Set fact if migrations completed or timed out before SIGTERM to source compute
|
||||
set_fact:
|
||||
live_migrations_completed_or_timeout: "{{ start_live_migrations_result.rc == 2 }}"
|
||||
|
||||
- name: Run graceful shutdown tests
|
||||
when: not live_migrations_completed_or_timeout
|
||||
block:
|
||||
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
|
||||
delegate_to: compute1
|
||||
become: true
|
||||
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
|
||||
|
||||
- name: Verify live migration is completed during graceful shutdown
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "verify_live_migration.sh server-lm1"
|
||||
environment:
|
||||
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
|
||||
|
||||
# Sleep for 180 sec: default graceful_shutdown_timeout
|
||||
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
|
||||
pause:
|
||||
seconds: 180
|
||||
|
||||
- name: Verify compute service is stopped after graceful shutdown
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
|
||||
|
||||
- name: Start and verify subnode compute service is running
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
|
||||
- name: Cleanup test servers
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "cleanup_test_servers.sh server-lm1"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Fail if any test is skipped
|
||||
fail:
|
||||
msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
|
||||
when: live_migrations_completed_or_timeout
|
||||
Reference in New Issue
Block a user