Merge "Use 2nd RPC server in compute operations"

Zuul
2026-02-26 17:44:59 +00:00
committed by Gerrit Code Review
14 changed files with 510 additions and 22 deletions
+41
@@ -138,6 +138,44 @@
block_migrate_cinder_iscsi: true
post-run: playbooks/nova-live-migration/post-run.yaml
- job:
name: nova-graceful-shutdown
parent: devstack-multinode
description: |
Run Nova graceful shutdown tests.
run: playbooks/nova-graceful-shutdown/run.yaml
timeout: 10800
vars:
devstack_services:
neutron-trunk: true
openstack-cli-server: true
s-account: false
s-container: false
s-object: false
s-proxy: false
c-bak: false
tempest_test_regex: ''
devstack_localrc:
<<: *uec_image_vars
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
LIVE_MIGRATION_AVAILABLE: true
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
group-vars:
subnode:
devstack_services:
openstack-cli-server: true
s-account: false
s-container: false
s-object: false
s-proxy: false
c-bak: false
devstack_localrc:
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
LIVE_MIGRATION_AVAILABLE: true
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
- job:
name: nova-alt-configurations
parent: tempest-multinode-full-py3
@@ -829,6 +867,9 @@
- ^nova/network/.*$
- nova/virt/libvirt/vif.py
- nova-live-migration
# NOTE(gmaan): We will run the graceful shutdown testing in the check
# pipeline only; it is not required to run in the gate.
- nova-graceful-shutdown
- nova-live-migration-ceph
- nova-lvm
- nova-multi-cell
+1 -1
@@ -642,7 +642,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
target = messaging.Target(version='6.4')
target = messaging.Target(version='6.5')
def __init__(self, compute_driver=None, *args, **kwargs):
"""Load configuration options and connect to the hypervisor."""
+92 -13
@@ -414,6 +414,7 @@ class ComputeAPI(object):
* 6.2 - Add target_state parameter to rebuild_instance()
* 6.3 - Add delete_attachment parameter to remove_volume_connection
* 6.4 - Add allow_share() and deny_share()
* 6.5 - Add 2nd RPC server with new topic 'compute-alt'
'''
VERSION_ALIASES = {
@@ -572,6 +573,33 @@ class ComputeAPI(object):
serializer=serializer,
call_monitor_timeout=cmt)
def prepare_for_alt_rpcserver(
self, client, server, version, **kwargs):
# NOTE(gmaan): By overriding the 'topic' in the prepare() method, we make
# this RPC client send the message to a different RPC server, the one
# that listens on RPC_TOPIC_ALT (the RPC server that stays active during
# compute service graceful shutdown).
topic = RPC_TOPIC_ALT
msg = _("RPC: Sending the message to topic: %s") % topic
# NOTE(gmann): An old compute will not have the new 2nd RPC server,
# so we need to handle it with RPC versioning. For an old compute, we
# fall back to sending the message to the original RPC server, which
# listens on RPC_TOPIC.
if not client.can_send_version('6.5'):
topic = RPC_TOPIC
msg = _("Fallback to send the message to original topic: %s as "
"RPC version is too old.") % topic
LOG.debug(msg)
params = {
'server': server,
'version': version,
'topic': topic}
params.update(kwargs)
return client.prepare(**params)
def add_fixed_ip_to_instance(self, ctxt, instance, network_id):
version = self._ver(ctxt, '5.0')
cctxt = self.router.client(ctxt).prepare(
@@ -612,6 +640,12 @@ class ComputeAPI(object):
kwargs.pop('migration')
kwargs.pop('limits')
version = '5.0'
# NOTE(gmaan): Most of the live migration RPC methods use the
# 'compute-alt' topic, but this RPC method should use the 'compute'
# topic. If a shutdown is initiated on the destination compute, the
# RPC server for the 'compute' topic will be stopped. If a live
# migration request arrives after that, the destination compute node
# should not take it.
cctxt = client.prepare(server=destination, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -621,6 +655,10 @@ class ComputeAPI(object):
version = self._ver(ctxt, '5.0')
client = self.router.client(ctxt)
source = _compute_host(None, instance)
# NOTE(gmaan): Like check_can_live_migrate_destination, this RPC
# method should use topic 'compute'. If a shutdown is initiated
# on the source compute and, after that, a live migration request
# arrives, the source compute should not take it.
cctxt = client.prepare(server=source, version=version)
return cctxt.call(ctxt, 'check_can_live_migrate_source',
instance=instance,
@@ -867,8 +905,14 @@ class ComputeAPI(object):
def validate_console_port(self, ctxt, instance, port, console_type):
version = self._ver(ctxt, '5.0')
cctxt = self.router.client(ctxt).prepare(
server=_compute_host(None, instance), version=version)
client = self.router.client(ctxt)
# NOTE(gmaan): Send this RPC request to the 'compute-alt' topic. This is
# called when the console has already been requested. If a shutdown is
# requested after that, the compute should finish the port validation
# so that users can get their requested console.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=_compute_host(None, instance), version=version)
return cctxt.call(ctxt, 'validate_console_port',
instance=instance, port=port,
console_type=console_type)
@@ -904,7 +948,13 @@ class ComputeAPI(object):
migration, migrate_data=None):
version = self._ver(ctxt, '5.0')
client = self.router.client(ctxt)
cctxt = client.prepare(server=host, version=version)
# NOTE(gmaan): Send this RPC request to the 'compute-alt' topic. At this
# stage, both the source and destination compute have already confirmed
# that live migration can proceed. If the shutdown is initiated after
# that, the compute should finish the live migration using the
# 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
cctxt.cast(ctxt, 'live_migration', instance=instance,
dest=dest, block_migration=block_migration,
migrate_data=migrate_data, migration=migration)
@@ -933,7 +983,12 @@ class ComputeAPI(object):
def post_live_migration_at_destination(self, ctxt, instance,
block_migration, host):
version = self._ver(ctxt, '5.0')
cctxt = self.router.client(ctxt).prepare(
client = self.router.client(ctxt)
# NOTE(gmaan): Send this RPC request to the 'compute-alt' topic. If the
# shutdown is initiated during live migration, the compute should
# finish the live migration using the 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=host, version=version,
call_monitor_timeout=CONF.rpc_response_timeout,
timeout=CONF.long_rpc_timeout)
@@ -951,9 +1006,14 @@ class ComputeAPI(object):
version = '5.0'
# We just need to honor the argument in the v5.0 RPC API method
msg_args['block_migration'] = None
cctxt = client.prepare(server=host, version=version,
timeout=CONF.long_rpc_timeout,
call_monitor_timeout=CONF.rpc_response_timeout)
# NOTE(gmaan): Send this RPC request to the 'compute-alt' topic. If the
# shutdown is initiated during live migration, the compute should
# finish the live migration using the 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client,
server=host, version=version,
timeout=CONF.long_rpc_timeout,
call_monitor_timeout=CONF.rpc_response_timeout)
return cctxt.call(ctxt, 'pre_live_migration',
instance=instance,
disk=disk, migrate_data=migrate_data,
@@ -1161,8 +1221,12 @@ class ComputeAPI(object):
if not client.can_send_version(version):
kwargs.pop('delete_attachment')
version = self._ver(ctxt, '5.0')
cctxt = client.prepare(server=host, version=version)
# NOTE(gmaan): This is called during live migration rollback. Send
# this RPC request to the 'compute-alt' topic. If the shutdown is
# initiated during live migration rollback, the compute should finish
# it using the 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
return cctxt.call(ctxt, 'remove_volume_connection', **kwargs)
def rescue_instance(self, ctxt, instance, rescue_password,
@@ -1262,7 +1326,12 @@ class ComputeAPI(object):
migrate_data):
version = self._ver(ctxt, '5.0')
client = self.router.client(ctxt)
cctxt = client.prepare(server=host, version=version)
# NOTE(gmaan): This is called during live migration rollback. Send
# this RPC request to the 'compute-alt' topic. If the shutdown is
# initiated during live migration rollback, the compute should finish
# it using the 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
cctxt.cast(ctxt, 'rollback_live_migration_at_destination',
instance=instance, destroy_disks=destroy_disks,
migrate_data=migrate_data)
@@ -1286,7 +1355,12 @@ class ComputeAPI(object):
"""
version = self._ver(ctxt, '5.3')
client = self.router.client(ctxt)
cctxt = client.prepare(server=host, version=version)
# NOTE(gmaan): This is called during live migration rollback. Send
# this RPC request to the 'compute-alt' topic. If the shutdown is
# initiated during live migration rollback, the compute should finish
# it using the 'compute-alt' RPC server.
cctxt = self.prepare_for_alt_rpcserver(
client, server=host, version=version)
cctxt.call(ctxt, 'drop_move_claim_at_destination', instance=instance)
def set_admin_password(self, ctxt, instance, new_pass):
@@ -1523,8 +1597,13 @@ class ComputeAPI(object):
def external_instance_event(self, ctxt, instances, events, host=None):
instance = instances[0]
version = self._ver(ctxt, '5.0')
cctxt = self.router.client(ctxt).prepare(
server=_compute_host(host, instance),
client = self.router.client(ctxt)
# NOTE(gmaan): This is initiated by external services (for example,
# neutron sends an event for a network change), so let's not block
# them during shutdown. Send this RPC request to the 'compute-alt'
# topic.
cctxt = self.prepare_for_alt_rpcserver(
client,
_compute_host(host, instance),
version=version)
cctxt.cast(ctxt, 'external_instance_event', instances=instances,
events=events)
+4 -1
@@ -37,7 +37,7 @@ __all__ = [
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 70
SERVICE_VERSION = 71
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -249,6 +249,9 @@ SERVICE_VERSION_HISTORY = (
# Version 70: Compute RPC v6.4:
# Compute manager supports USB controller model traits
{'compute_rpc': '6.4'},
# Version 71: Compute RPC v6.5:
# Add 2nd RPC server for compute service
{'compute_rpc': '6.5'},
)
# This is the version after which we can rely on having a persistent
+6
@@ -461,6 +461,12 @@ class TestCase(base.BaseTestCase):
if host is not None:
# Make sure that CONF.host is relevant to the right hostname
self.useFixture(nova_fixtures.ConfPatcher(host=host))
# By default, the service creates an RPC server for the 'topic'
# auto-populated from the service binary name. For the compute
# service, we need to create the 2nd RPC server, which is done by
# passing 'topic_alt' explicitly.
if name == 'compute' and 'topic_alt' not in kwargs:
kwargs['topic_alt'] = compute_rpcapi.RPC_TOPIC_ALT
if name == 'compute' and self.USES_DB:
# NOTE(danms): We need to create the HostMapping first, because
+67 -7
@@ -131,10 +131,13 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
prepare_extra_kwargs = {}
cm_timeout = kwargs.pop('call_monitor_timeout', None)
timeout = kwargs.pop('timeout', None)
topic_alt = kwargs.pop('topic_alt', None)
if cm_timeout:
prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout
if timeout:
prepare_extra_kwargs['timeout'] = timeout
if topic_alt:
prepare_extra_kwargs['topic'] = topic_alt
# NOTE(sbauza): If expected args are provided, we need to use them
# for the expected kwargs and just add the needed _return_value that
@@ -368,7 +371,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
def test_validate_console_port(self):
self._test_compute_api('validate_console_port', 'call',
instance=self.fake_instance_obj, port="5900",
console_type="novnc", version='6.0')
console_type="novnc", version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_host_maintenance_mode(self):
self._test_compute_api('host_maintenance_mode', 'call',
@@ -387,7 +391,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj, dest='dest',
block_migration='blockity_block', host='tsoh',
migration='migration',
migrate_data={}, version='6.0')
migrate_data={}, version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_live_migration_force_complete(self):
migration = migration_obj.Migration()
@@ -420,7 +425,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('post_live_migration_at_destination', 'call',
instance=self.fake_instance_obj,
block_migration='block_migration', host='host', version='6.0',
timeout=1234, call_monitor_timeout=60)
timeout=1234, call_monitor_timeout=60,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_pause_instance(self):
self._test_compute_api('pause_instance', 'cast',
@@ -448,7 +454,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
expected_args, instance=self.fake_instance_obj,
block_migration='block_migration', disk='disk', host='host',
migrate_data=None, version='6.0',
call_monitor_timeout=60, timeout=1234)
call_monitor_timeout=60, timeout=1234,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_supports_numa_live_migration(self):
mock_client = mock.MagicMock()
@@ -506,10 +513,19 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj, block_migration=False,
disk_over_commit=False)
def test_rollback_live_migration_at_destination(self):
self._test_compute_api('rollback_live_migration_at_destination',
'cast', instance=self.fake_instance_obj,
host='host', destroy_disks=True,
migrate_data=None, version='6.0',
_return_value=None,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_drop_move_claim_at_destination(self):
self._test_compute_api('drop_move_claim_at_destination', 'call',
instance=self.fake_instance_obj, host='host',
version='6.0', _return_value=None)
version='6.0', _return_value=None,
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_prep_resize(self):
self._test_compute_api('prep_resize', 'cast',
@@ -965,7 +981,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
def test_remove_volume_connection(self):
self._test_compute_api('remove_volume_connection', 'call',
instance=self.fake_instance_obj, volume_id='id', host='host',
delete_attachment=True, version='6.3')
delete_attachment=True, version='6.3',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_rescue_instance(self):
self._test_compute_api('rescue_instance', 'cast',
@@ -1218,7 +1235,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self._test_compute_api('external_instance_event', 'cast',
instances=[self.fake_instance_obj],
events=['event'],
version='6.0')
version='6.0',
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
def test_build_and_run_instance(self):
# With rpcapi 5.11, when a list of accel_uuids is passed as a param,
@@ -1354,3 +1372,45 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj,
share_mapping=self.get_fake_share_mapping(),
version='6.4')
def test_prepare_for_alt_rpcserver_select_topic_alt(self):
rpcapi = compute_rpcapi.ComputeAPI()
mock_client = mock.MagicMock()
mock_client.can_send_version.return_value = True
rpcapi.prepare_for_alt_rpcserver(
mock_client, server='fake_host', version='6.5')
mock_client.can_send_version.assert_called_once_with('6.5')
mock_client.prepare.assert_called_once_with(
server='fake_host',
version='6.5',
topic=compute_rpcapi.RPC_TOPIC_ALT)
def test_prepare_for_alt_rpcserver_fallback_topic_for_old_compute(self):
rpcapi = compute_rpcapi.ComputeAPI()
mock_client = mock.MagicMock()
mock_client.can_send_version.return_value = False
rpcapi.prepare_for_alt_rpcserver(
mock_client, server='fake_host', version='6.0')
mock_client.can_send_version.assert_called_once_with('6.5')
mock_client.prepare.assert_called_once_with(
server='fake_host',
version='6.0',
topic=compute_rpcapi.RPC_TOPIC)
def test_prepare_for_alt_rpcserver_with_extra_kwargs(self):
rpcapi = compute_rpcapi.ComputeAPI()
mock_client = mock.MagicMock()
mock_client.can_send_version.return_value = True
rpcapi.prepare_for_alt_rpcserver(
mock_client,
server='fake_host',
version='6.5',
call_monitor_timeout=60,
timeout=120)
mock_client.can_send_version.assert_called_once_with('6.5')
mock_client.prepare.assert_called_once_with(
server='fake_host',
version='6.5',
topic=compute_rpcapi.RPC_TOPIC_ALT,
call_monitor_timeout=60,
timeout=120)
@@ -0,0 +1,7 @@
---
- hosts: all
roles:
- orchestrate-devstack
- hosts: controller
roles:
- run-graceful-shutdown-tests
@@ -0,0 +1,55 @@
---
features:
- |
Nova services now support graceful shutdown on ``SIGTERM``. When a service
receives ``SIGTERM``, it will stop accepting new RPC requests and wait for
in-progress tasks to reach a safe termination point.
The compute service creates a second RPC server on the ``compute-alt`` topic
which remains active during graceful shutdown, allowing the compute service
to finish its in-progress tasks (a minimal, illustrative sketch of this setup
follows the list below).
Currently, the operations below use the second RPC server:
* Live migration
* Server external event
* Get console output
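As a rough, purely illustrative sketch (not the actual nova service code), the following shows how a compute service might start a second oslo.messaging RPC server on the ``compute-alt`` topic next to the primary ``compute`` server; the function name and wiring here are assumptions for illustration only.

import oslo_messaging as messaging
from oslo_config import cfg

def start_compute_rpc_servers(host, endpoints):
    # Build the transport from the usual oslo.messaging configuration
    # ([DEFAULT]/transport_url).
    transport = messaging.get_rpc_transport(cfg.CONF)
    # Primary server on the 'compute' topic: this is the one that stops
    # accepting new requests once SIGTERM is received.
    primary = messaging.get_rpc_server(
        transport, messaging.Target(topic='compute', server=host),
        endpoints)
    # Alternate server on the 'compute-alt' topic: it stays up during
    # graceful shutdown so live migration, external events and console
    # port validation requests can still be served.
    alternate = messaging.get_rpc_server(
        transport, messaging.Target(topic='compute-alt', server=host),
        endpoints)
    primary.start()
    alternate.start()
    return primary, alternate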
Nova adds two new configuration options that control this behavior:
* ``[DEFAULT]/graceful_shutdown_timeout`` - The overall time the service
waits before forcefully exiting. This defaults to 180 seconds for each
Nova service.
* ``[DEFAULT]/manager_shutdown_timeout`` - The time the service manager
waits for in-progress tasks to complete during graceful shutdown. This
defaults to 160 seconds for each service manager and must be less than
``graceful_shutdown_timeout``.
You can increase these timeouts based on your traffic and on how long your
long-running tasks (e.g. live migrations) take in your deployment; the
sketch below illustrates how the two timeouts relate.
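The following is a hypothetical sketch (not nova's actual shutdown code) of the ordering these two timeouts imply; the manager helper methods used here are assumed names, not real nova APIs.

import time

def graceful_shutdown(manager, graceful_shutdown_timeout=180,
                      manager_shutdown_timeout=160):
    # Overall deadline for the whole shutdown (graceful_shutdown_timeout).
    deadline = time.monotonic() + graceful_shutdown_timeout
    # Stop taking new work on the primary 'compute' RPC server first.
    manager.stop_primary_rpc_server()  # hypothetical helper
    # Wait for in-progress tasks, bounded by manager_shutdown_timeout;
    # this is why it must stay below graceful_shutdown_timeout.
    wait = min(manager_shutdown_timeout, deadline - time.monotonic())
    manager.wait_for_in_progress_tasks(timeout=wait)  # hypothetical helper
    # Finally stop the alternate 'compute-alt' RPC server and exit.
    manager.stop_alt_rpc_server()  # hypothetical helper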
We plan to improve graceful shutdown in future releases with task
tracking and by transitioning resources to a recoverable state. Until then,
this feature is experimental.
upgrade:
- |
The default value of ``[DEFAULT]/graceful_shutdown_timeout`` has been
changed from 60 to 180 seconds for all Nova services. This means that
when a Nova service receives ``SIGTERM``, it will now wait up to 180
seconds for a graceful shutdown before being forcefully terminated.
Operators using an external system (e.g. k8s, systemd) to manage the
Nova services should ensure that their service stop timeouts are set
to at least ``graceful_shutdown_timeout`` to avoid forcefully killing a
service before Nova finishes its graceful shutdown. For example, the
systemd ``TimeoutStopSec`` should be set to at least 180 seconds for
Nova services.
- |
A new configuration option ``[DEFAULT]/manager_shutdown_timeout`` has been
added with a default value of 160 seconds. This controls how long the
service manager waits for in-progress tasks to finish during graceful
shutdown. Operators may want to tune this value based on how long their
typical long-running operations (e.g. live migrations) take to complete.
- |
The compute service now creates a second RPC server on the ``compute-alt``
topic. This means each compute worker will create an additional RabbitMQ
queue.
@@ -0,0 +1 @@
Run Nova graceful shutdown tests and verify the operations.
@@ -0,0 +1,47 @@
#!/bin/bash
source /opt/stack/devstack/openrc admin
set -x
set -e
confirm_resize() {
local server=$1
echo "Confirming resize on ${server}"
openstack server resize confirm "${server}"
count=0
while true; do
status=$(openstack server show "${server}" -f value -c status 2>/dev/null || echo "NOT_FOUND")
if [ "${status}" == "ACTIVE" ] || [ "${status}" == "ERROR" ]; then
break
fi
sleep 5
count=$((count+1))
if [ ${count} -eq 10 ]; then
echo "Timed out waiting for ${server} to be ACTIVE or Error after confirm resize"
break
fi
done
}
cleanup_server() {
local server=$1
status=$(openstack server show "${server}" -f value -c status 2>/dev/null || echo "NOT_FOUND")
if [ "${status}" == "VERIFY_RESIZE" ]; then
confirm_resize "${server}"
fi
status=$(openstack server show "${server}" -f value -c status 2>/dev/null || echo "NOT_FOUND")
if [ "${status}" == "ACTIVE" ] || [ "${status}" == "ERROR" ]; then
echo "Deleting ${server} (status: ${status})"
openstack server delete --wait "${server}"
else
echo "Skipping ${server} deletion (status: ${status})"
fi
}
for server in "$@"; do
cleanup_server "${server}"
done
@@ -0,0 +1,39 @@
#!/bin/bash
set -x
set -e
COMPUTE_HOST=$1
EXPECTED_STATE=${2:-active}
get_service_status() {
local host=$1
local status
status=$(ssh "${host}" systemctl is-active devstack@n-cpu || true)
echo "${status}"
}
wait_for_service_state() {
local host=$1
local expected=$2
local timeout=${3:-30}
local count=0
local status
status=$(get_service_status "${host}")
while [ "${status}" != "${expected}" ]; do
sleep 5
count=$((count+1))
if [ ${count} -eq ${timeout} ]; then
echo "Timed out waiting for compute service on ${host} to be ${expected} (current: ${status})"
exit 5
fi
status=$(get_service_status "${host}")
done
echo "Compute service on ${host} is ${expected}"
}
if [ "${EXPECTED_STATE}" == "active" ] && [ "$(get_service_status "${COMPUTE_HOST}")" != "active" ]; then
ssh "${COMPUTE_HOST}" sudo systemctl start devstack@n-cpu
fi
wait_for_service_state "${COMPUTE_HOST}" "${EXPECTED_STATE}"
@@ -0,0 +1,49 @@
#!/bin/bash
source /opt/stack/devstack/openrc admin
set -x
set -e
timeout=196
server_lm=$1
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')
echo "Creating test server on subnode for graceful shutdown live migration test"
openstack --os-compute-api-version 2.74 server create --image ${image_id} --flavor ${flavor_id} \
--nic net-id=${network_id} --host ${SUBNODE_HOSTNAME} --wait ${server_lm}
echo "Starting live migration of ${server_lm} to ${CONTROLLER_HOSTNAME}"
openstack server migrate --live-migration \
--host ${CONTROLLER_HOSTNAME} ${server_lm}
# Wait for the migration to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
migration_status=$(openstack server migration list ${server_lm} \
-f value -c Status 2>/dev/null | head -1)
server_status=$(openstack server show ${server_lm} \
-f value -c status 2>/dev/null)
task_state=$(openstack server show ${server_lm} \
-f value -c OS-EXT-STS:task_state 2>/dev/null)
if [ "${migration_status}" == "preparing" ] || \
[ "${migration_status}" == "running" ] || \
[ "${task_state}" == "migrating" ]; then
echo "Live migration is in progress (status: ${migration_status}, task_state: ${task_state})"
break
elif [ "${migration_status}" == "completed" ] || \
{ [ "${server_status}" == "ACTIVE" ] && \
{ [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; }; then
echo "Live migration has already completed"
exit 2
fi
count=$((count+1))
if [ ${count} -eq ${timeout} ]; then
echo "Timed out waiting for migrations to start"
exit 2
fi
done
@@ -0,0 +1,45 @@
#!/bin/bash
source /opt/stack/devstack/openrc admin
set -x
set -e
server=$1
# Wait for the server to finish live migration and become ACTIVE with
# no task_state, which indicates the migration has completed.
timeout=360
count=0
migration_start=$(date +%s)
while true; do
status=$(openstack server show ${server} -f value -c status)
task_state=$(openstack server show ${server} -f value -c OS-EXT-STS:task_state)
if [ "${status}" == "ACTIVE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then
migration_end=$(date +%s)
migration_duration=$((migration_end - migration_start))
echo "Migration is completed in ${migration_duration} seconds."
break
fi
if [ "${status}" == "ERROR" ]; then
echo "Server went to ERROR status during live migration"
exit 3
fi
sleep 5
count=$((count+1))
if [ ${count} -eq ${timeout} ]; then
echo "Timed out waiting for live migration to complete"
exit 5
fi
done
# Make sure the server moved to the controller.
host=$(openstack server show ${server} -f value -c OS-EXT-SRV-ATTR:host)
if [[ ${host} != ${CONTROLLER_HOSTNAME} ]]; then
echo "Unexpected host ${host} for server after live migration during graceful shutdown."
exit 4
fi
echo "Live migration during graceful shutdown completed successfully"
echo "Server ${server} is ACTIVE on ${host}"
@@ -0,0 +1,56 @@
- name: Graceful shutdown source compute live migration
block:
- name: Start live migrations of test servers
become: true
become_user: stack
script: "start_live_migration.sh server-lm1"
environment:
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
register: start_live_migrations_result
failed_when: start_live_migrations_result.rc not in [0, 2]
- name: Set fact if migrations completed or timed out before SIGTERM was sent to the source compute
set_fact:
live_migrations_completed_or_timeout: "{{ start_live_migrations_result.rc == 2 }}"
- name: Run graceful shutdown tests
when: not live_migrations_completed_or_timeout
block:
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
delegate_to: compute1
become: true
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
- name: Verify live migration is completed during graceful shutdown
become: true
become_user: stack
script: "verify_live_migration.sh server-lm1"
environment:
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
# Sleep for 180 sec: default graceful_shutdown_timeout
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
pause:
seconds: 180
- name: Verify compute service is stopped after graceful shutdown
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
- name: Start and verify subnode compute service is running
become: true
become_user: stack
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
- name: Cleanup test servers
become: true
become_user: stack
script: "cleanup_test_servers.sh server-lm1"
ignore_errors: true
- name: Fail if any test is skipped
fail:
msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
when: live_migrations_completed_or_timeout