Merge "Use 2nd RPC server in compute operations"
This commit is contained in:
+41
@@ -138,6 +138,44 @@
|
||||
block_migrate_cinder_iscsi: true
|
||||
post-run: playbooks/nova-live-migration/post-run.yaml
|
||||
|
||||
- job:
|
||||
name: nova-graceful-shutdown
|
||||
parent: devstack-multinode
|
||||
description: |
|
||||
Run Nova graceful shutdown tests.
|
||||
run: playbooks/nova-graceful-shutdown/run.yaml
|
||||
timeout: 10800
|
||||
vars:
|
||||
devstack_services:
|
||||
neutron-trunk: true
|
||||
openstack-cli-server: true
|
||||
s-account: false
|
||||
s-container: false
|
||||
s-object: false
|
||||
s-proxy: false
|
||||
c-bak: false
|
||||
tempest_test_regex: ''
|
||||
devstack_localrc:
|
||||
<<: *uec_image_vars
|
||||
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
|
||||
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
|
||||
LIVE_MIGRATION_AVAILABLE: true
|
||||
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
|
||||
group-vars:
|
||||
subnode:
|
||||
devstack_services:
|
||||
openstack-cli-server: true
|
||||
s-account: false
|
||||
s-container: false
|
||||
s-object: false
|
||||
s-proxy: false
|
||||
c-bak: false
|
||||
devstack_localrc:
|
||||
SERVICE_GRACEFUL_SHUTDOWN_TIMEOUT: 180
|
||||
NOVA_ALLOW_MOVE_TO_SAME_HOST: false
|
||||
LIVE_MIGRATION_AVAILABLE: true
|
||||
USE_BLOCK_MIGRATION_FOR_LIVE_MIGRATION: true
|
||||
|
||||
- job:
|
||||
name: nova-alt-configurations
|
||||
parent: tempest-multinode-full-py3
|
||||
@@ -829,6 +867,9 @@
|
||||
- ^nova/network/.*$
|
||||
- nova/virt/libvirt/vif.py
|
||||
- nova-live-migration
|
||||
# NOTE(gmaan): We will be running the graceful shutdown testing in
|
||||
# check pipeline only and not required to test in gate as such.
|
||||
- nova-graceful-shutdown
|
||||
- nova-live-migration-ceph
|
||||
- nova-lvm
|
||||
- nova-multi-cell
|
||||
|
||||
@@ -642,7 +642,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
|
||||
class ComputeManager(manager.Manager):
|
||||
"""Manages the running instances from creation to destruction."""
|
||||
|
||||
target = messaging.Target(version='6.4')
|
||||
target = messaging.Target(version='6.5')
|
||||
|
||||
def __init__(self, compute_driver=None, *args, **kwargs):
|
||||
"""Load configuration options and connect to the hypervisor."""
|
||||
|
||||
+92
-13
@@ -414,6 +414,7 @@ class ComputeAPI(object):
|
||||
* 6.2 - Add target_state parameter to rebuild_instance()
|
||||
* 6.3 - Add delete_attachment parameter to remove_volume_connection
|
||||
* 6.4 - Add allow_share() and deny_share()
|
||||
* 6.5 - Add 2nd RPC server with new topic 'compute-alt'
|
||||
'''
|
||||
|
||||
VERSION_ALIASES = {
|
||||
@@ -572,6 +573,33 @@ class ComputeAPI(object):
|
||||
serializer=serializer,
|
||||
call_monitor_timeout=cmt)
|
||||
|
||||
def prepare_for_alt_rpcserver(
        self, client, server, version, **kwargs):
    """Prepare an RPC call context targeting the alternate RPC server.

    The compute service runs a 2nd RPC server on RPC_TOPIC_ALT which
    stays active during graceful shutdown. This helper prepares the
    given client so the message is routed to that server, falling back
    to the original topic when the remote compute is too old to have
    the 2nd server.

    :param client: RPC client obtained from the client router.
    :param server: Target compute host.
    :param version: RPC API version to use for the call.
    :param kwargs: Extra keyword arguments passed through to
        client.prepare() (e.g. timeout, call_monitor_timeout).
    :returns: The prepared RPC call context.
    """
    # NOTE(gmaan): By overriding 'topic' in the prepare() call, we make
    # this RPC client send the message to a different RPC server, the
    # one listening on RPC_TOPIC_ALT (the RPC server which stays active
    # during compute service graceful shutdown).
    topic = RPC_TOPIC_ALT

    # NOTE(gmann): An old compute will not have the new 2nd RPC server,
    # so we need to handle it with RPC versioning. For an old compute,
    # fall back to sending the message to the original RPC server,
    # which listens on RPC_TOPIC.
    if not client.can_send_version('6.5'):
        topic = RPC_TOPIC
        # NOTE: log messages are not translated (oslo.i18n guidelines);
        # formatting is deferred to the logger via lazy % args.
        LOG.debug('Fallback to send the message to original topic: %s '
                  'as RPC version is too old.', topic)
    else:
        LOG.debug('RPC: Sending the message to topic: %s', topic)

    params = {
        'server': server,
        'version': version,
        'topic': topic}
    params.update(kwargs)
    return client.prepare(**params)
|
||||
|
||||
def add_fixed_ip_to_instance(self, ctxt, instance, network_id):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
@@ -612,6 +640,12 @@ class ComputeAPI(object):
|
||||
kwargs.pop('migration')
|
||||
kwargs.pop('limits')
|
||||
version = '5.0'
|
||||
# NOTE(gmaan): Most of the live migration RPC methods use the
|
||||
# 'compute-alt' topic, but this RPC method should use the 'compute'
|
||||
# topic. If a shutdown is initiated on the destination compute, the
|
||||
# RPC server for the 'compute' topic will be stopped. If a live
|
||||
# migration request arrives after that, the destination compute node
|
||||
# should not take it.
|
||||
cctxt = client.prepare(server=destination, version=version,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout,
|
||||
timeout=CONF.long_rpc_timeout)
|
||||
@@ -621,6 +655,10 @@ class ComputeAPI(object):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
source = _compute_host(None, instance)
|
||||
# NOTE(gmaan): Like check_can_live_migrate_destination, this RPC
|
||||
# method should use topic 'compute'. If a shutdown is initiated
|
||||
# on the source compute and, after that, a live migration request
|
||||
# arrives, the source compute should not take it.
|
||||
cctxt = client.prepare(server=source, version=version)
|
||||
return cctxt.call(ctxt, 'check_can_live_migrate_source',
|
||||
instance=instance,
|
||||
@@ -867,8 +905,14 @@ class ComputeAPI(object):
|
||||
|
||||
def validate_console_port(self, ctxt, instance, port, console_type):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
server=_compute_host(None, instance), version=version)
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. This is
|
||||
# called when the console is already requested. If shutdown is
|
||||
# requested after that, compute should finish the port validation
|
||||
# so that users can get their requested console.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=_compute_host(None, instance), version=version)
|
||||
return cctxt.call(ctxt, 'validate_console_port',
|
||||
instance=instance, port=port,
|
||||
console_type=console_type)
|
||||
@@ -904,7 +948,13 @@ class ComputeAPI(object):
|
||||
migration, migrate_data=None):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. At this
|
||||
# stage, both the source and destination compute have already confirmed
|
||||
# that live migration can proceed. If the shutdown is initiated after
|
||||
# that, the compute should finish the live migration using the
|
||||
# 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.cast(ctxt, 'live_migration', instance=instance,
|
||||
dest=dest, block_migration=block_migration,
|
||||
migrate_data=migrate_data, migration=migration)
|
||||
@@ -933,7 +983,12 @@ class ComputeAPI(object):
|
||||
def post_live_migration_at_destination(self, ctxt, instance,
|
||||
block_migration, host):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. If the
|
||||
# shutdown is initiated during live migration, the compute should
|
||||
# finish the live migration using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=host, version=version,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout,
|
||||
timeout=CONF.long_rpc_timeout)
|
||||
@@ -951,9 +1006,14 @@ class ComputeAPI(object):
|
||||
version = '5.0'
|
||||
# We just need to honor the argument in the v5.0 RPC API method
|
||||
msg_args['block_migration'] = None
|
||||
cctxt = client.prepare(server=host, version=version,
|
||||
timeout=CONF.long_rpc_timeout,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout)
|
||||
# NOTE(gmaan): Send this RPC request to 'compute-alt' topic. If the
|
||||
# shutdown is initiated during live migration, the compute should
|
||||
# finish the live migration using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
server=host, version=version,
|
||||
timeout=CONF.long_rpc_timeout,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout)
|
||||
return cctxt.call(ctxt, 'pre_live_migration',
|
||||
instance=instance,
|
||||
disk=disk, migrate_data=migrate_data,
|
||||
@@ -1161,8 +1221,12 @@ class ComputeAPI(object):
|
||||
if not client.can_send_version(version):
|
||||
kwargs.pop('delete_attachment')
|
||||
version = self._ver(ctxt, '5.0')
|
||||
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it
|
||||
# using the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
return cctxt.call(ctxt, 'remove_volume_connection', **kwargs)
|
||||
|
||||
def rescue_instance(self, ctxt, instance, rescue_password,
|
||||
@@ -1262,7 +1326,12 @@ class ComputeAPI(object):
|
||||
migrate_data):
|
||||
version = self._ver(ctxt, '5.0')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it using
|
||||
# the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.cast(ctxt, 'rollback_live_migration_at_destination',
|
||||
instance=instance, destroy_disks=destroy_disks,
|
||||
migrate_data=migrate_data)
|
||||
@@ -1286,7 +1355,12 @@ class ComputeAPI(object):
|
||||
"""
|
||||
version = self._ver(ctxt, '5.3')
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
# NOTE(gmaan): This is called during live migration rollback. Send
|
||||
# this RPC request to 'compute-alt' topic. If the shutdown is initiated
|
||||
# during live migration rollback, the compute should finish it using
|
||||
# the 'compute-alt' RPC server.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client, server=host, version=version)
|
||||
cctxt.call(ctxt, 'drop_move_claim_at_destination', instance=instance)
|
||||
|
||||
def set_admin_password(self, ctxt, instance, new_pass):
|
||||
@@ -1523,8 +1597,13 @@ class ComputeAPI(object):
|
||||
def external_instance_event(self, ctxt, instances, events, host=None):
|
||||
instance = instances[0]
|
||||
version = self._ver(ctxt, '5.0')
|
||||
cctxt = self.router.client(ctxt).prepare(
|
||||
server=_compute_host(host, instance),
|
||||
client = self.router.client(ctxt)
|
||||
# NOTE(gmaan): This is initiated by the external services (for
|
||||
# example, neutron send event for network change) and let's not block
|
||||
# them during shutdown. Make this RPC request to 'compute-alt' topic.
|
||||
cctxt = self.prepare_for_alt_rpcserver(
|
||||
client,
|
||||
_compute_host(host, instance),
|
||||
version=version)
|
||||
cctxt.cast(ctxt, 'external_instance_event', instances=instances,
|
||||
events=events)
|
||||
|
||||
@@ -37,7 +37,7 @@ __all__ = [
|
||||
|
||||
|
||||
# NOTE(danms): This is the global service version counter
|
||||
SERVICE_VERSION = 70
|
||||
SERVICE_VERSION = 71
|
||||
|
||||
|
||||
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
|
||||
@@ -249,6 +249,9 @@ SERVICE_VERSION_HISTORY = (
|
||||
# Version 70: Compute RPC v6.4:
|
||||
# Compute manager supports USB controller model traits
|
||||
{'compute_rpc': '6.4'},
|
||||
# Version 71: Compute RPC v6.5:
|
||||
# Add 2nd RPC server for compute service
|
||||
{'compute_rpc': '6.5'},
|
||||
)
|
||||
|
||||
# This is the version after which we can rely on having a persistent
|
||||
|
||||
@@ -461,6 +461,12 @@ class TestCase(base.BaseTestCase):
|
||||
if host is not None:
|
||||
# Make sure that CONF.host is relevant to the right hostname
|
||||
self.useFixture(nova_fixtures.ConfPatcher(host=host))
|
||||
# By default, service creates a RPC server for auto populated
|
||||
# 'topic' from service binary name. For compute service, we need
|
||||
# to create the 2nd RPC server which will be done by pass the
|
||||
# 'topic_alt' explicitly.
|
||||
if name == 'compute' and 'topic_alt' not in kwargs:
|
||||
kwargs['topic_alt'] = compute_rpcapi.RPC_TOPIC_ALT
|
||||
|
||||
if name == 'compute' and self.USES_DB:
|
||||
# NOTE(danms): We need to create the HostMapping first, because
|
||||
|
||||
@@ -131,10 +131,13 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
prepare_extra_kwargs = {}
|
||||
cm_timeout = kwargs.pop('call_monitor_timeout', None)
|
||||
timeout = kwargs.pop('timeout', None)
|
||||
topic_alt = kwargs.pop('topic_alt', None)
|
||||
if cm_timeout:
|
||||
prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout
|
||||
if timeout:
|
||||
prepare_extra_kwargs['timeout'] = timeout
|
||||
if topic_alt:
|
||||
prepare_extra_kwargs['topic'] = topic_alt
|
||||
|
||||
# NOTE(sbauza): If expected args are provided, we need to use them
|
||||
# for the expected kwargs and just add the needed _return_value that
|
||||
@@ -368,7 +371,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
def test_validate_console_port(self):
|
||||
self._test_compute_api('validate_console_port', 'call',
|
||||
instance=self.fake_instance_obj, port="5900",
|
||||
console_type="novnc", version='6.0')
|
||||
console_type="novnc", version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_host_maintenance_mode(self):
|
||||
self._test_compute_api('host_maintenance_mode', 'call',
|
||||
@@ -387,7 +391,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj, dest='dest',
|
||||
block_migration='blockity_block', host='tsoh',
|
||||
migration='migration',
|
||||
migrate_data={}, version='6.0')
|
||||
migrate_data={}, version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_live_migration_force_complete(self):
|
||||
migration = migration_obj.Migration()
|
||||
@@ -420,7 +425,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
self._test_compute_api('post_live_migration_at_destination', 'call',
|
||||
instance=self.fake_instance_obj,
|
||||
block_migration='block_migration', host='host', version='6.0',
|
||||
timeout=1234, call_monitor_timeout=60)
|
||||
timeout=1234, call_monitor_timeout=60,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_pause_instance(self):
|
||||
self._test_compute_api('pause_instance', 'cast',
|
||||
@@ -448,7 +454,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
expected_args, instance=self.fake_instance_obj,
|
||||
block_migration='block_migration', disk='disk', host='host',
|
||||
migrate_data=None, version='6.0',
|
||||
call_monitor_timeout=60, timeout=1234)
|
||||
call_monitor_timeout=60, timeout=1234,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_supports_numa_live_migration(self):
|
||||
mock_client = mock.MagicMock()
|
||||
@@ -506,10 +513,19 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj, block_migration=False,
|
||||
disk_over_commit=False)
|
||||
|
||||
def test_rollback_live_migration_at_destination(self):
|
||||
self._test_compute_api('rollback_live_migration_at_destination',
|
||||
'cast', instance=self.fake_instance_obj,
|
||||
host='host', destroy_disks=True,
|
||||
migrate_data=None, version='6.0',
|
||||
_return_value=None,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_drop_move_claim_at_destination(self):
|
||||
self._test_compute_api('drop_move_claim_at_destination', 'call',
|
||||
instance=self.fake_instance_obj, host='host',
|
||||
version='6.0', _return_value=None)
|
||||
version='6.0', _return_value=None,
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_prep_resize(self):
|
||||
self._test_compute_api('prep_resize', 'cast',
|
||||
@@ -965,7 +981,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
def test_remove_volume_connection(self):
|
||||
self._test_compute_api('remove_volume_connection', 'call',
|
||||
instance=self.fake_instance_obj, volume_id='id', host='host',
|
||||
delete_attachment=True, version='6.3')
|
||||
delete_attachment=True, version='6.3',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_rescue_instance(self):
|
||||
self._test_compute_api('rescue_instance', 'cast',
|
||||
@@ -1218,7 +1235,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
self._test_compute_api('external_instance_event', 'cast',
|
||||
instances=[self.fake_instance_obj],
|
||||
events=['event'],
|
||||
version='6.0')
|
||||
version='6.0',
|
||||
topic_alt=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_build_and_run_instance(self):
|
||||
# With rpcapi 5.11, when a list of accel_uuids is passed as a param,
|
||||
@@ -1354,3 +1372,45 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj,
|
||||
share_mapping=self.get_fake_share_mapping(),
|
||||
version='6.4')
|
||||
|
||||
def test_prepare_for_alt_rpcserver_select_topic_alt(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = True
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client, server='fake_host', version='6.5')
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
topic=compute_rpcapi.RPC_TOPIC_ALT)
|
||||
|
||||
def test_prepare_for_alt_rpcserver_fallback_topic_for_old_compute(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = False
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client, server='fake_host', version='6.0')
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.0',
|
||||
topic=compute_rpcapi.RPC_TOPIC)
|
||||
|
||||
def test_prepare_for_alt_rpcserver_with_extra_kwargs(self):
|
||||
rpcapi = compute_rpcapi.ComputeAPI()
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.can_send_version.return_value = True
|
||||
rpcapi.prepare_for_alt_rpcserver(
|
||||
mock_client,
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
call_monitor_timeout=60,
|
||||
timeout=120)
|
||||
mock_client.can_send_version.assert_called_once_with('6.5')
|
||||
mock_client.prepare.assert_called_once_with(
|
||||
server='fake_host',
|
||||
version='6.5',
|
||||
topic=compute_rpcapi.RPC_TOPIC_ALT,
|
||||
call_monitor_timeout=60,
|
||||
timeout=120)
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
---
|
||||
- hosts: all
|
||||
roles:
|
||||
- orchestrate-devstack
|
||||
- hosts: controller
|
||||
roles:
|
||||
- run-graceful-shutdown-tests
|
||||
@@ -0,0 +1,55 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Nova services now support graceful shutdown on ``SIGTERM``. When a service
|
||||
receives ``SIGTERM``, it will stop accepting new RPC requests and wait for
|
||||
in-progress tasks to reach a safe termination point.
|
||||
|
||||
The compute service creates a second RPC server on a ``compute-alt`` topic
|
||||
which remains active during graceful shutdown, allowing the compute service to
|
||||
finish the in-progress tasks.
|
||||
|
||||
Currently, the below operations use the second RPC server:
|
||||
|
||||
* Live migration
|
||||
* Server external Event
|
||||
* Get Console output
|
||||
|
||||
Nova added two new configuration options which will control this behavior:
|
||||
|
||||
* ``[DEFAULT]/graceful_shutdown_timeout`` - The overall time the service
|
||||
waits before forcefully exiting. This defaults to 180 seconds for each
|
||||
Nova services.
|
||||
* ``[DEFAULT]/manager_shutdown_timeout`` - The time the service manager
|
||||
waits for in-progress tasks to complete during graceful shutdown. This
|
||||
defaults to 160 seconds for each service manager. This must be less
|
||||
than ``graceful_shutdown_timeout``.
|
||||
|
||||
You can increase these timeouts based on the traffic and how long the
|
||||
long-running (e.g. live migrations) tasks take in your deployment.
|
||||
|
||||
We plan to improve the graceful shutdown in future releases by task
|
||||
tracking and transitioning resources to a recoverable state. Until then,
|
||||
this feature is experimental.
|
||||
upgrade:
|
||||
- |
|
||||
The default value of ``[DEFAULT]/graceful_shutdown_timeout`` has been
|
||||
changed from 60 to 180 seconds for all Nova services. This means that
|
||||
when a Nova service receives ``SIGTERM``, it will now wait up to 180
|
||||
seconds for a graceful shutdown before being forcefully terminated.
|
||||
Operators using external system (e.g. k8s, systemd) to manage the
|
||||
Nova services should ensure that their service stop timeouts are set
|
||||
to at least ``graceful_shutdown_timeout`` to avoid forcefully killing
|
||||
the service before Nova finishes its graceful shutdown. For example, the
|
||||
systemd ``TimeoutStopSec`` should be set to at least 180 seconds (or
|
||||
greater) for Nova services.
|
||||
- |
|
||||
A new configuration option ``[DEFAULT]/manager_shutdown_timeout`` has been
|
||||
added with a default value of 160 seconds. This controls how long the
|
||||
service manager waits for in-progress tasks to finish during graceful
|
||||
shutdown. Operators may want to tune this value based on how long their
|
||||
typical long-running operations (e.g. live migrations) take to complete.
|
||||
- |
|
||||
The compute service now creates a second RPC server on the ``compute-alt``
|
||||
topic. This means each compute worker will create an additional RabbitMQ
|
||||
queue.
|
||||
@@ -0,0 +1 @@
|
||||
Run Nova graceful shutdown tests and verify the operations.
|
||||
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Clean up the test servers created by the graceful shutdown job.
# For every server name given on the command line: confirm a pending
# resize if there is one, then delete the server once it has settled
# in ACTIVE or ERROR state.
source /opt/stack/devstack/openrc admin
set -x
set -e

# Print the server's status, or NOT_FOUND when it does not exist.
server_status() {
    openstack server show "$1" -f value -c status 2>/dev/null || echo "NOT_FOUND"
}

confirm_resize() {
    local server=$1

    echo "Confirming resize on ${server}"
    openstack server resize confirm "${server}"

    # Poll (up to 10 polls, 5s apart) until the server settles.
    local tries=0
    local status
    status=$(server_status "${server}")
    until [ "${status}" == "ACTIVE" ] || [ "${status}" == "ERROR" ]; do
        sleep 5
        tries=$((tries+1))
        if [ ${tries} -eq 10 ]; then
            echo "Timed out waiting for ${server} to be ACTIVE or Error after confirm resize"
            break
        fi
        status=$(server_status "${server}")
    done
}

cleanup_server() {
    local server=$1
    local status

    status=$(server_status "${server}")

    # A server in VERIFY_RESIZE must have the resize confirmed before
    # it can be deleted cleanly.
    if [ "${status}" == "VERIFY_RESIZE" ]; then
        confirm_resize "${server}"
    fi

    status=$(server_status "${server}")
    case "${status}" in
        ACTIVE|ERROR)
            echo "Deleting ${server} (status: ${status})"
            openstack server delete --wait "${server}"
            ;;
        *)
            echo "Skipping ${server} deletion (status: ${status})"
            ;;
    esac
}

for name in "$@"; do
    cleanup_server "${name}"
done
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash
# Ensure the devstack compute service (devstack@n-cpu) on the given
# host is in the expected systemd state, starting it first if needed.
#   $1 - compute host (reached via ssh)
#   $2 - expected systemd state (default: active)
set -x
set -e

COMPUTE_HOST=$1
EXPECTED_STATE=${2:-active}

# Echo the systemd state of devstack@n-cpu on the remote host.
# 'systemctl is-active' exits non-zero for inactive units, so swallow
# the failure to keep 'set -e' from aborting the script.
get_service_status() {
    ssh "$1" systemctl is-active devstack@n-cpu || true
}

# Poll every 5s until the service on $1 reports state $2; give up and
# exit 5 after $3 polls (default 30).
wait_for_service_state() {
    local host=$1
    local wanted=$2
    local max_polls=${3:-30}
    local polls=0
    local current

    current=$(get_service_status "${host}")
    until [ "${current}" == "${wanted}" ]; do
        sleep 5
        polls=$((polls+1))
        if [ ${polls} -eq ${max_polls} ]; then
            echo "Timed out waiting for compute service on ${host} to be ${wanted} (current: ${current})"
            exit 5
        fi
        current=$(get_service_status "${host}")
    done
    echo "Compute service on ${host} is ${wanted}"
}

# When we expect the service to be running but it is not, start it.
if [ "${EXPECTED_STATE}" == "active" ] && [ "$(get_service_status "${COMPUTE_HOST}")" != "active" ]; then
    ssh "${COMPUTE_HOST}" sudo systemctl start devstack@n-cpu
fi

wait_for_service_state "${COMPUTE_HOST}" "${EXPECTED_STATE}"
|
||||
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
# Create a test server on the subnode and start a live migration of it
# to the controller, returning once the migration is observed to be in
# progress (so the caller can SIGTERM the source compute mid-migration).
#
#   $1 - name of the server to create and live migrate
# Requires SUBNODE_HOSTNAME and CONTROLLER_HOSTNAME in the environment.
#
# Exit codes:
#   0 - migration is in progress
#   2 - migration already completed, or timed out waiting for it to start
source /opt/stack/devstack/openrc admin
set -x
set -e

# Maximum number of polls while waiting for the migration to start.
timeout=196

server_lm=$1

# Pick the first available image/flavor/non-shared network.
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')

echo "Creating test server on subnode for graceful shutdown live migration test"
# Microversion 2.74 allows requesting a specific host on server create.
openstack --os-compute-api-version 2.74 server create --image "${image_id}" \
    --flavor "${flavor_id}" --nic net-id="${network_id}" \
    --host "${SUBNODE_HOSTNAME}" --wait "${server_lm}"

echo "Starting live migration of ${server_lm} to ${CONTROLLER_HOSTNAME}"
openstack server migrate --live-migration \
    --host "${CONTROLLER_HOSTNAME}" "${server_lm}"

# Wait for the migration to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
    # The '|| true' guards keep 'set -e' from aborting the script on a
    # transient CLI failure (the migration list is already guarded by
    # the pipeline's exit status being head's).
    migration_status=$(openstack server migration list "${server_lm}" \
        -f value -c Status 2>/dev/null | head -1)
    server_status=$(openstack server show "${server_lm}" \
        -f value -c status 2>/dev/null || true)
    task_state=$(openstack server show "${server_lm}" \
        -f value -c OS-EXT-STS:task_state 2>/dev/null || true)
    if [ "${migration_status}" == "preparing" ] || \
       [ "${migration_status}" == "running" ] || \
       [ "${task_state}" == "migrating" ]; then
        echo "Live migration is in progress (status: ${migration_status}, task_state: ${task_state})"
        break
    elif [ "${migration_status}" == "completed" ] || \
         { [ "${server_status}" == "ACTIVE" ] && \
           { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; }; then
        echo "Live migration has already completed"
        exit 2
    fi

    # Pause between polls instead of busy-looping the API; combined
    # with the poll count above this bounds the wait to roughly
    # ${timeout} seconds plus CLI latency.
    sleep 1
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for migrations to start"
        exit 2
    fi
done
|
||||
@@ -0,0 +1,45 @@
|
||||
#!/bin/bash
# Verify that the live migration of the given server completes and that
# the server lands on the controller.
#
#   $1 - name of the server being live migrated
# Requires CONTROLLER_HOSTNAME in the environment.
#
# Exit codes:
#   0 - migration completed and server is on the controller
#   3 - server went to ERROR during migration
#   4 - server ended up on an unexpected host
#   5 - timed out waiting for the migration to complete
source /opt/stack/devstack/openrc admin
set -x
set -e

server=$1

# Wait for the server to finish live migration and become ACTIVE with
# no task_state, which indicates the migration has completed.
timeout=360
count=0
migration_start=$(date +%s)
while true; do
    status=$(openstack server show "${server}" -f value -c status)
    task_state=$(openstack server show "${server}" -f value -c OS-EXT-STS:task_state)

    if [ "${status}" == "ACTIVE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then
        migration_end=$(date +%s)
        migration_duration=$((migration_end - migration_start))
        echo "Migration is completed in ${migration_duration} seconds."
        break
    fi

    if [ "${status}" == "ERROR" ]; then
        echo "Server went to ERROR status during live migration"
        exit 3
    fi

    sleep 5
    count=$((count+1))
    if [ ${count} -eq ${timeout} ]; then
        echo "Timed out waiting for live migration to complete"
        exit 5
    fi
done

# Make sure the server moved to the controller. Quote the right-hand
# side so it is compared literally rather than as a glob pattern.
host=$(openstack server show "${server}" -f value -c OS-EXT-SRV-ATTR:host)
if [[ ${host} != "${CONTROLLER_HOSTNAME}" ]]; then
    echo "Unexpected host ${host} for server after live migration during graceful shutdown."
    exit 4
fi

echo "Live migration during graceful shutdown completed successfully"
echo "Server ${server} is ACTIVE on ${host}"
|
||||
@@ -0,0 +1,56 @@
|
||||
- name: Graceful shutdown source compute live migration
|
||||
block:
|
||||
- name: Start live migrations of test servers
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_live_migration.sh server-lm1"
|
||||
environment:
|
||||
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
|
||||
register: start_live_migrations_result
|
||||
failed_when: start_live_migrations_result.rc not in [0, 2]
|
||||
|
||||
- name: Set fact if migrations completed or timed out before SIGTERM to source compute
|
||||
set_fact:
|
||||
live_migrations_completed_or_timeout: "{{ start_live_migrations_result.rc == 2 }}"
|
||||
|
||||
- name: Run graceful shutdown tests
|
||||
when: not live_migrations_completed_or_timeout
|
||||
block:
|
||||
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
|
||||
delegate_to: compute1
|
||||
become: true
|
||||
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
|
||||
|
||||
- name: Verify live migration is completed during graceful shutdown
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "verify_live_migration.sh server-lm1"
|
||||
environment:
|
||||
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
|
||||
|
||||
# Sleep for 180 sec: default graceful_shutdown_timeout
|
||||
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
|
||||
pause:
|
||||
seconds: 180
|
||||
|
||||
- name: Verify compute service is stopped after graceful shutdown
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
|
||||
|
||||
- name: Start and verify subnode compute service is running
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
|
||||
- name: Cleanup test servers
|
||||
become: true
|
||||
become_user: stack
|
||||
script: "cleanup_test_servers.sh server-lm1"
|
||||
ignore_errors: true
|
||||
|
||||
- name: Fail if any test is skipped
|
||||
fail:
|
||||
msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
|
||||
when: live_migrations_completed_or_timeout
|
||||
Reference in New Issue
Block a user