Prepare resize/cold migration for graceful shutdown
During graceful shutdown, the compute service keeps a second RPC
server active, which can be used to finish in-progress
operations. Like live migration, resize and cold migration
also perform RPC calls between the source and destination computes.
For those operations too, we can use the second RPC server to make
sure they are completed during graceful shutdown.
A quick overview of which RPC methods are involved in
resize/cold migration and which of them will use the second RPC server:
Resize/cold migration
- prep_resize: No, resize/migration is not started yet.
- resize_instance: Yes, here the resize/migration starts.
- finish_resize: Yes
- cross cell resize case:
- prep_snapshot_based_resize_at_dest: NO, this is the initial check and
the migration is not started yet
- prep_snapshot_based_resize_at_source: Yes, this starts the migration
Confirm resize: NO
- confirm_resize: NO
- cross cell confirm resize case:
- confirm_snapshot_based_resize - NO
Revert resize:
- revert_resize - NO
- check_instance_shared_storage: YES. This is called from dest to source
so we need source to respond to it so that revert can continue.
- finish_revert_resize on source: YES, at this stage the revert resize is
in progress and abandoning it here can leave the migration in an
unrecoverable state.
- cross cell revert case:
- revert_snapshot_based_resize_at_dest: NO
- finish_revert_snapshot_based_resize_at_source: YES
Partial implement blueprint nova-services-graceful-shutdown-part1
Change-Id: If08b698d012a75b587144501d829403ec616f685
Signed-off-by: Ghanshyam Maan <gmaan.os14@gmail.com>
This commit is contained in:
@@ -0,0 +1,46 @@
|
||||
#!/bin/bash
# Create a test server on the subnode, request its cold migration to the
# controller, and return as soon as the migration is observed in progress so
# that the caller can send SIGTERM to the source compute mid-operation.
#
# Usage: <this script> <server-name>
# Environment:
#   SUBNODE_HOSTNAME     host the server is created on (migration source)
#   CONTROLLER_HOSTNAME  host the server is migrated to
# Exit status:
#   0  the migration was seen in progress
#   2  the migration had already finished, or it was not seen in progress
#      before the polling limit was reached
#   1  a required argument/environment variable is missing
#   A failing command also ends the script with a non-zero status (set -e).
source /opt/stack/devstack/openrc admin
set -x
set -e

# Maximum number of polling iterations of the wait loop below. This is an
# iteration count, not seconds: the loop does not sleep between polls.
timeout=196

server_cm=${1:-}

# Fail early with a clear message. With an empty value, an unquoted
# "--host ${SUBNODE_HOSTNAME}" would swallow the next option as its argument.
# Exit status 1 is used on purpose: status 2 is reserved for the
# "completed or timed out" outcomes below.
if [ -z "${server_cm}" ] || [ -z "${SUBNODE_HOSTNAME:-}" ] || [ -z "${CONTROLLER_HOSTNAME:-}" ]; then
    echo "Usage: SUBNODE_HOSTNAME=<host> CONTROLLER_HOSTNAME=<host> $0 <server-name>" >&2
    exit 1
fi

# Use the first image, flavor and non-shared network returned by each listing.
image_id=$(openstack image list -f value -c ID | awk 'NR==1{print $1}')
flavor_id=$(openstack flavor list -f value -c ID | awk 'NR==1{print $1}')
network_id=$(openstack network list --no-share -f value -c ID | awk 'NR==1{print $1}')

echo "Creating test server on subnode for graceful shutdown cold migration test"
openstack --os-compute-api-version 2.74 server create --image "${image_id}" --flavor "${flavor_id}" \
    --nic net-id="${network_id}" --host "${SUBNODE_HOSTNAME}" --wait "${server_cm}"

echo "Starting cold migration of ${server_cm} to ${CONTROLLER_HOSTNAME}"
openstack --os-compute-api-version 2.56 server migrate \
    --host "${CONTROLLER_HOSTNAME}" "${server_cm}"

# Wait for the migrations to be in progress before returning so that the
# SIGTERM can be sent while the migrations are in progress.
count=0
while true; do
    # The server is selected with --server, the documented filter of
    # "server migration list". stderr of this command is discarded and the
    # pipeline status is that of "head", so a rejected argument would not
    # fail the script; it would only leave the status empty.
    cold_migration_status=$(openstack server migration list --server "${server_cm}" -f value -c Status 2>/dev/null | head -1)
    server_task_state=$(openstack server show "${server_cm}" -f value -c OS-EXT-STS:task_state 2>/dev/null)
    server_status=$(openstack server show "${server_cm}" -f value -c status 2>/dev/null)

    # Either the migration record or the server task state may show that the
    # operation is under way.
    if [ "${cold_migration_status}" == "migrating" ] || \
       [ "${cold_migration_status}" == "post-migrating" ] || \
       [ "${server_task_state}" == "resize_migrating" ] || \
       [ "${server_task_state}" == "resize_migrated" ] || \
       [ "${server_task_state}" == "resize_finish" ]; then
        echo "Cold migration is in progress"
        break
    elif [ "${cold_migration_status}" == "finished" ] || [ "${server_status}" == "VERIFY_RESIZE" ]; then
        # Too late to send SIGTERM mid-migration; report it with status 2.
        echo "Cold migration appears to have already completed"
        exit 2
    fi

    # NOTE(review): there is no sleep here, so polling runs back to back;
    # presumably intentional to catch a short in-progress window - confirm.
    count=$((count+1))
    if [ "${count}" -eq "${timeout}" ]; then
        echo "Timed out waiting for migrations to start"
        exit 2
    fi
done
|
||||
@@ -0,0 +1,35 @@
|
||||
#!/bin/bash
# Poll a server until its cold migration has completed.
#
# Usage: <this script> <server>
# Exit status:
#   0  the server reached VERIFY_RESIZE with no task state
#   5  the polling limit was reached first
#   6  the server went to ERROR
#   1  the server argument is missing
#   A failing command also ends the script with a non-zero status (set -e).
source /opt/stack/devstack/openrc admin
set -x
set -e

server=${1:-}
if [ -z "${server}" ]; then
    echo "Usage: $0 <server>" >&2
    exit 1
fi

# Wait for the server to finish cold migration and reach VERIFY_RESIZE state,
# which indicates the migration has completed and is awaiting confirmation.
# "timeout" is the maximum number of polls; each poll is followed by a
# 5 second sleep, so it is not a number of seconds.
timeout=360
count=0
# Taken when this script starts polling, so the duration reported below is
# measured from here rather than from when the migration was requested.
migration_start=$(date +%s)
while true; do
    status=$(openstack server show "${server}" -f value -c status)
    task_state=$(openstack server show "${server}" -f value -c OS-EXT-STS:task_state)

    # An idle task state is accepted either as the literal text "None" or as
    # an empty value.
    if [ "${status}" == "VERIFY_RESIZE" ] && { [ "${task_state}" == "None" ] || [ -z "${task_state}" ]; }; then
        migration_end=$(date +%s)
        migration_duration=$((migration_end - migration_start))
        echo "Cold migration completed in ${migration_duration} seconds."
        break
    fi

    if [ "${status}" == "ERROR" ]; then
        echo "Server went to ERROR status during cold migration"
        exit 6
    fi

    sleep 5
    count=$((count+1))
    if [ "${count}" -eq "${timeout}" ]; then
        echo "Timed out waiting for cold migration to complete"
        exit 5
    fi
done
|
||||
@@ -50,7 +50,57 @@
|
||||
script: "cleanup_test_servers.sh server-lm1"
|
||||
ignore_errors: true
|
||||
|
||||
# Cold migration scenario: start a cold migration of server-cm1, send SIGTERM
# to the devstack@n-cpu service on compute1 while the migration is in flight,
# then check that the migration still completes and the service stops.
- name: Graceful shutdown source compute cold migration
|
||||
block:
|
||||
- name: Start cold migrations of test servers
|
||||
become: true
|
||||
||||
become_user: stack
|
||||
script: "start_cold_migration.sh server-cm1"
|
||||
# Host names are handed to the script through its environment.
environment:
|
||||
SUBNODE_HOSTNAME: "{{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
CONTROLLER_HOSTNAME: "{{ hostvars['controller']['ansible_hostname'] }}"
|
||||
register: start_cold_migrations_result
|
||||
# rc 2 is tolerated here and recorded by the next task; the cold migration
# start script shown in this change exits 2 when the migration has already
# completed or was not seen in progress before its polling limit.
failed_when: start_cold_migrations_result.rc not in [0, 2]
|
||||
|
||||
- name: Set fact if migration is completed or timed out before SIGTERM to source compute
|
||||
set_fact:
|
||||
# True only when the start script returned 2.
cold_migrations_completed_or_timeout: "{{ start_cold_migrations_result.rc == 2 }}"
|
||||
|
||||
- name: Run graceful shutdown tests
|
||||
# Skipped when the migration could not be caught in progress; the
# "Fail if any test is skipped" task later reads the same fact.
when: not cold_migrations_completed_or_timeout
|
||||
block:
|
||||
- name: Send SIGTERM to source compute to start the source compute graceful shutdown
|
||||
delegate_to: compute1
|
||||
become: true
|
||||
# Signal 15 (SIGTERM) goes to the MainPID that systemd reports for
# devstack@n-cpu.
shell: "kill -15 $(systemctl show devstack@n-cpu -p MainPID --value)"
|
||||
|
||||
- name: Verify cold migration is completed during graceful shutdown
|
||||
become: true
|
||||
||||
become_user: stack
|
||||
script: "verify_cold_migration.sh server-cm1"
|
||||
|
||||
# Sleep for 180 sec: default graceful_shutdown_timeout
|
||||
- name: Sleep for 180 seconds to allow source compute graceful shutdown to complete
|
||||
pause:
|
||||
seconds: 180
|
||||
|
||||
- name: Verify compute service is stopped after graceful shutdown
|
||||
become: true
|
||||
||||
become_user: stack
|
||||
# NOTE(review): the trailing "inactive" argument is presumably the expected
# service state - confirm against start_and_verify_compute_service.sh.
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }} inactive"
|
||||
|
||||
- name: Start and verify subnode compute service is running
|
||||
become: true
|
||||
||||
become_user: stack
|
||||
script: "start_and_verify_compute_service.sh {{ hostvars['compute1']['ansible_hostname'] }}"
|
||||
|
||||
- name: Cleanup test servers
|
||||
become: true
|
||||
||||
become_user: stack
|
||||
script: "cleanup_test_servers.sh server-cm1"
|
||||
# Best-effort cleanup: an error in this task is ignored.
ignore_errors: true
|
||||
|
||||
# Fail the run when a scenario was skipped because its operation had already
# completed, or was never seen in progress, before SIGTERM could be sent.
# A task may carry only one `when` key (parsers keep the last duplicate), so
# the single condition below covers both the live and cold migration facts.
# NOTE(review): live_migrations_completed_or_timeout is set outside the
# visible part of this file - confirm it is always defined at this point.
- name: Fail if any test is skipped
  fail:
    msg: "One or more test is skipped due to operation is either completed or timed out before SIGTERM signal."
  when: live_migrations_completed_or_timeout or cold_migrations_completed_or_timeout
|
||||
|
||||
Reference in New Issue
Block a user