From 1a42eb9ec13b04c96f6eb380a53d1e656f087f36 Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Wed, 6 Feb 2019 18:49:41 -0500 Subject: [PATCH] Change live_migration_wait_for_vif_plug=True by default This resolves the TODO to make the option default to True so that the source compute service will wait for the "network-vif-plugged" event, initiated by vif plugging during pre_live_migration on the destination compute service, before initiating the guest transfer in the hypervisor. There are certain networking backends that will not send the neutron event for vif plugging alone (which is arguably a bug) but OVS and linuxbridge, probably the two most widely used in OpenStack deployments, are known to work with this config. While in here, the Timeout message is fleshed out to give more help with what the cause of the timeout could be and possible recourse. Change-Id: I8da38aec0fe4808273b8587ace3df9dbbc3ab576 --- nova/compute/manager.py | 16 +++++++++++++--- nova/conf/compute.py | 13 ++++--------- .../notification_sample_tests/test_instance.py | 3 +++ nova/tests/live_migration/hooks/ceph.sh | 3 --- nova/tests/unit/compute/test_compute.py | 1 - ...plug-stein-default-true-12103b09b8ac686a.yaml | 7 +++++++ 6 files changed, 27 insertions(+), 16 deletions(-) create mode 100644 releasenotes/notes/live_migration_wait_for_vif_plug-stein-default-true-12103b09b8ac686a.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 9fb4b3568b..3e9965da58 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -6296,12 +6296,22 @@ class ComputeManager(manager.Manager): self._cleanup_pre_live_migration( context, dest, instance, migration, migrate_data) except eventlet.timeout.Timeout: - msg = 'Timed out waiting for events: %s' - LOG.warning(msg, events, instance=instance) + # We only get here if wait_for_vif_plugged is True which means + # live_migration_wait_for_vif_plug=True on the destination host. + msg = ( + 'Timed out waiting for events: %(events)s. 
If these timeouts ' + 'are a persistent issue it could mean the networking backend ' + 'on host %(dest)s does not support sending these events ' + 'unless there are port binding host changes which does not ' + 'happen at this point in the live migration process. You may ' + 'need to disable the live_migration_wait_for_vif_plug option ' + 'on host %(dest)s.') + subs = {'events': events, 'dest': dest} + LOG.warning(msg, subs, instance=instance) if CONF.vif_plugging_is_fatal: self._cleanup_pre_live_migration( context, dest, instance, migration, migrate_data) - raise exception.MigrationError(reason=msg % events) + raise exception.MigrationError(reason=msg % subs) except Exception: with excutils.save_and_reraise_exception(): LOG.exception('Pre live migration failed at %s', diff --git a/nova/conf/compute.py b/nova/conf/compute.py index 4f78cdaf07..16c24b5f33 100644 --- a/nova/conf/compute.py +++ b/nova/conf/compute.py @@ -745,8 +745,7 @@ For example:: cpu_shared_set = "4-12,^8,15" """), cfg.BoolOpt('live_migration_wait_for_vif_plug', - # TODO(mriedem): Change to default=True starting in Stein. - default=False, + default=True, help=""" Determine if the source compute host should wait for a ``network-vif-plugged`` event from the (neutron) networking service before starting the actual transfer @@ -764,12 +763,9 @@ event may be triggered and then received on the source compute host and the source compute can wait for that event to ensure networking is set up on the destination host before starting the guest transfer in the hypervisor. -By default, this is False for two reasons: +.. note:: -1. Backward compatibility: deployments should test this out and ensure it works - for them before enabling it. - -2. 
The compute service cannot reliably determine which types of virtual + The compute service cannot reliably determine which types of virtual interfaces (``port.binding:vif_type``) will send ``network-vif-plugged`` events without an accompanying port ``binding:host_id`` change. Open vSwitch and linuxbridge should be OK, but OpenDaylight is at least @@ -780,8 +776,7 @@ Possible values: * True: wait for ``network-vif-plugged`` events before starting guest transfer * False: do not wait for ``network-vif-plugged`` events before starting guest - transfer (this is how things have always worked before this option - was introduced) + transfer (this is the legacy behavior) Related options: diff --git a/nova/tests/functional/notification_sample_tests/test_instance.py b/nova/tests/functional/notification_sample_tests/test_instance.py index de2d8c98ea..e1c48e5efe 100644 --- a/nova/tests/functional/notification_sample_tests/test_instance.py +++ b/nova/tests/functional/notification_sample_tests/test_instance.py @@ -43,6 +43,9 @@ class TestInstanceNotificationSampleWithMultipleCompute( self.useFixture(fixtures.AllServicesCurrent()) def test_multiple_compute_actions(self): + # There are not going to be real network-vif-plugged events coming + # so don't wait for them. 
+ self.flags(live_migration_wait_for_vif_plug=False, group='compute') server = self._boot_a_server( extra_params={'networks': [{'port': self.neutron.port_1['id']}]}) self._wait_for_notification('instance.create.end') diff --git a/nova/tests/live_migration/hooks/ceph.sh b/nova/tests/live_migration/hooks/ceph.sh index e5f46bb774..b6aa9e1be2 100755 --- a/nova/tests/live_migration/hooks/ceph.sh +++ b/nova/tests/live_migration/hooks/ceph.sh @@ -80,9 +80,6 @@ function _ceph_configure_nova { $ANSIBLE all --sudo -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_pool value=${NOVA_CEPH_POOL}" $ANSIBLE all --sudo -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=libvirt option=images_rbd_ceph_conf value=${CEPH_CONF_FILE}" - # Configure nova-compute to wait for network-vif-plugged events. - $ANSIBLE all --sudo -f 5 -i "$WORKSPACE/inventory" -m ini_file -a "dest=${NOVA_CONF} section=compute option=live_migration_wait_for_vif_plug value=True" - sudo ceph -c ${CEPH_CONF_FILE} auth get-or-create client.${CINDER_CEPH_USER} \ mon "allow r" \ osd "allow class-read object_prefix rbd_children, allow rwx pool=${CINDER_CEPH_POOL}, allow rwx pool=${NOVA_CEPH_POOL},allow rwx pool=${GLANCE_CEPH_POOL}" | \ diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 0d2df5a414..1bd6e6626e 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -6157,7 +6157,6 @@ class ComputeTestCase(BaseTestCase, with mock.patch.object(self.compute.network_api, 'setup_networks_on_host') as mock_setup: - self.flags(live_migration_wait_for_vif_plug=True, group='compute') ret = self.compute.pre_live_migration(c, instance=instance, block_migration=False, disk=None, diff --git a/releasenotes/notes/live_migration_wait_for_vif_plug-stein-default-true-12103b09b8ac686a.yaml 
b/releasenotes/notes/live_migration_wait_for_vif_plug-stein-default-true-12103b09b8ac686a.yaml new file mode 100644 index 0000000000..40e77c50bf --- /dev/null +++ b/releasenotes/notes/live_migration_wait_for_vif_plug-stein-default-true-12103b09b8ac686a.yaml @@ -0,0 +1,7 @@ +--- +upgrade: + - | + The default value for the ``[compute]/live_migration_wait_for_vif_plug`` + configuration option has been changed to True. As noted in the help text + for the option, some networking backends will not work with this set to + True, although OVS and linuxbridge will.