diff --git a/nova/compute/manager.py b/nova/compute/manager.py index b9e2442b9e..8a04bb20b1 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -560,7 +560,7 @@ class ComputeVirtAPI(virtapi.VirtAPI): class ComputeManager(manager.Manager): """Manages the running instances from creation to destruction.""" - target = messaging.Target(version='5.9') + target = messaging.Target(version='5.10') def __init__(self, compute_driver=None, *args, **kwargs): """Load configuration options and connect to the hypervisor.""" @@ -4686,6 +4686,153 @@ class ComputeManager(manager.Manager): self.rt.drop_move_claim(ctxt, instance, instance.node, instance_type=instance.new_flavor) + @wrap_exception() + @reverts_task_state + @wrap_instance_event(prefix='compute') + @errors_out_migration + @wrap_instance_fault + def finish_revert_snapshot_based_resize_at_source( + self, ctxt, instance, migration): + """Reverts a snapshot-based resize at the source host. + + Spawn the guest and re-connect volumes/VIFs on the source host and + revert the instance to use the old_flavor for resource usage reporting. + + Updates allocations in the placement service to move the source node + allocations, held by the migration record, to the instance and drop + the allocations held by the instance on the destination node. + + :param ctxt: nova auth request context targeted at the source cell + :param instance: Instance object whose vm_state is "resized" and + task_state is "resize_reverting". + :param migration: Migration object whose status is "reverting". + """ + + @utils.synchronized(instance.uuid) + def do_revert(): + LOG.info('Reverting resize on source host.', instance=instance) + with self._error_out_instance_on_exception(ctxt, instance): + self._finish_revert_snapshot_based_resize_at_source( + ctxt, instance, migration) + do_revert() + + # Broadcast to all schedulers that the instance is on this host. + # This is best effort so if anything fails just log it. 
+ try: + self._update_scheduler_instance_info(ctxt, instance) + except Exception as e: + LOG.warning('finish_revert_snapshot_based_resize_at_source failed ' + 'during post-processing. Error: %s', e, + instance=instance) + + def _finish_revert_snapshot_based_resize_at_source( + self, ctxt, instance, migration): + """Private version of finish_revert_snapshot_based_resize_at_source. + + This allows the main method to be decorated with error handlers. + + :param ctxt: nova auth request context targeted at the source cell + :param instance: Instance object whose vm_state is "resized" and + task_state is "resize_reverting". + :param migration: Migration object whose status is "reverting". + """ + # Delete stashed old_vm_state information. We will use this to + # determine if the guest should be powered on when we spawn it. + old_vm_state = instance.system_metadata.pop( + 'old_vm_state', vm_states.ACTIVE) + + # Update instance host/node and flavor-related fields. After this + # if anything fails the instance will get rebuilt/rebooted on this + # host. + self._finish_revert_resize_update_instance_flavor_host_node( + instance, migration) + + # Move the allocations against the source compute node resource + # provider, held by the migration, to the instance which will drop + # the destination compute node resource provider allocations held by + # the instance. This puts the allocations against the source node + # back to the old_flavor and owned by the instance. + try: + self._revert_allocation(ctxt, instance, migration) + except exception.AllocationMoveFailed: + # Log the error but do not re-raise because we want to continue to + # process ports and volumes below. + LOG.error('Reverting allocation in placement for migration ' + '%(migration_uuid)s failed. 
You may need to manually ' + 'remove the allocations for the migration consumer ' + 'against the source node resource provider ' + '%(source_provider)s and the allocations for the ' + 'instance consumer against the destination node ' + 'resource provider %(dest_provider)s and then run the ' + '"nova-manage placement heal_allocations" command.', + {'instance_uuid': instance.uuid, + 'migration_uuid': migration.uuid, + 'source_provider': migration.source_node, + 'dest_provider': migration.dest_node}, + instance=instance) + + bdms = instance.get_bdms() + # prep_snapshot_based_resize_at_source created empty volume attachments + # that we need to update here to get the connection_info before calling + # driver.finish_revert_migration which will connect the volumes to this + # host. + LOG.debug('Updating volume attachments for target host %s.', + self.host, instance=instance) + # TODO(mriedem): We should probably make _update_volume_attachments + # (optionally) graceful to errors so we (1) try to process all + # attachments and (2) continue to process networking below. + self._update_volume_attachments(ctxt, instance, bdms) + + LOG.debug('Updating port bindings for source host %s.', + self.host, instance=instance) + # TODO(mriedem): Calculate provider mappings when we support + # cross-cell resize/migrate with ports having resource requests. + self._finish_revert_resize_network_migrate_finish( + ctxt, instance, migration, provider_mappings=None) + network_info = self.network_api.get_instance_nw_info(ctxt, instance) + + # Remember that prep_snapshot_based_resize_at_source destroyed the + # guest but left the disks intact so we cannot call spawn() here but + # finish_revert_migration should do the job. 
+ block_device_info = self._get_instance_block_device_info( + ctxt, instance, bdms=bdms) + power_on = old_vm_state == vm_states.ACTIVE + driver_error = None + try: + self.driver.finish_revert_migration( + ctxt, instance, network_info, migration, + block_device_info=block_device_info, power_on=power_on) + except Exception as e: + driver_error = e + # Leave a hint about hard rebooting the guest and reraise so the + # instance is put into ERROR state. + with excutils.save_and_reraise_exception(logger=LOG): + LOG.error('An error occurred during finish_revert_migration. ' + 'The instance may need to be hard rebooted. Error: ' + '%s', driver_error, instance=instance) + else: + # Perform final cleanup of the instance in the database. + instance.drop_migration_context() + # If the original vm_state was STOPPED, set it back to STOPPED. + vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED + self._update_instance_after_spawn( + ctxt, instance, vm_state=vm_state) + instance.save(expected_task_state=[task_states.RESIZE_REVERTING]) + finally: + # Complete any volume attachments so the volumes are in-use. We + # do this regardless of finish_revert_migration failing because + # the instance is back on this host now and we do not want to leave + # the volumes in a pending state in case the instance is hard + # rebooted. 
+ LOG.debug('Completing volume attachments for instance on source ' + 'host.', instance=instance) + with excutils.save_and_reraise_exception( + reraise=driver_error is not None, logger=LOG): + self._complete_volume_attachments(ctxt, bdms) + + migration.status = 'reverted' + migration.save() @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') @@ -4789,6 +4936,27 @@ class ComputeManager(manager.Manager): LOG.error('Timeout waiting for Neutron events: %s', events, instance=instance) + def _finish_revert_resize_update_instance_flavor_host_node(self, instance, + migration): + """Updates host/node and flavor-related fields on the instance. + + This is used when finishing the revert resize operation on the source + host and updates the instance flavor-related fields back to the old + flavor and then nulls out the old/new_flavor fields. + + The instance host/node fields are also set back to the source compute + host/node. + + :param instance: Instance object + :param migration: Migration object + """ + self._set_instance_info(instance, instance.old_flavor) + instance.old_flavor = None + instance.new_flavor = None + instance.host = migration.source_compute + instance.node = migration.source_node + instance.save(expected_task_state=[task_states.RESIZE_REVERTING]) + @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') @@ -4817,12 +4985,8 @@ class ComputeManager(manager.Manager): old_vm_state = instance.system_metadata.pop('old_vm_state', vm_states.ACTIVE) - self._set_instance_info(instance, instance.old_flavor) - instance.old_flavor = None - instance.new_flavor = None - instance.host = migration.source_compute - instance.node = migration.source_node - instance.save() + self._finish_revert_resize_update_instance_flavor_host_node( + instance, migration) try: source_allocations = self._revert_allocation( diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index d42a96efda..a948f654e0 100644 --- a/nova/compute/rpcapi.py +++ 
b/nova/compute/rpcapi.py @@ -375,6 +375,7 @@ class ComputeAPI(object): * 5.7 - Add finish_snapshot_based_resize_at_dest() * 5.8 - Add confirm_snapshot_based_resize_at_source() * 5.9 - Add revert_snapshot_based_resize_at_dest() + * 5.10 - Add finish_revert_snapshot_based_resize_at_source() ''' VERSION_ALIASES = { @@ -732,6 +733,41 @@ class ComputeAPI(object): instance=instance, migration=migration, snapshot_id=snapshot_id, request_spec=request_spec) + def finish_revert_snapshot_based_resize_at_source( + self, ctxt, instance, migration): + """Reverts a snapshot-based resize at the source host. + + Spawn the guest and re-connect volumes/VIFs on the source host and + revert the instance to use the old_flavor for resource usage reporting. + + Updates allocations in the placement service to move the source node + allocations, held by the migration record, to the instance and drop + the allocations held by the instance on the destination node. + + This is a synchronous RPC call using the ``long_rpc_timeout`` + configuration option. + + :param ctxt: nova auth request context targeted at the source cell + :param instance: Instance object whose vm_state is "resized" and + task_state is "resize_reverting". + :param migration: Migration object whose status is "reverting". 
+ :raises: nova.exception.MigrationError if the source compute is too + old to perform the operation + :raises: oslo_messaging.exceptions.MessagingTimeout if the RPC call + times out + """ + version = '5.10' + client = self.router.client(ctxt) + if not client.can_send_version(version): + raise exception.MigrationError(reason=_('Compute too old')) + cctxt = client.prepare(server=migration.source_compute, + version=version, + call_monitor_timeout=CONF.rpc_response_timeout, + timeout=CONF.long_rpc_timeout) + return cctxt.call( + ctxt, 'finish_revert_snapshot_based_resize_at_source', + instance=instance, migration=migration) + def get_console_output(self, ctxt, instance, tail_length): version = '5.0' cctxt = self.router.client(ctxt).prepare( diff --git a/nova/objects/service.py b/nova/objects/service.py index 3e9c764c41..413827f1b3 100644 --- a/nova/objects/service.py +++ b/nova/objects/service.py @@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__) # NOTE(danms): This is the global service version counter -SERVICE_VERSION = 46 +SERVICE_VERSION = 47 # NOTE(danms): This is our SERVICE_VERSION history. 
The idea is that any @@ -173,6 +173,9 @@ SERVICE_VERSION_HISTORY = ( {'compute_rpc': '5.8'}, # Version 46: Compute RPC v5.9: revert_snapshot_based_resize_at_dest {'compute_rpc': '5.9'}, + # Version 47: Compute RPC v5.10: + # finish_revert_snapshot_based_resize_at_source + {'compute_rpc': '5.10'}, ) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index e819875be7..561700fc1e 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -7773,7 +7773,7 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, instance_uuid=self.instance.uuid, new_instance_type_id=7, dest_compute='dest_compute', - dest_node=None, + dest_node='dest_node', dest_host=None, source_compute='source_compute', source_node='source_node', @@ -11261,6 +11261,223 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase, self.context, self.instance, self.instance.node, instance_type=self.instance.new_flavor) + @mock.patch('nova.compute.manager.ComputeManager.' + '_finish_revert_snapshot_based_resize_at_source') + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager.' + '_update_scheduler_instance_info') + @mock.patch('nova.compute.utils.add_instance_fault_from_exc') + def test_finish_revert_snapshot_based_resize_at_source_error_handling( + self, mock_add_fault, mock_update_sched, mock_inst_save, + mock_finish_revert): + """Tests error handling (context managers, decorators, post-processing) + in finish_revert_snapshot_based_resize_at_source. + """ + self.instance.task_state = task_states.RESIZE_REVERTING + # First make _finish_revert_snapshot_based_resize_at_source fail. 
+ error = test.TestingException('oops') + mock_finish_revert.side_effect = error + self.assertRaises( + test.TestingException, + self.compute.finish_revert_snapshot_based_resize_at_source, + self.context, self.instance, self.migration) + mock_finish_revert.assert_called_once_with( + self.context, self.instance, self.migration) + # _error_out_instance_on_exception should have set the instance status + # to ERROR. + mock_inst_save.assert_called() + self.assertEqual(vm_states.ERROR, self.instance.vm_state) + # We should not have updated the scheduler since we failed. + mock_update_sched.assert_not_called() + # reverts_task_state should have set the task_state to None. + self.assertIsNone(self.instance.task_state) + # errors_out_migration should have set the migration status to error. + self.migration.save.assert_called_once_with() + self.assertEqual('error', self.migration.status) + # wrap_instance_fault should have recorded a fault. + mock_add_fault.assert_called_once_with( + self.context, self.instance, error, test.MatchType(tuple)) + # wrap_exception should have sent an error notification. + self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS)) + self.assertEqual( + 'compute.%s' % fields.NotificationAction.EXCEPTION, + fake_notifier.VERSIONED_NOTIFICATIONS[0]['event_type']) + + # Now run it again but _finish_revert_snapshot_based_resize_at_source + # will pass and _update_scheduler_instance_info will fail but not be + # fatal (just logged). + mock_finish_revert.reset_mock(side_effect=True) # reset side_effect + mock_update_sched.side_effect = test.TestingException('scheduler oops') + self.compute.finish_revert_snapshot_based_resize_at_source( + self.context, self.instance, self.migration) + mock_finish_revert.assert_called_once_with( + self.context, self.instance, self.migration) + mock_update_sched.assert_called_once_with(self.context, self.instance) + self.assertIn('finish_revert_snapshot_based_resize_at_source failed ' + 'during post-processing. 
Error: scheduler oops', + self.stdlog.logger.output) + + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation', + side_effect=exception.AllocationMoveFailed('placement down')) + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.compute.manager.ComputeManager.' + '_update_volume_attachments') + @mock.patch('nova.compute.manager.ComputeManager.' + '_finish_revert_resize_network_migrate_finish') + @mock.patch('nova.compute.manager.ComputeManager.' + '_get_instance_block_device_info') + @mock.patch('nova.objects.Instance.drop_migration_context') + @mock.patch('nova.compute.manager.ComputeManager.' + '_update_instance_after_spawn') + @mock.patch('nova.compute.manager.ComputeManager.' + '_complete_volume_attachments') + def test_finish_revert_snapshot_based_resize_at_source( + self, mock_complete_attachments, mock_update_after_spawn, + mock_drop_mig_context, mock_get_bdi, mock_net_migrate_finish, + mock_update_attachments, mock_get_bdms, mock_revert_allocs, + mock_inst_save): + """Happy path test for finish_revert_snapshot_based_resize_at_source. + Also makes sure some cleanups that are handled gracefully do not raise. + """ + # Make _update_scheduler_instance_info a no-op. + self.flags(track_instance_changes=False, group='filter_scheduler') + # Configure the instance with old_vm_state = STOPPED so the guest is + # created but not powered on. + self.instance.system_metadata['old_vm_state'] = vm_states.STOPPED + # Configure the instance with an old_flavor for the revert. + old_flavor = fake_flavor.fake_flavor_obj(self.context) + self.instance.old_flavor = old_flavor + with test.nested( + mock.patch.object(self.compute.network_api, + 'get_instance_nw_info'), + mock.patch.object(self.compute.driver, 'finish_revert_migration') + ) as ( + mock_get_nw_info, mock_driver_finish + ): + # Run the code. 
+ self.compute.finish_revert_snapshot_based_resize_at_source( + self.context, self.instance, self.migration) + # Assert the migration status was updated. + self.migration.save.assert_called_once_with() + self.assertEqual('reverted', self.migration.status) + # Assert the instance host/node and flavor info was updated. + self.assertEqual(self.migration.source_compute, self.instance.host) + self.assertEqual(self.migration.source_node, self.instance.node) + self.assertIs(self.instance.flavor, old_flavor) + self.assertEqual(old_flavor.id, self.instance.instance_type_id) + # Assert _revert_allocation was called, raised and logged the error. + mock_revert_allocs.assert_called_once_with( + self.context, self.instance, self.migration) + self.assertIn('Reverting allocation in placement for migration %s ' + 'failed.' % self.migration.uuid, + self.stdlog.logger.output) + # Assert that volume attachments were updated. + mock_update_attachments.assert_called_once_with( + self.context, self.instance, mock_get_bdms.return_value) + # Assert that port bindings were updated to point at the source host. + mock_net_migrate_finish.assert_called_once_with( + self.context, self.instance, self.migration, + provider_mappings=None) + # Assert the driver finished reverting the migration. + mock_get_bdi.assert_called_once_with( + self.context, self.instance, bdms=mock_get_bdms.return_value) + mock_driver_finish.assert_called_once_with( + self.context, self.instance, mock_get_nw_info.return_value, + self.migration, block_device_info=mock_get_bdi.return_value, + power_on=False) + # Assert final DB cleanup for the instance. + mock_drop_mig_context.assert_called_once_with() + mock_update_after_spawn.assert_called_once_with( + self.context, self.instance, vm_state=vm_states.STOPPED) + mock_inst_save.assert_has_calls([ + mock.call(expected_task_state=[task_states.RESIZE_REVERTING])] * 2) + # And finally that the volume attachments were completed. 
+ mock_complete_attachments.assert_called_once_with( + self.context, mock_get_bdms.return_value) + + @mock.patch('nova.objects.Instance.save') + @mock.patch('nova.compute.manager.ComputeManager._revert_allocation') + @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid') + @mock.patch('nova.compute.manager.ComputeManager.' + '_update_volume_attachments') + @mock.patch('nova.compute.manager.ComputeManager.' + '_finish_revert_resize_network_migrate_finish') + @mock.patch('nova.compute.manager.ComputeManager.' + '_get_instance_block_device_info') + @mock.patch('nova.objects.Instance.drop_migration_context') + @mock.patch('nova.compute.manager.ComputeManager.' + '_update_instance_after_spawn') + @mock.patch('nova.compute.manager.ComputeManager.' + '_complete_volume_attachments', + side_effect=test.TestingException('vol complete failed')) + def test_finish_revert_snapshot_based_resize_at_source_driver_fails( + self, mock_complete_attachments, mock_update_after_spawn, + mock_drop_mig_context, mock_get_bdi, mock_net_migrate_finish, + mock_update_attachments, mock_get_bdms, mock_revert_allocs, + mock_inst_save): + """Test for _finish_revert_snapshot_based_resize_at_source where the + driver call to finish_revert_migration fails. + """ + self.instance.system_metadata['old_vm_state'] = vm_states.ACTIVE + # Configure the instance with an old_flavor for the revert. + old_flavor = fake_flavor.fake_flavor_obj(self.context) + self.instance.old_flavor = old_flavor + with test.nested( + mock.patch.object(self.compute.network_api, + 'get_instance_nw_info'), + mock.patch.object(self.compute.driver, 'finish_revert_migration', + side_effect=test.TestingException('driver fail')) + ) as ( + mock_get_nw_info, mock_driver_finish, + ): + # Run the code. 
+ ex = self.assertRaises( + test.TestingException, + self.compute._finish_revert_snapshot_based_resize_at_source, + self.context, self.instance, self.migration) + # Assert the driver call (note power_on=True b/c old_vm_state=active) + mock_get_bdi.assert_called_once_with( + self.context, self.instance, bdms=mock_get_bdms.return_value) + mock_driver_finish.assert_called_once_with( + self.context, self.instance, mock_get_nw_info.return_value, + self.migration, block_device_info=mock_get_bdi.return_value, + power_on=True) + # _complete_volume_attachments is called even though the driver call + # failed. + mock_complete_attachments.assert_called_once_with( + self.context, mock_get_bdms.return_value) + # finish_revert_migration failed but _complete_volume_attachments + # is still called and in this case also fails so the resulting + # exception should be the one from _complete_volume_attachments + # but the finish_revert_migration error should have been logged. + self.assertIn('vol complete failed', six.text_type(ex)) + self.assertIn('driver fail', self.stdlog.logger.output) + # Assert the migration status was not updated. + self.migration.save.assert_not_called() + # Assert the instance host/node and flavor info was updated. + self.assertEqual(self.migration.source_compute, self.instance.host) + self.assertEqual(self.migration.source_node, self.instance.node) + self.assertIs(self.instance.flavor, old_flavor) + self.assertEqual(old_flavor.id, self.instance.instance_type_id) + # Assert allocations were reverted. + mock_revert_allocs.assert_called_once_with( + self.context, self.instance, self.migration) + # Assert that volume attachments were updated. + mock_update_attachments.assert_called_once_with( + self.context, self.instance, mock_get_bdms.return_value) + # Assert that port bindings were updated to point at the source host. 
+ mock_net_migrate_finish.assert_called_once_with( + self.context, self.instance, self.migration, + provider_mappings=None) + # Assert final DB cleanup for the instance did not happen. + mock_drop_mig_context.assert_not_called() + mock_update_after_spawn.assert_not_called() + # _finish_revert_resize_update_instance_flavor_host_node updated the + # instance. + mock_inst_save.assert_called_once_with( + expected_task_state=[task_states.RESIZE_REVERTING]) + class ComputeManagerInstanceUsageAuditTestCase(test.TestCase): def setUp(self): diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index c6786fbe43..22c37ec057 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -655,6 +655,36 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): migration=migration_obj.Migration(dest_compute='dest')) self.assertIn('Compute too old', six.text_type(ex)) + def test_finish_revert_snapshot_based_resize_at_source(self): + """Tests happy path for finish_revert_snapshot_based_resize_at_source. + """ + self.flags(long_rpc_timeout=1234) + self._test_compute_api( + 'finish_revert_snapshot_based_resize_at_source', 'call', + # compute method kwargs + instance=self.fake_instance_obj, + migration=migration_obj.Migration(source_compute='source'), + # client.prepare kwargs + version='5.10', prepare_server='source', + call_monitor_timeout=60, timeout=1234) + + @mock.patch('nova.rpc.ClientRouter.client') + def test_finish_revert_snapshot_based_resize_at_source_old_compute( + self, client): + """Tests when the source compute service is too old to call + finish_revert_snapshot_based_resize_at_source so MigrationError is + raised. 
+ """ + client.return_value.can_send_version.return_value = False + rpcapi = compute_rpcapi.ComputeAPI() + ex = self.assertRaises( + exception.MigrationError, + rpcapi.finish_revert_snapshot_based_resize_at_source, + self.context, + instance=self.fake_instance_obj, + migration=migration_obj.Migration(source_compute='source')) + self.assertIn('Compute too old', six.text_type(ex)) + def test_reboot_instance(self): self.maxDiff = None self._test_compute_api('reboot_instance', 'cast', diff --git a/nova/virt/fake.py b/nova/virt/fake.py index 16c951daa0..42e1117e66 100644 --- a/nova/virt/fake.py +++ b/nova/virt/fake.py @@ -254,8 +254,9 @@ class FakeDriver(driver.ComputeDriver): def finish_revert_migration(self, context, instance, network_info, migration, block_device_info=None, power_on=True): + state = power_state.RUNNING if power_on else power_state.SHUTDOWN self.instances[instance.uuid] = FakeInstance( - instance.name, power_state.RUNNING, instance.uuid) + instance.name, state, instance.uuid) def post_live_migration_at_destination(self, context, instance, network_info,