diff --git a/doc/source/admin/manage-volumes.rst b/doc/source/admin/manage-volumes.rst index ef45d2c7aa..d274e8db7c 100644 --- a/doc/source/admin/manage-volumes.rst +++ b/doc/source/admin/manage-volumes.rst @@ -81,6 +81,18 @@ hosting the instance. This could even include refreshing different elements of the attachment to ensure the latest configuration changes within the environment have been applied. +.. note:: + + If you encounter any dangling volume attachments in either the Nova or + Cinder databases, a ``hard reboot`` of the affected instance can help + resolve the issue. During the instance reboot process, Nova performs + a synchronization mechanism that verifies the availability of volume + attachments in the Cinder database. Any missing or dangling/stale + attachments are detected and deleted from both Nova and Cinder during + ``hard reboot`` process. + + + Checking an existing attachment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/nova/compute/manager.py b/nova/compute/manager.py index e6ad4ac154..b87a695536 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -4174,6 +4174,56 @@ class ComputeManager(manager.Manager): accel_info = [] return accel_info + def _delete_dangling_bdms(self, context, instance, bdms): + """Deletes dangling or stale attachments for volume from + Nova and Cinder DB so both service DBs can be in sync. + + Retrieves volume attachments from the Nova block_device_mapping + table and verifies them with the Cinder volume_attachment table. + If attachment is not present in any one of the DBs, delete + attachments from the other DB. + + :param context: The nova request context. + :param instance: instance object. + :param instance: BlockDeviceMappingList list object. 
+ """ + + # attachments present in nova DB, ones nova knows about + nova_attachments = [] + bdms_to_delete = [] + for bdm in bdms.objects: + if bdm.volume_id and bdm.source_type == 'volume' and \ + bdm.destination_type == 'volume': + try: + self.volume_api.attachment_get(context, bdm.attachment_id) + except exception.VolumeAttachmentNotFound: + LOG.info( + f"Removing stale volume attachment " + f"'{bdm.attachment_id}' from instance for " + f"volume '{bdm.volume_id}'.", instance=instance) + bdm.destroy() + bdms_to_delete.append(bdm) + else: + nova_attachments.append(bdm.attachment_id) + + cinder_attachments = self.volume_api.attachment_get_all( + context, instance.uuid) + cinder_attachments = [each['id'] for each in cinder_attachments] + + if len(set(cinder_attachments) - set(nova_attachments)): + LOG.info( + "Removing stale volume attachments of instance from " + "Cinder", instance=instance) + for each_attach in set(cinder_attachments) - set(nova_attachments): + # delete only cinder known attachments, from cinder DB. + LOG.debug( + f"Removing attachment '{each_attach}'", instance=instance) + self.volume_api.attachment_delete(context, each_attach) + + # refresh bdms object + for bdm in bdms_to_delete: + bdms.objects.remove(bdm) + @wrap_exception() @reverts_task_state @wrap_instance_event(prefix='compute') @@ -4203,6 +4253,9 @@ class ComputeManager(manager.Manager): bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( context, instance.uuid) + + self._delete_dangling_bdms(context, instance, bdms) + block_device_info = self._get_instance_block_device_info( context, instance, bdms=bdms) @@ -5636,7 +5689,7 @@ class ComputeManager(manager.Manager): # in case _prep_resize fails. 
instance_state = instance.vm_state with self._error_out_instance_on_exception( - context, instance, instance_state=instance_state),\ + context, instance, instance_state=instance_state), \ errors_out_migration_ctxt(migration): self._send_prep_resize_notifications( diff --git a/nova/tests/functional/compute/test_attached_volumes.py b/nova/tests/functional/compute/test_attached_volumes.py index 7c064004d5..4304e3c812 100644 --- a/nova/tests/functional/compute/test_attached_volumes.py +++ b/nova/tests/functional/compute/test_attached_volumes.py @@ -66,10 +66,8 @@ class TestAttachedVolumes( # verify if volume attachment is present at nova bdm_list = self._get_bdm_list(server) - # TODO(auniyal): Reboot should remove stale bdms # after fix bdm should not have any volume - self.assertEqual(1, len(bdm_list)) - self.assertEqual(volume_id, bdm_list[0][0]) + self.assertEqual(0, len(bdm_list)) def test_delete_multiple_stale_attachment_from_nova(self): volumes = [ @@ -109,13 +107,12 @@ class TestAttachedVolumes( bdm_list_2 = self._get_bdm_list(server) bdm_attc_vols = [bdm[0] for bdm in bdm_list_2] - # after fix only 2 volumes should be attached instead 4 - self.assertEqual(4, len(bdm_list_2)) + self.assertEqual(2, len(bdm_list_2)) self.assertIn(bdm_list[0][0], bdm_attc_vols) - self.assertIn(bdm_list[1][0], bdm_attc_vols) + self.assertNotIn(bdm_list[1][0], bdm_attc_vols) self.assertIn(bdm_list[2][0], bdm_attc_vols) - self.assertIn(bdm_list[3][0], bdm_attc_vols) + self.assertNotIn(bdm_list[3][0], bdm_attc_vols) def test_delete_multiple_stale_attachment_from_cinder(self): volume_id = 'aeb9b5f4-1fe9-4964-ab65-5e168be4de8e' @@ -149,9 +146,7 @@ class TestAttachedVolumes( # rebooting server should remove stale attachments server = self._reboot_server(server, hard=True) - # TODO(auniyal): Reboot should remove only stale attachments - # from cinder too # verify if how many volume attachments are present at cinder attached_volume_ids = self.cinder.attachment_ids_for_instance( 
server['id']) - self.assertEqual(attachments + 1, len(attached_volume_ids)) + self.assertEqual(1, len(attached_volume_ids)) diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index 6259de7cdf..0d592fbdbe 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -1333,6 +1333,7 @@ class ProviderUsageBaseTestCase(test.TestCase, PlacementInstanceHelperMixin): self.policy = self.useFixture(nova_fixtures.RealPolicyFixture()) self.neutron = self.useFixture(nova_fixtures.NeutronFixture(self)) self.glance = self.useFixture(nova_fixtures.GlanceFixture(self)) + self.useFixture(nova_fixtures.CinderFixture(self)) self.placement = self.useFixture(func_fixtures.PlacementFixture()).api self.useFixture(nova_fixtures.AllServicesCurrent()) diff --git a/nova/tests/functional/regressions/test_bug_1835822.py b/nova/tests/functional/regressions/test_bug_1835822.py index 5736e9ea3d..92c1fffe3c 100644 --- a/nova/tests/functional/regressions/test_bug_1835822.py +++ b/nova/tests/functional/regressions/test_bug_1835822.py @@ -27,6 +27,7 @@ class RegressionTest1835822( self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) self.api = self.useFixture(nova_fixtures.OSAPIFixture( api_version='v2.1')).api diff --git a/nova/tests/functional/test_instance_actions.py b/nova/tests/functional/test_instance_actions.py index 060133ce93..d13b6ce4e2 100644 --- a/nova/tests/functional/test_instance_actions.py +++ b/nova/tests/functional/test_instance_actions.py @@ -85,6 +85,7 @@ class InstanceActionEventFaultsTestCase( self.useFixture(nova_fixtures.NeutronFixture(self)) self.useFixture(func_fixtures.PlacementFixture()) self.useFixture(nova_fixtures.RealPolicyFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) # Start the compute services. 
self.start_service('conductor') diff --git a/nova/tests/functional/test_server_faults.py b/nova/tests/functional/test_server_faults.py index edc3c3b377..632bb21f0a 100644 --- a/nova/tests/functional/test_server_faults.py +++ b/nova/tests/functional/test_server_faults.py @@ -33,6 +33,7 @@ class ServerFaultTestCase(test.TestCase, self.useFixture(func_fixtures.PlacementFixture()) self.useFixture(nova_fixtures.GlanceFixture(self)) self.useFixture(nova_fixtures.RealPolicyFixture()) + self.useFixture(nova_fixtures.CinderFixture(self)) # Start the compute services. self.start_service('conductor') diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 2004bcc8fb..447ec3199a 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -1471,6 +1471,132 @@ class ComputeVolumeTestCase(BaseTestCase): self.assertEqual(1, attach_block_devices.call_count) get_swap.assert_called_once_with([]) + def _test__delete_dangling_bdms( + self, instance, nova_bdms, cinder_attachments, valid=False): + with test.nested( + mock.patch.object(objects.BlockDeviceMappingList, + 'get_by_instance_uuid'), + mock.patch.object(objects.BlockDeviceMapping, 'destroy'), + mock.patch.object(cinder.API, 'attachment_get'), + mock.patch.object(cinder.API, 'attachment_get_all'), + mock.patch.object(cinder.API, 'attachment_delete') + ) as (mock_get_bdms, mock_destroy, mock_attach_get, + mock_all_attachments, mock_attachment_delete): + mock_get_bdms.return_value = nova_bdms + mock_all_attachments.return_value = cinder_attachments + + if not valid: + err = exception.VolumeAttachmentNotFound( + attachment_id=None) + mock_attach_get.side_effect = err + + self.compute._delete_dangling_bdms( + self.context, instance, nova_bdms) + + return mock_destroy, mock_attachment_delete + + def test_dangling_bdms_nothing_to_delete(self): + """no bdm, no attachments""" + instance = self._create_fake_instance_obj() + bdms = 
objects.BlockDeviceMappingList(objects=[]) + mock_dstr, mock_atach_del = self._test__delete_dangling_bdms( + instance, bdms, []) + self.assertTrue(mock_dstr.assert_not_called) + self.assertTrue(mock_atach_del.assert_not_called) + + def test_dangling_bdms_delete_from_bdm(self): + """valid source type: + able to retrieve from valid target attachmnet from cinder + bdm should not get deleted. + there is a 'valid' flag passed while + calling _test__delete_dangling_bdms + + invalid source type: + attachment is of image type, bdm should not get deleted + """ + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol2, + 'attachment_id': uuids.fake_attachment_2, + 'source_type': 'image', + 'destination_type': 'volume'})) + ]) + + mock_destroy, _ = self._test__delete_dangling_bdms( + instance, bdms, [], True) + # bdm.destroy never gets called + self.assertFalse(mock_destroy.called) + self.assertEqual(mock_destroy.call_count, 0) + # u_bdms are valid bdms, both image and volume are valid bdm + self.assertEqual(len(bdms), 2) + + def test_dangling_bdms_delete_from_multi_bdm(self): + """nova has bdms but they are not present at cinder side + both bdms should be deleted + """ + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + objects.BlockDeviceMapping( 
+ **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol2, + 'attachment_id': uuids.fake_attachment_2, + 'source_type': 'volume', + 'destination_type': 'volume'})) + ]) + + mock_destroy, _ = self._test__delete_dangling_bdms( + instance, bdms, []) + self.assertTrue(mock_destroy.called) + self.assertEqual(mock_destroy.call_count, 2) + self.assertEqual(len(bdms), 0) + + def test_dangling_bdms_delete_cinder_attachments(self): + """out of 2 one cinder attachment is present in nova side""" + instance = self._create_fake_instance_obj() + bdms = objects.BlockDeviceMappingList(objects=[ + objects.BlockDeviceMapping( + **fake_block_device.AnonFakeDbBlockDeviceDict( + { + 'instance_uuid': instance.uuid, + 'volume_id': uuids.fake_vol1, + 'attachment_id': uuids.fake_attachment_1, + 'source_type': 'volume', + 'destination_type': 'volume'})), + ]) + + cinder_attachments = [ + {'id': uuids.fake_attachment_1}, + {'id': 2}, + ] + _, mock_attachment_delete = self._test__delete_dangling_bdms( + instance, bdms, cinder_attachments, True) + + self.assertTrue(mock_attachment_delete.call_count, 1) + self.assertEqual(mock_attachment_delete.call_args_list[0][0][1], 2) + self.assertEqual(len(bdms), 1) + class ComputeTestCase(BaseTestCase, test_diagnostics.DiagnosticsComparisonMixin, @@ -2897,6 +3023,8 @@ class ComputeTestCase(BaseTestCase, reimage_boot_volume=False, target_state=None) self.compute.terminate_instance(self.context, instance, []) + @mock.patch.object(compute_manager.ComputeManager, + '_delete_dangling_bdms') @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') @mock.patch.object(compute_manager.ComputeManager, '_get_instance_block_device_info') @@ -2908,9 +3036,9 @@ class ComputeTestCase(BaseTestCase, @mock.patch('nova.compute.utils.notify_about_instance_action') def _test_reboot(self, soft, mock_notify_action, mock_get_power, mock_get_orig, mock_update, mock_notify_usage, - mock_get_blk, 
mock_get_bdms, test_delete=False, - test_unrescue=False, fail_reboot=False, - fail_running=False): + mock_get_blk, mock_get_bdms, mock_del_stale_bdms, + test_delete=False, test_unrescue=False, + fail_reboot=False, fail_running=False): reboot_type = soft and 'SOFT' or 'HARD' task_pending = (soft and task_states.REBOOT_PENDING or task_states.REBOOT_PENDING_HARD) @@ -3127,6 +3255,8 @@ class ComputeTestCase(BaseTestCase, def test_reboot_hard_and_delete_and_rescued(self): self._test_reboot(False, test_delete=True, test_unrescue=True) + @mock.patch.object(compute_manager.ComputeManager, + '_delete_dangling_bdms') @mock.patch('nova.virt.fake.FakeDriver.reboot') @mock.patch('nova.objects.instance.Instance.save') @mock.patch.object(objects.BlockDeviceMappingList, 'get_by_instance_uuid') @@ -3141,7 +3271,7 @@ class ComputeTestCase(BaseTestCase, def _test_reboot_with_accels(self, mock_notify_action, mock_get_power, mock_get_orig, mock_update, mock_notify_usage, mock_get_blk, mock_get_bdms, mock_inst_save, mock_reboot, - extra_specs=None, accel_info=None): + mock_del_stale_bdms, extra_specs=None, accel_info=None): self.compute.network_api.get_instance_nw_info = mock.Mock() diff --git a/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml b/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml new file mode 100644 index 0000000000..e758fb2e7f --- /dev/null +++ b/releasenotes/notes/delete-dangling-volumes-2615100187fe29fb.yaml @@ -0,0 +1,24 @@ +--- +features: + - | + This change ensures the synchronization of volume attachments + between Nova and Cinder, by deleting any dangling volume attachments + and maintaining consistency between two databases. + + Block device mapping (BDM) table in the Nova database, stores + information about volume attachments, image attachments + and swap attachments. Similarly, each volume attachment had a + corresponding entry in the Cinder database volume attachment table. 
+
+    With this change, on instance reboot, Nova will check all volume
+    attachments associated with the instance and verify their availability
+    in the Cinder database. If attachments are not found, they will be
+    deleted from the Nova database too.
+
+    After the Nova database cleanup, the Cinder database is similarly checked
+    for attachments related to the instance. Attachments found in Cinder that
+    are not present in Nova will be deleted from the Cinder database.
+
+    See `spec`__ for more details.
+
+    __ https://specs.openstack.org/openstack/nova-specs/specs/2023.2/approved/cleanup-dangling-volume-attachments.html