nova/nova/tests/functional/regressions/test_bug_1853009.py
Mark Goddard 2bb4527228 Invalidate provider tree when compute node disappears
There is a race condition in nova-compute with the ironic virt driver
as nodes get rebalanced. It can lead to compute nodes being removed
from the DB and not repopulated. Ultimately this prevents instances
from being scheduled to these nodes.

The issue being addressed here is that if a compute node is deleted by a
host which believes the node is an orphan, then the resource provider for
that node might also be deleted. The compute host that actually owns the
node might then fail to recreate the resource provider, because it still
exists in that host's provider tree cache.

This change fixes the issue by clearing from the provider tree cache any
resource provider for which no compute node entry exists. Then, when the
available resource for the node is updated, the resource provider is not
found in the cache and is recreated in placement.

Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e
Related-Bug: #1853009
Related-Bug: #1841481
2021-08-12 14:26:45 +01:00
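The fix described above amounts to reconciling the report client's cached
provider tree with the compute nodes the host still knows about. A rough,
illustrative sketch of that idea follows; it is not the actual nova patch,
and the helper name and the comparison by compute node UUID are assumptions:

def _invalidate_stale_providers(provider_tree, compute_nodes):
    # Hypothetical helper: UUIDs of the compute nodes this host currently
    # owns.
    known_uuids = {cn.uuid for cn in compute_nodes}
    # Drop any cached provider whose compute node record has disappeared,
    # so the next update_available_resource run recreates it in placement.
    for rp_uuid in provider_tree.get_provider_uuids():
        if rp_uuid not in known_uuids:
            provider_tree.remove(rp_uuid)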


# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers
class NodeRebalanceDeletedComputeNodeRaceTestCase(
integrated_helpers.ProviderUsageBaseTestCase,
):
"""Regression test for bug 1853009 observed in Rocky & later.
When an ironic node re-balances from one host to another, there can be a
race where the old host deletes the orphan compute node after the new host
has taken ownership of it which results in the new host failing to create
the compute node and resource provider because the ResourceTracker does not
detect a change.
"""
# Make sure we're using the fake driver that has predictable uuids
# for each node.
compute_driver = 'fake.PredictableNodeUUIDDriver'
def _assert_hypervisor_api(self, nodename, expected_host):
# We should have one compute node shown by the API.
hypervisors = self.api.api_get(
'/os-hypervisors/detail'
).body['hypervisors']
self.assertEqual(1, len(hypervisors), hypervisors)
hypervisor = hypervisors[0]
self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
self.assertEqual(expected_host, hypervisor['service']['host'])
def _start_compute(self, host):
host = self.start_service('compute', host)
# Ironic compute driver has rebalances_nodes = True.
host.manager.driver.rebalances_nodes = True
return host
def setUp(self):
super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
self.nodename = 'fake-node'
self.ctxt = context.get_admin_context()
def test_node_rebalance_deleted_compute_node_race(self):
# Simulate a service running and then stopping. host_a runs, creates
# fake-node, then is stopped. The fake-node compute node is destroyed.
# This leaves a soft-deleted node in the DB.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])
host_a.manager.update_available_resource(self.ctxt)
host_a.stop()
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)
cn.destroy()
self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt)))
# Now we create a new compute service to manage our node.
host_b = self._start_compute('host_b')
host_b.manager.driver._set_nodes([self.nodename])
# When start_service runs, it will create a host_b ComputeNode. We want
# to delete that and inject our fake node into the driver which will
# be re-balanced to another host later. First assert this actually
# exists.
self._assert_hypervisor_api('host_b', expected_host='host_b')
# Now run the update_available_resource periodic to register fake-node
# and have it managed by host_b. This will also detect the "host_b"
# node as orphaned and delete it along with its resource provider.
# host_b[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
# FIXME(mgoddard): This shows a traceback with SQL rollback due to
# soft-deleted node. The create seems to succeed but breaks the RT
# update for this node. See
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# There should only be one resource provider (fake-node).
original_rps = self._get_all_providers()
self.assertEqual(1, len(original_rps), original_rps)
self.assertEqual(self.nodename, original_rps[0]['name'])
# Simulate a re-balance by restarting host_a and make it manage
# fake-node. At this point both host_b and host_a think they own
# fake-node.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])
# host_a[1]: Finds no compute record in RT, 'moves' existing node from
# host_b
host_a.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_b to host_a.
self.assertIn(
'ComputeNode fake-node moving from host_b to host_a',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_a')
# host_a[2]: Begins periodic update, queries compute nodes for this
# host, finds the fake-node.
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)
# host_b[2]: Finds no compute record in RT, 'moves' existing node from
# host_a
host_b.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_a to host_b.
self.assertIn(
'ComputeNode fake-node moving from host_a to host_b',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# Complete rebalance, as host_a realises it does not own fake-node.
host_a.manager.driver._set_nodes([])
# host_a[2]: Deletes orphan compute node.
# Mock out the compute node query to simulate a race condition where
# the list includes an orphan compute node that is newly owned by
# host_b by the time host_a attempts to delete it.
# FIXME(mgoddard): Ideally host_a would not delete a node that does not
# belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
with mock.patch(
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
) as mock_get:
mock_get.return_value = [cn]
host_a.manager.update_available_resource(self.ctxt)
# Verify that the node was deleted.
self.assertIn(
'Deleting orphan compute node %s hypervisor host '
'is fake-node, nodes are' % cn.id,
self.stdlog.logger.output)
hypervisors = self.api.api_get(
'/os-hypervisors/detail').body['hypervisors']
self.assertEqual(0, len(hypervisors), hypervisors)
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
# host_b[3]: Should recreate compute node and resource provider.
# FIXME(mgoddard): Resource provider not recreated here, due to
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node was recreated.
self._assert_hypervisor_api(self.nodename, 'host_b')
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
# node is not cached in the RT.
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is no RP.
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
        # But the RP is no longer in the provider tree cache, so the next
        # update will recreate it in placement.
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
        # host_b[4]: Should add compute node to RT cache and recreate resource
        # provider.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node still exists.
self._assert_hypervisor_api(self.nodename, 'host_b')
# And it is now in the RT cache.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])
        # The allocation cleanup for deleted instances was skipped in the
        # earlier periodic runs due to the lack of a resource provider.
self.assertIn(
'Skipping removal of allocations for deleted instances',
self.stdlog.logger.output)
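
For context on the _provider_tree.exists() assertions above: the report
client keeps an in-memory ProviderTree cache, and a provider that another
host has deleted from placement stays in that cache until something removes
it, which is exactly why the owning host did not recreate it. A minimal
standalone sketch of that cache behaviour, assuming the ProviderTree
new_root/exists/remove API (the node name and UUID below are arbitrary):

from nova.compute import provider_tree

tree = provider_tree.ProviderTree()
# The report client caches a root provider once it has created it in
# placement.
tree.new_root('fake-node', '00000000-0000-0000-0000-000000000001', 0)
assert tree.exists('fake-node')
# Nothing updates the cache if another host deletes the provider from
# placement; until the stale entry is cleared, this host believes the
# provider still exists and will not recreate it.
tree.remove('fake-node')
assert not tree.exists('fake-node')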