nova/nova/tests/functional/regressions/test_bug_1853009.py
Mark Goddard 2bb4527228 Invalidate provider tree when compute node disappears
There is a race condition in nova-compute with the ironic virt driver
as nodes get rebalanced. It can lead to compute nodes being removed
from the DB and not repopulated. Ultimately this prevents instances
from being scheduled to these nodes.

The issue being addressed here is that if a compute node is deleted by a
host which believes the node is an orphan, then the resource provider for
that node might also be deleted. The compute host that actually owns the
node might then fail to recreate the resource provider, because it still
exists in that host's provider tree cache.

This change fixes the issue by clearing from the provider tree cache any
resource provider for which no compute node entry exists. Then, when the
available resource for the node is updated, the resource provider is not
found in the cache and is recreated in placement.

Change-Id: Ia53ff43e6964963cdf295604ba0fb7171389606e
Related-Bug: #1853009
Related-Bug: #1841481
2021-08-12 14:26:45 +01:00
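The fix described above amounts to reconciling the report client's cached
provider tree with the compute nodes the host still knows about. A rough,
illustrative sketch of that idea follows; it is not the actual nova patch,
and the helper name and the comparison by compute node UUID are assumptions:

def _invalidate_stale_providers(provider_tree, compute_nodes):
    # Hypothetical helper: UUIDs of the compute nodes this host currently
    # owns.
    known_uuids = {cn.uuid for cn in compute_nodes}
    # Drop any cached provider whose compute node record has disappeared,
    # so the next update_available_resource run recreates it in placement.
    for rp_uuid in provider_tree.get_provider_uuids():
        if rp_uuid not in known_uuids:
            provider_tree.remove(rp_uuid)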


# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import mock
from nova import context
from nova import objects
from nova.tests.functional import integrated_helpers
class NodeRebalanceDeletedComputeNodeRaceTestCase(
integrated_helpers.ProviderUsageBaseTestCase,
):
"""Regression test for bug 1853009 observed in Rocky & later.
When an ironic node re-balances from one host to another, there can be a
race where the old host deletes the orphan compute node after the new host
has taken ownership of it which results in the new host failing to create
the compute node and resource provider because the ResourceTracker does not
detect a change.
"""
# Make sure we're using the fake driver that has predictable uuids
# for each node.
compute_driver = 'fake.PredictableNodeUUIDDriver'
def _assert_hypervisor_api(self, nodename, expected_host):
# We should have one compute node shown by the API.
hypervisors = self.api.api_get(
'/os-hypervisors/detail'
).body['hypervisors']
self.assertEqual(1, len(hypervisors), hypervisors)
hypervisor = hypervisors[0]
self.assertEqual(nodename, hypervisor['hypervisor_hostname'])
self.assertEqual(expected_host, hypervisor['service']['host'])
def _start_compute(self, host):
host = self.start_service('compute', host)
# Ironic compute driver has rebalances_nodes = True.
host.manager.driver.rebalances_nodes = True
return host
def setUp(self):
super(NodeRebalanceDeletedComputeNodeRaceTestCase, self).setUp()
self.nodename = 'fake-node'
self.ctxt = context.get_admin_context()
def test_node_rebalance_deleted_compute_node_race(self):
# Simulate a service running and then stopping. host_a runs, creates
# fake-node, then is stopped. The fake-node compute node is destroyed.
# This leaves a soft-deleted node in the DB.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])
host_a.manager.update_available_resource(self.ctxt)
host_a.stop()
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)
cn.destroy()
self.assertEqual(0, len(objects.ComputeNodeList.get_all(self.ctxt)))
# Now we create a new compute service to manage our node.
host_b = self._start_compute('host_b')
host_b.manager.driver._set_nodes([self.nodename])
# When start_service runs, it will create a host_b ComputeNode. We want
# to delete that and inject our fake node into the driver which will
# be re-balanced to another host later. First assert this actually
# exists.
self._assert_hypervisor_api('host_b', expected_host='host_b')
# Now run the update_available_resource periodic to register fake-node
# and have it managed by host_b. This will also detect the "host_b"
# node as orphaned and delete it along with its resource provider.
# host_b[1]: Finds no compute record in RT. Tries to create one
# (_init_compute_node).
# FIXME(mgoddard): This shows a traceback with SQL rollback due to
# soft-deleted node. The create seems to succeed but breaks the RT
# update for this node. See
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# There should only be one resource provider (fake-node).
original_rps = self._get_all_providers()
self.assertEqual(1, len(original_rps), original_rps)
self.assertEqual(self.nodename, original_rps[0]['name'])
# Simulate a re-balance by restarting host_a and make it manage
# fake-node. At this point both host_b and host_a think they own
# fake-node.
host_a = self._start_compute('host_a')
host_a.manager.driver._set_nodes([self.nodename])
# host_a[1]: Finds no compute record in RT, 'moves' existing node from
# host_b
host_a.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_b to host_a.
self.assertIn(
'ComputeNode fake-node moving from host_b to host_a',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_a')
# host_a[2]: Begins periodic update, queries compute nodes for this
# host, finds the fake-node.
cn = objects.ComputeNode.get_by_host_and_nodename(
self.ctxt, 'host_a', self.nodename,
)
# host_b[2]: Finds no compute record in RT, 'moves' existing node from
# host_a
host_b.manager.update_available_resource(self.ctxt)
# Assert that fake-node was re-balanced from host_a to host_b.
self.assertIn(
'ComputeNode fake-node moving from host_a to host_b',
self.stdlog.logger.output)
self._assert_hypervisor_api(self.nodename, expected_host='host_b')
# Complete rebalance, as host_a realises it does not own fake-node.
host_a.manager.driver._set_nodes([])
# host_a[2]: Deletes orphan compute node.
# Mock out the compute node query to simulate a race condition where
# the list includes an orphan compute node that is newly owned by
# host_b by the time host_a attempts to delete it.
# FIXME(mgoddard): Ideally host_a would not delete a node that does not
# belong to it. See https://bugs.launchpad.net/nova/+bug/1853009.
with mock.patch(
'nova.compute.manager.ComputeManager._get_compute_nodes_in_db'
) as mock_get:
mock_get.return_value = [cn]
host_a.manager.update_available_resource(self.ctxt)
# Verify that the node was deleted.
self.assertIn(
'Deleting orphan compute node %s hypervisor host '
'is fake-node, nodes are' % cn.id,
self.stdlog.logger.output)
hypervisors = self.api.api_get(
'/os-hypervisors/detail').body['hypervisors']
self.assertEqual(0, len(hypervisors), hypervisors)
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
# host_b[3]: Should recreate compute node and resource provider.
# FIXME(mgoddard): Resource provider not recreated here, due to
# https://bugs.launchpad.net/nova/+bug/1853159.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node was recreated.
self._assert_hypervisor_api(self.nodename, 'host_b')
# But due to https://bugs.launchpad.net/nova/+bug/1853159 the compute
# node is not cached in the RT.
self.assertNotIn(self.nodename, host_b.manager.rt.compute_nodes)
# There is no RP.
rps = self._get_all_providers()
self.assertEqual(0, len(rps), rps)
        # But the RP is no longer in the provider tree cache, so the next
        # update will recreate it in placement.
self.assertFalse(host_b.manager.rt.reportclient._provider_tree.exists(
self.nodename))
        # host_b[4]: Should add compute node to RT cache and recreate resource
        # provider.
host_b.manager.update_available_resource(self.ctxt)
# Verify that the node still exists.
self._assert_hypervisor_api(self.nodename, 'host_b')
# And it is now in the RT cache.
self.assertIn(self.nodename, host_b.manager.rt.compute_nodes)
# The resource provider has now been created.
rps = self._get_all_providers()
self.assertEqual(1, len(rps), rps)
self.assertEqual(self.nodename, rps[0]['name'])
        # The allocation cleanup for deleted instances was skipped in the
        # earlier periodic runs due to the lack of a resource provider.
self.assertIn(
'Skipping removal of allocations for deleted instances',
self.stdlog.logger.output)
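
For context on the _provider_tree.exists() assertions above: the report
client keeps an in-memory ProviderTree cache, and a provider that another
host has deleted from placement stays in that cache until something removes
it, which is exactly why the owning host did not recreate it. A minimal
standalone sketch of that cache behaviour, assuming the ProviderTree
new_root/exists/remove API (the node name and UUID below are arbitrary):

from nova.compute import provider_tree

tree = provider_tree.ProviderTree()
# The report client caches a root provider once it has created it in
# placement.
tree.new_root('fake-node', '00000000-0000-0000-0000-000000000001', 0)
assert tree.exists('fake-node')
# Nothing updates the cache if another host deletes the provider from
# placement; until the stale entry is cleared, this host believes the
# provider still exists and will not recreate it.
tree.remove('fake-node')
assert not tree.exists('fake-node')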