From add69c05072b4ee7efd4e75debf2172ed2269c86 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Fri, 6 Oct 2017 07:40:42 -0700
Subject: [PATCH] Always put 'uuid' into sort_keys for stable instance lists

If we're listing by sort keys that yield many ambiguous results, we
may exacerbate issues in client pagination because we're not even
bound by insertion order given that we have multiple databases being
queried in parallel. So, even if the client didn't ask for it, throw
'uuid' into the end of sort_keys to provide us a stable ordering. This
was done for the default case by always including 'id' in the default
set of sort_keys, although a user could still break if they request
their own keys.

Note this also removes the recently-added explicit sort in the
test_bug_1689692 case, since we're enforcing a strict ordering with
this patch. Also, mriedem is awesome.

Change-Id: Ida446acb1286a8b215451a5d8d7d23882643ef13
Closes-Bug: #1721791
---
 nova/compute/instance_list.py                 | 11 +++++
 .../functional/compute/test_instance_list.py  | 41 +++++++++++++++++--
 .../regressions/test_bug_1689692.py           |  5 +--
 3 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/nova/compute/instance_list.py b/nova/compute/instance_list.py
index 13a01aa13c..2390148fa6 100644
--- a/nova/compute/instance_list.py
+++ b/nova/compute/instance_list.py
@@ -114,6 +114,17 @@ def get_instances_sorted(ctx, filters, limit, marker, columns_to_join,
         sort_keys = ['created_at', 'id']
         sort_dirs = ['asc', 'asc']
 
+    if 'uuid' not in sort_keys:
+        # Historically the default sort includes 'id' (see above), which
+        # should give us a stable ordering. Since we're striping across
+        # cell databases here, many sort_keys arrangements will yield
+        # nothing unique across all the databases to give us a stable
+        # ordering, which can mess up expected client pagination behavior.
+        # So, throw uuid into the sort_keys at the end if it's not already
+        # there to keep us repeatable.
+        sort_keys = copy.copy(sort_keys) + ['uuid']
+        sort_dirs = copy.copy(sort_dirs) + ['asc']
+
     sort_ctx = InstanceSortContext(sort_keys, sort_dirs)
 
     if marker:
diff --git a/nova/tests/functional/compute/test_instance_list.py b/nova/tests/functional/compute/test_instance_list.py
index 25b0573a80..29790ef43e 100644
--- a/nova/tests/functional/compute/test_instance_list.py
+++ b/nova/tests/functional/compute/test_instance_list.py
@@ -31,7 +31,8 @@ class InstanceListTestCase(test.TestCase):
         self.num_instances = 3
         self.instances = []
 
-        dt = datetime.datetime(1985, 10, 25, 1, 21, 0)
+        start = datetime.datetime(1985, 10, 25, 1, 21, 0)
+        dt = start
         spread = datetime.timedelta(minutes=10)
 
         self.cells = objects.CellMappingList.get_all(self.context)
@@ -45,6 +46,7 @@ class InstanceListTestCase(test.TestCase):
                         context=cctx,
                         project_id=self.context.project_id,
                         user_id=self.context.user_id,
+                        created_at=start,
                         launched_at=dt,
                         instance_type_id=i,
                         hostname='%s-inst%i' % (cell.name, i))
@@ -164,6 +166,11 @@ class InstanceListTestCase(test.TestCase):
                 break
             insts.extend(batch)
             page += 1
+            if page > len(self.instances) * 2:
+                # Do this sanity check in case we introduce (or find) another
+                # repeating page bug like #1721791. Without this we loop
+                # until timeout, which is less obvious.
+                raise Exception('Infinite paging loop')
 
         # We should have requested exactly (or one more unlimited) pages
         self.assertIn(page, (pages, pages + 1))
@@ -172,11 +179,12 @@ class InstanceListTestCase(test.TestCase):
         found = [x[sort_by] for x in insts]
         had = [x[sort_by] for x in self.instances]
 
-        if sort_by == 'launched_at':
+        if sort_by in ('launched_at', 'created_at'):
             # We're comparing objects and database entries, so we need to
             # squash the tzinfo of the object ones so we can compare
             had = [x.replace(tzinfo=None) for x in had]
 
+        self.assertEqual(len(had), len(found))
         self.assertEqual(sorted(had), found)
 
     def test_get_sorted_with_limit_marker_stable(self):
@@ -219,6 +227,14 @@ class InstanceListTestCase(test.TestCase):
         """
         self._test_get_sorted_with_limit_marker(sort_by='launched_at')
 
+    def test_get_sorted_with_limit_marker_datetime_same(self):
+        """Test sorted by created_at.
+
+        This tests that we can do all of this, but with datetime
+        fields that are identical.
+        """
+        self._test_get_sorted_with_limit_marker(sort_by='created_at')
+
     def test_get_sorted_with_deleted_marker(self):
         marker = self.instances[1]['uuid']
 
@@ -376,7 +392,8 @@ class TestInstanceListObjects(test.TestCase):
         self.num_instances = 3
         self.instances = []
 
-        dt = datetime.datetime(1985, 10, 25, 1, 21, 0)
+        start = datetime.datetime(1985, 10, 25, 1, 21, 0)
+        dt = start
         spread = datetime.timedelta(minutes=10)
 
         cells = objects.CellMappingList.get_all(self.context)
@@ -390,6 +407,7 @@ class TestInstanceListObjects(test.TestCase):
                         context=cctx,
                         project_id=self.context.project_id,
                         user_id=self.context.user_id,
+                        created_at=start,
                         launched_at=dt,
                         instance_type_id=i,
                         hostname='%s-inst%i' % (cell.name, i))
@@ -451,3 +469,20 @@ class TestInstanceListObjects(test.TestCase):
         # actual faults
         self.assertEqual(2, len([inst for inst in insts
                                  if inst.fault]))
+
+    def test_get_instance_objects_sorted_paged(self):
+        """Query a full first page and ensure an empty second one.
+
+        This uses created_at which is enforced to be the same across
+        each instance by setUp(). This will help make sure we still
+        have a stable ordering, even when we only claim to care about
+        created_at.
+        """
+        instp1 = instance_list.get_instance_objects_sorted(
+            self.context, {}, None, None, [],
+            ['created_at'], ['asc'])
+        self.assertEqual(len(self.instances), len(instp1))
+        instp2 = instance_list.get_instance_objects_sorted(
+            self.context, {}, None, instp1[-1]['uuid'], [],
+            ['created_at'], ['asc'])
+        self.assertEqual(0, len(instp2))
diff --git a/nova/tests/functional/regressions/test_bug_1689692.py b/nova/tests/functional/regressions/test_bug_1689692.py
index ab76cbda99..1b6cf48d83 100644
--- a/nova/tests/functional/regressions/test_bug_1689692.py
+++ b/nova/tests/functional/regressions/test_bug_1689692.py
@@ -71,14 +71,13 @@ class ServerListLimitMarkerCell0Test(test.TestCase,
             self.addCleanup(self.api.delete_server, server['id'])
             self._wait_for_state_change(self.api, server, 'ERROR')
 
-        servers = self.api.get_servers(search_opts=dict(sort_key='uuid'))
+        servers = self.api.get_servers()
         self.assertEqual(3, len(servers))
 
         # Take the first server and user that as our marker.
         marker = servers[0]['id']
         # Since we're paging after the first server as our marker, there are
         # only two left so specifying three should just return two.
-        servers = self.api.get_servers(search_opts=dict(sort_key='uuid',
-                                                        marker=marker,
+        servers = self.api.get_servers(search_opts=dict(marker=marker,
                                                         limit=3))
         self.assertEqual(2, len(servers))