Merge pull request #369 from auria/hacluster-scaleback

HaclusterScaleBackAndForthTest
2021-03-16 14:44:15 +03:00
parent fae2f6319e aafdc4070f
commit 79317b5e39
1 changed files with 129 additions and 3 deletions
@@ -77,7 +77,11 @@ class HaclusterTest(HaclusterBaseTest):


 class HaclusterScalebackTest(HaclusterBaseTest):
-    """hacluster scaleback tests."""
+    """hacluster scaleback tests.
+
+    Use for testing older releases where lp:1400481 wasn't fixed yet.
+    Superseded by HaclusterScaleBackAndForthTest.
+    """

    @classmethod
    def setUpClass(cls):
@@ -126,7 +130,129 @@ class HaclusterScalebackTest(HaclusterBaseTest):
        logging.info('Waiting for model to settle')
        zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'active')
        # NOTE(lourot): the principle application sometimes remain blocked
-        # after scaling back up until lp:1400481 is solved.
-        # zaza.model.block_until_unit_wl_status(other_principle_unit, 'active')
+        # after scaling back up.
        zaza.model.block_until_all_units_idle()
        logging.debug('OK')
+
+
+class HaclusterScaleBackAndForthTest(HaclusterBaseTest):
+    """hacluster tests scaling back and forth.
+
+    Removes one unit of the principle application, refreshes the corosync
+    ring, adds a unit back and checks the `crm status` output along the way.
+
+    Supersedes HaclusterScalebackTest.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Run class setup for running hacluster tests.
+
+        Reads the principle application name and the hacluster charm name
+        from the 'hacluster' section of the configured tests options.
+        """
+        super(HaclusterScaleBackAndForthTest, cls).setUpClass()
+        test_config = cls.test_config['tests_options']['hacluster']
+        cls._principle_app_name = test_config['principle-app-name']
+        cls._hacluster_charm_name = test_config['hacluster-charm-name']
+
+    def test_930_scaleback(self):
+        """Remove one unit, recalculate quorum and re-add one unit.
+
+        NOTE(lourot): before lp:1400481 was fixed, the corosync ring wasn't
+        recalculated when removing units. So within a cluster of 3 units,
+        removing a unit and re-adding one led to a situation where corosync
+        considers having 3 nodes online out of 4, instead of just 3 out of 3.
+        This test covers this scenario.
+
+        The test expects the principle application to have exactly 3 units
+        when it starts.
+        """
+        principle_units = sorted(zaza.model.get_status().applications[
+            self._principle_app_name]['units'].keys())
+        self.assertEqual(len(principle_units), 3)
+        # The first two units in sorted name order are used: the first one
+        # is kept, the second one is removed.
+        surviving_principle_unit = principle_units[0]
+        doomed_principle_unit = principle_units[1]
+        surviving_hacluster_unit = self.__get_hacluster_unit(
+            surviving_principle_unit)
+        doomed_hacluster_unit = self.__get_hacluster_unit(
+            doomed_principle_unit)
+
+        # The hacluster subordinate is paused before its principle unit is
+        # removed.
+        logging.info('Pausing unit {}'.format(doomed_hacluster_unit))
+        zaza.model.run_action(
+            doomed_hacluster_unit,
+            'pause',
+            raise_on_failure=True)
+
+        logging.info('Removing {}'.format(doomed_principle_unit))
+        zaza.model.destroy_unit(
+            self._principle_app_name,
+            doomed_principle_unit,
+            wait_disappear=True)
+
+        logging.info('Waiting for model to settle')
+        zaza.model.block_until_unit_wl_status(surviving_hacluster_unit,
+                                              'blocked')
+        # NOTE(lourot): the surviving principle units (usually keystone units)
+        # aren't guaranteed to be blocked, so we don't validate that here.
+        zaza.model.block_until_all_units_idle()
+
+        # At this point the corosync ring hasn't been updated yet, so it should
+        # still remember the deleted unit:
+        self.__assert_some_corosync_nodes_are_offline(surviving_hacluster_unit)
+
+        logging.info('Updating corosync ring')
+        hacluster_app_name = zaza.model.get_unit_from_name(
+            surviving_hacluster_unit).application
+        self.__update_corosync_ring(hacluster_app_name)
+
+        # At this point if the corosync ring has been properly updated, there
+        # shouldn't be any trace of the deleted unit anymore:
+        self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit)
+
+        logging.info('Re-adding an hacluster unit')
+        zaza.model.add_unit(self._principle_app_name, wait_appear=True)
+
+        logging.info('Waiting for model to settle')
+        # NOTE(lourot): the principle charm may remain blocked here. This seems
+        # to happen often when it is keystone and has a mysql-router as other
+        # subordinate charm. The keystone units seem to often remain blocked
+        # with 'Database not initialised'. This is not the hacluster charm's
+        # fault and this is why we don't validate here that the entire model
+        # goes back to active/idle.
+        zaza.model.block_until_unit_wl_status(surviving_hacluster_unit,
+                                              'active')
+        zaza.model.block_until_all_units_idle()
+
+        # Because of lp:1874719 the corosync ring may show a mysterious offline
+        # 'node1' node. We clean up the ring by re-running the 'update-ring'
+        # action:
+        logging.info('Updating corosync ring - workaround for lp:1874719')
+        self.__update_corosync_ring(hacluster_app_name)
+
+        # At this point the corosync ring should not contain any offline node:
+        self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit)
+
+    def __get_hacluster_unit(self, principle_unit):
+        """Return the first hacluster subordinate of a principle unit.
+
+        :param principle_unit: Name of the principle unit to look up.
+        :returns: First entry returned by juju_utils.get_subordinate_units
+                  for the configured hacluster charm name.
+        """
+        return juju_utils.get_subordinate_units(
+            [principle_unit],
+            charm_name=self._hacluster_charm_name)[0]
+
+    @staticmethod
+    def __update_corosync_ring(hacluster_app_name):
+        """Run the 'update-ring' action on the hacluster leader unit.
+
+        The action is run with 'i-really-mean-it' set to True and with
+        raise_on_failure=True.
+
+        :param hacluster_app_name: Name of the hacluster application.
+        """
+        zaza.model.run_action_on_leader(
+            hacluster_app_name,
+            'update-ring',
+            action_params={'i-really-mean-it': True},
+            raise_on_failure=True)
+
+    def __assert_some_corosync_nodes_are_offline(self, hacluster_unit):
+        """Assert that `crm status` on the unit mentions 'OFFLINE'.
+
+        :param hacluster_unit: Name of the unit to run `crm status` on.
+        """
+        logging.info('Checking that corosync considers at least one node to '
+                     'be offline')
+        output = self._get_crm_status(hacluster_unit)
+        self.assertIn('OFFLINE', output,
+                      "corosync should list at least one offline node")
+
+    def __assert_all_corosync_nodes_are_online(self, hacluster_unit):
+        """Assert that `crm status` on the unit doesn't mention 'OFFLINE'.
+
+        :param hacluster_unit: Name of the unit to run `crm status` on.
+        """
+        logging.info('Checking that corosync considers all nodes to be online')
+        output = self._get_crm_status(hacluster_unit)
+        self.assertNotIn('OFFLINE', output,
+                         "corosync shouldn't list any offline node")
+
+    @staticmethod
+    def _get_crm_status(hacluster_unit):
+        """Run `sudo crm status` on a unit and return its stripped stdout.
+
+        :param hacluster_unit: Name of the unit to run the command on.
+        :returns: The 'Stdout' entry of the command result, stripped.
+        :raises: zaza.model.CommandRunFailed if the 'Code' entry of the
+                 result isn't '0'.
+        """
+        cmd = 'sudo crm status'
+        result = zaza.model.run_on_unit(hacluster_unit, cmd)
+        code = result.get('Code')
+        if code != '0':
+            raise zaza.model.CommandRunFailed(cmd, result)
+        # NOTE(review): a result without 'Stdout' fails here with an
+        # AttributeError rather than yielding an empty status, which keeps
+        # the "no OFFLINE node" assertion from passing on missing output.
+        output = result.get('Stdout').strip()
+        logging.debug('crm output received: {}'.format(output))
+        return output