diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py
index 1690c85..5f46565 100644
--- a/zaza/openstack/charm_tests/hacluster/tests.py
+++ b/zaza/openstack/charm_tests/hacluster/tests.py
@@ -77,7 +77,11 @@ class HaclusterTest(HaclusterBaseTest):
 
 
 class HaclusterScalebackTest(HaclusterBaseTest):
-    """hacluster scaleback tests."""
+    """hacluster scaleback tests.
+
+    Use for testing older releases where lp:1400481 wasn't fixed yet.
+    Superseded by HaclusterScaleBackAndForthTest.
+    """
 
     @classmethod
     def setUpClass(cls):
@@ -126,7 +130,129 @@ class HaclusterScalebackTest(HaclusterBaseTest):
         logging.info('Waiting for model to settle')
         zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'active')
         # NOTE(lourot): the principle application sometimes remain blocked
-        # after scaling back up until lp:1400481 is solved.
-        # zaza.model.block_until_unit_wl_status(other_principle_unit, 'active')
+        # after scaling back up.
         zaza.model.block_until_all_units_idle()
         logging.debug('OK')
+
+
+class HaclusterScaleBackAndForthTest(HaclusterBaseTest):
+    """hacluster tests scaling back and forth.
+
+    Supersedes HaclusterScalebackTest.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Run class setup for running hacluster tests."""
+        super(HaclusterScaleBackAndForthTest, cls).setUpClass()
+        test_config = cls.test_config['tests_options']['hacluster']
+        cls._principle_app_name = test_config['principle-app-name']
+        cls._hacluster_charm_name = test_config['hacluster-charm-name']
+
+    def test_930_scaleback(self):
+        """Remove one unit, recalculate quorum and re-add one unit.
+
+        NOTE(lourot): before lp:1400481 was fixed, the corosync ring wasn't
+        recalculated when removing units. So within a cluster of 3 units,
+        removing a unit and re-adding one led to a situation where corosync
+        considers having 3 nodes online out of 4, instead of just 3 out of 3.
+        This test covers this scenario.
+        """
+        principle_units = sorted(zaza.model.get_status().applications[
+            self._principle_app_name]['units'].keys())
+        self.assertEqual(len(principle_units), 3)
+        surviving_principle_unit = principle_units[0]
+        doomed_principle_unit = principle_units[1]
+        surviving_hacluster_unit = juju_utils.get_subordinate_units(
+            [surviving_principle_unit],
+            charm_name=self._hacluster_charm_name)[0]
+        doomed_hacluster_unit = juju_utils.get_subordinate_units(
+            [doomed_principle_unit],
+            charm_name=self._hacluster_charm_name)[0]
+
+        logging.info('Pausing unit {}'.format(doomed_hacluster_unit))
+        zaza.model.run_action(
+            doomed_hacluster_unit,
+            'pause',
+            raise_on_failure=True)
+
+        logging.info('Removing {}'.format(doomed_principle_unit))
+        zaza.model.destroy_unit(
+            self._principle_app_name,
+            doomed_principle_unit,
+            wait_disappear=True)
+
+        logging.info('Waiting for model to settle')
+        zaza.model.block_until_unit_wl_status(surviving_hacluster_unit,
+                                              'blocked')
+        # NOTE(lourot): the surviving principle units (usually keystone units)
+        # aren't guaranteed to be blocked, so we don't validate that here.
+        zaza.model.block_until_all_units_idle()
+
+        # At this point the corosync ring hasn't been updated yet, so it should
+        # still remember the deleted unit:
+        self.__assert_some_corosync_nodes_are_offline(surviving_hacluster_unit)
+
+        logging.info('Updating corosync ring')
+        hacluster_app_name = zaza.model.get_unit_from_name(
+            surviving_hacluster_unit).application
+        zaza.model.run_action_on_leader(
+            hacluster_app_name,
+            'update-ring',
+            action_params={'i-really-mean-it': True},
+            raise_on_failure=True)
+
+        # At this point if the corosync ring has been properly updated, there
+        # shouldn't be any trace of the deleted unit anymore:
+        self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit)
+
+        logging.info('Re-adding an hacluster unit')
+        zaza.model.add_unit(self._principle_app_name, wait_appear=True)
+
+        logging.info('Waiting for model to settle')
+        # NOTE(lourot): the principle charm may remain blocked here. This seems
+        # to happen often when it is keystone and has a mysql-router as other
+        # subordinate charm. The keystone units seems to often remain blocked
+        # with 'Database not initialised'. This is not the hacluster charm's
+        # fault and this is why we don't validate here that the entire model
+        # goes back to active/idle.
+        zaza.model.block_until_unit_wl_status(surviving_hacluster_unit,
+                                              'active')
+        zaza.model.block_until_all_units_idle()
+
+        # Because of lp:1874719 the corosync ring may show a mysterious offline
+        # 'node1' node. We clean up the ring by re-running the 'update-ring'
+        # action:
+        logging.info('Updating corosync ring - workaround for lp:1874719')
+        zaza.model.run_action_on_leader(
+            hacluster_app_name,
+            'update-ring',
+            action_params={'i-really-mean-it': True},
+            raise_on_failure=True)
+
+        # At this point the corosync ring should not contain any offline node:
+        self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit)
+
+    def __assert_some_corosync_nodes_are_offline(self, hacluster_unit):
+        logging.info('Checking that corosync considers at least one node to '
+                     'be offline')
+        output = self._get_crm_status(hacluster_unit)
+        self.assertIn('OFFLINE', output,
+                      "corosync should list at least one offline node")
+
+    def __assert_all_corosync_nodes_are_online(self, hacluster_unit):
+        logging.info('Checking that corosync considers all nodes to be online')
+        output = self._get_crm_status(hacluster_unit)
+        self.assertNotIn('OFFLINE', output,
+                         "corosync shouldn't list any offline node")
+
+    @staticmethod
+    def _get_crm_status(hacluster_unit):
+        cmd = 'sudo crm status'
+        result = zaza.model.run_on_unit(hacluster_unit, cmd)
+        code = result.get('Code')
+        if code != '0':
+            raise zaza.model.CommandRunFailed(cmd, result)
+        output = result.get('Stdout').strip()
+        logging.debug('crm output received: {}'.format(output))
+        return output