From 5579f2f110b8b08097c6a7b189669886a10cb378 Mon Sep 17 00:00:00 2001 From: Alvaro Uria Date: Mon, 20 Jul 2020 19:24:33 +0200 Subject: [PATCH 01/17] HaclusterTest: support scaleback test on bionic --- zaza/openstack/charm_tests/hacluster/tests.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 199d915..3ae13d1 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -21,6 +21,7 @@ import os import zaza.openstack.charm_tests.test_utils as test_utils import zaza.openstack.configure.hacluster +import zaza.utilities.juju as juju_utils class HaclusterTest(test_utils.OpenStackBaseTest): @@ -69,3 +70,71 @@ class HaclusterTest(test_utils.OpenStackBaseTest): self._toggle_maintenance_and_wait('true') self._toggle_maintenance_and_wait('false') + + def test_930_scaleback_bionic(self): + """Remove a unit, recalculate quorum and add a new one.""" + principle_app = 'keystone' + principle_units = zaza.model.get_status().applications[ + principle_app]['units'] + self.assertEqual(len(principle_units), 3) + doomed_principle = sorted(principle_units.keys())[0] + series = juju_utils.get_machine_series( + principle_units[doomed_principle].machine) + if series != 'bionic': + logging.debug("noop - only run test in bionic") + logging.info('SKIP') + return + + doomed_unit = juju_utils.get_subordinate_units( + [doomed_principle], charm_name='hac')[0] + + logging.info('Pausing unit {}'.format(doomed_unit)) + zaza.model.run_action( + doomed_unit, + 'pause', + raise_on_failure=True) + logging.info('OK') + + logging.info('Resuming unit {}'.format(doomed_unit)) + zaza.model.run_action( + doomed_unit, + 'resume', + raise_on_failure=True) + logging.info('OK') + + logging.info('Removing {}'.format(doomed_principle)) + zaza.model.destroy_unit( + principle_app, + doomed_principle, + wait_disappear=True) + logging.info('OK') + + logging.info('Updating corosync ring') + zaza.model.run_action_on_leader( + self.application_name, + 'update-ring', + action_params={'i-really-mean-it': True}, + raise_on_failure=True) + + _states = { + self.application_name: { + "workload-status": "blocked", + "workload-status-message": + "Insufficient peer units for ha cluster (require 3)" + }, + 'keystone': { + "workload-status": "blocked", + "workload-status-message": "Database not initialised", + }, + } + zaza.model.wait_for_application_states(states=_states) + zaza.model.block_until_all_units_idle() + logging.info('OK') + + logging.info('Adding a hacluster unit') + zaza.model.add_unit(principle_app, wait_appear=True) + _states = {self.application_name: { + "workload-status": "active", + "workload-status-message": "Unit is ready and clustered"}} + zaza.model.wait_for_application_states(states=_states) + logging.debug('OK') From 573acf9d1c0e2490589a4246b90083fc4113fdc9 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 27 Aug 2020 11:54:42 +0200 Subject: [PATCH 02/17] Move new scaleback test to new test class --- zaza/openstack/charm_tests/hacluster/tests.py | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 3ae13d1..7715a4a 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -24,15 +24,19 @@ import zaza.openstack.configure.hacluster import zaza.utilities.juju as juju_utils -class HaclusterTest(test_utils.OpenStackBaseTest): - """hacluster tests.""" +class HaclusterBaseTest(test_utils.OpenStackBaseTest): + """Base class for hacluster tests.""" @classmethod def setUpClass(cls): """Run class setup for running hacluster tests.""" - super(HaclusterTest, cls).setUpClass() + super(HaclusterBaseTest, cls).setUpClass() cls.vip = os.environ.get("TEST_VIP00") + +class HaclusterTest(HaclusterBaseTest): + """hacluster tests.""" + def test_900_action_cleanup(self): """The services can be cleaned up.""" zaza.model.run_action_on_leader( @@ -71,22 +75,20 @@ class HaclusterTest(test_utils.OpenStackBaseTest): self._toggle_maintenance_and_wait('true') self._toggle_maintenance_and_wait('false') - def test_930_scaleback_bionic(self): + +class HaclusterScalebackTest(HaclusterBaseTest): + """hacluster scaleback tests.""" + + _PRINCIPLE_APP = 'keystone' + + def test_930_scaleback(self): """Remove a unit, recalculate quorum and add a new one.""" - principle_app = 'keystone' principle_units = zaza.model.get_status().applications[ - principle_app]['units'] + self._PRINCIPLE_APP]['units'] self.assertEqual(len(principle_units), 3) doomed_principle = sorted(principle_units.keys())[0] - series = juju_utils.get_machine_series( - principle_units[doomed_principle].machine) - if series != 'bionic': - logging.debug("noop - only run test in bionic") - logging.info('SKIP') - return - doomed_unit = juju_utils.get_subordinate_units( - [doomed_principle], charm_name='hac')[0] + [doomed_principle], charm_name=self.application_name)[0] logging.info('Pausing unit {}'.format(doomed_unit)) zaza.model.run_action( @@ -104,7 +106,7 @@ class HaclusterTest(test_utils.OpenStackBaseTest): logging.info('Removing {}'.format(doomed_principle)) zaza.model.destroy_unit( - principle_app, + self._PRINCIPLE_APP, doomed_principle, wait_disappear=True) logging.info('OK') @@ -116,25 +118,25 @@ class HaclusterTest(test_utils.OpenStackBaseTest): action_params={'i-really-mean-it': True}, raise_on_failure=True) - _states = { + expected_states = { self.application_name: { "workload-status": "blocked", "workload-status-message": "Insufficient peer units for ha cluster (require 3)" }, - 'keystone': { + self._PRINCIPLE_APP: { "workload-status": "blocked", "workload-status-message": "Database not initialised", }, } - zaza.model.wait_for_application_states(states=_states) + zaza.model.wait_for_application_states(states=expected_states) zaza.model.block_until_all_units_idle() logging.info('OK') logging.info('Adding a hacluster unit') - zaza.model.add_unit(principle_app, wait_appear=True) - _states = {self.application_name: { + zaza.model.add_unit(self._PRINCIPLE_APP, wait_appear=True) + expected_states = {self.application_name: { "workload-status": "active", "workload-status-message": "Unit is ready and clustered"}} - zaza.model.wait_for_application_states(states=_states) + zaza.model.wait_for_application_states(states=expected_states) logging.debug('OK') From e48da475a2fe4a4b7d6b210def38aacfee05b83e Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 27 Aug 2020 12:57:46 +0200 Subject: [PATCH 03/17] No need to resume hacluster unit before scaling down --- zaza/openstack/charm_tests/hacluster/tests.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 7715a4a..28f5248 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -97,13 +97,6 @@ class HaclusterScalebackTest(HaclusterBaseTest): raise_on_failure=True) logging.info('OK') - logging.info('Resuming unit {}'.format(doomed_unit)) - zaza.model.run_action( - doomed_unit, - 'resume', - raise_on_failure=True) - logging.info('OK') - logging.info('Removing {}'.format(doomed_principle)) zaza.model.destroy_unit( self._PRINCIPLE_APP, From b72855ca96d15a11c43e8b56a63a7f968e91d847 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Fri, 28 Aug 2020 13:15:31 +0200 Subject: [PATCH 04/17] Make HaclusterScalebackTest runnable against any charm --- zaza/openstack/charm_tests/hacluster/tests.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 28f5248..76eced8 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -79,16 +79,18 @@ class HaclusterTest(HaclusterBaseTest): class HaclusterScalebackTest(HaclusterBaseTest): """hacluster scaleback tests.""" - _PRINCIPLE_APP = 'keystone' + _PRINCIPLE_APP_NAME = 'keystone' + _HACLUSTER_APP_NAME = 'hacluster' + _HACLUSTER_CHARM_NAME = 'hacluster' def test_930_scaleback(self): """Remove a unit, recalculate quorum and add a new one.""" principle_units = zaza.model.get_status().applications[ - self._PRINCIPLE_APP]['units'] + self._PRINCIPLE_APP_NAME]['units'] self.assertEqual(len(principle_units), 3) doomed_principle = sorted(principle_units.keys())[0] doomed_unit = juju_utils.get_subordinate_units( - [doomed_principle], charm_name=self.application_name)[0] + [doomed_principle], charm_name=self._HACLUSTER_CHARM_NAME)[0] logging.info('Pausing unit {}'.format(doomed_unit)) zaza.model.run_action( @@ -99,25 +101,25 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('Removing {}'.format(doomed_principle)) zaza.model.destroy_unit( - self._PRINCIPLE_APP, + self._PRINCIPLE_APP_NAME, doomed_principle, wait_disappear=True) logging.info('OK') logging.info('Updating corosync ring') zaza.model.run_action_on_leader( - self.application_name, + self._HACLUSTER_APP_NAME, 'update-ring', action_params={'i-really-mean-it': True}, raise_on_failure=True) expected_states = { - self.application_name: { + self._HACLUSTER_APP_NAME: { "workload-status": "blocked", "workload-status-message": "Insufficient peer units for ha cluster (require 3)" }, - self._PRINCIPLE_APP: { + self._PRINCIPLE_APP_NAME: { "workload-status": "blocked", "workload-status-message": "Database not initialised", }, @@ -127,8 +129,8 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('OK') logging.info('Adding a hacluster unit') - zaza.model.add_unit(self._PRINCIPLE_APP, wait_appear=True) - expected_states = {self.application_name: { + zaza.model.add_unit(self._PRINCIPLE_APP_NAME, wait_appear=True) + expected_states = {self._HACLUSTER_APP_NAME: { "workload-status": "active", "workload-status-message": "Unit is ready and clustered"}} zaza.model.wait_for_application_states(states=expected_states) From 14f7e3848cb5989912769e3a7c47345cd1f56624 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Tue, 8 Sep 2020 12:51:53 +0200 Subject: [PATCH 05/17] Remove hacluster test option hacluster_app_name --- zaza/openstack/charm_tests/hacluster/tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 2728a68..e02832b 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -85,7 +85,6 @@ class HaclusterScalebackTest(HaclusterBaseTest): super(HaclusterScalebackTest, cls).setUpClass() test_config = cls.test_config['tests_options']['hacluster'] cls._principle_app_name = test_config['principle-app-name'] - cls._hacluster_app_name = test_config['hacluster-app-name'] cls._hacluster_charm_name = test_config['hacluster-charm-name'] def test_930_scaleback(self): @@ -121,8 +120,9 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('OK') logging.info('Updating corosync ring') + hacluster_app_name = other_hacluster_unit.application zaza.model.run_action_on_leader( - self._hacluster_app_name, + hacluster_app_name, 'update-ring', action_params={'i-really-mean-it': True}, raise_on_failure=True) @@ -132,7 +132,7 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('OK') logging.info('Waiting for model to settle') - expected_states = {self._hacluster_app_name: { + expected_states = {hacluster_app_name: { "workload-status": "active", "workload-status-message": "Unit is ready and clustered"}} zaza.model.wait_for_application_states(states=expected_states) From 11f3060f6b1564d6497cbfc2b2f80251bcc27ca3 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Wed, 9 Sep 2020 10:08:37 +0200 Subject: [PATCH 06/17] Fix unit-name -> app-name conversion --- zaza/openstack/charm_tests/hacluster/tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index e02832b..29751e1 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -120,7 +120,8 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('OK') logging.info('Updating corosync ring') - hacluster_app_name = other_hacluster_unit.application + hacluster_app_name = zaza.model.get_unit_from_name( + other_hacluster_unit).application zaza.model.run_action_on_leader( hacluster_app_name, 'update-ring', From 69b61c63f4ca647e597c17a98baa33d1fe9ee0a7 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Fri, 27 Nov 2020 12:04:43 +0100 Subject: [PATCH 07/17] Add HaclusterScaleBackAndForthTest --- zaza/openstack/charm_tests/hacluster/tests.py | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 29751e1..99272ed 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -77,7 +77,67 @@ class HaclusterTest(HaclusterBaseTest): class HaclusterScalebackTest(HaclusterBaseTest): - """hacluster scaleback tests.""" + """hacluster scaleback tests. + + Use for testing older releases where lp:1400481 wasn't fixed yet. + Superseded by HaclusterScaleBackAndForthTest. + """ + + @classmethod + def setUpClass(cls): + """Run class setup for running hacluster scaleback tests.""" + super(HaclusterScalebackTest, cls).setUpClass() + test_config = cls.test_config['tests_options']['hacluster'] + cls._principle_app_name = test_config['principle-app-name'] + cls._hacluster_charm_name = test_config['hacluster-charm-name'] + + def test_930_scaleback(self): + """Remove a unit and add a new one.""" + principle_units = sorted(zaza.model.get_status().applications[ + self._principle_app_name]['units'].keys()) + self.assertEqual(len(principle_units), 3) + doomed_principle_unit = principle_units[0] + other_principle_unit = principle_units[1] + doomed_hacluster_unit = juju_utils.get_subordinate_units( + [doomed_principle_unit], charm_name=self._hacluster_charm_name)[0] + other_hacluster_unit = juju_utils.get_subordinate_units( + [other_principle_unit], charm_name=self._hacluster_charm_name)[0] + + logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) + zaza.model.run_action( + doomed_hacluster_unit, + 'pause', + raise_on_failure=True) + logging.info('OK') + + logging.info('Removing {}'.format(doomed_principle_unit)) + zaza.model.destroy_unit( + self._principle_app_name, + doomed_principle_unit, + wait_disappear=True) + logging.info('OK') + + logging.info('Waiting for model to settle') + zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'blocked') + zaza.model.block_until_unit_wl_status(other_principle_unit, 'blocked') + zaza.model.block_until_all_units_idle() + logging.info('OK') + + logging.info('Adding an hacluster unit') + zaza.model.add_unit(self._principle_app_name, wait_appear=True) + logging.info('OK') + + logging.info('Waiting for model to settle') + zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'active') + # NOTE(lourot): the principle application remains blocked after scaling + # back up until lp:1400481 is solved. + zaza.model.block_until_unit_wl_status(other_principle_unit, 'blocked') + zaza.model.block_until_all_units_idle() + logging.debug('OK') + + +class HaclusterScaleBackAndForthTest(HaclusterBaseTest): + """hacluster tests scaling back and forth.""" @classmethod def setUpClass(cls): From a3248185f147d04774c3e8f2e32c84d386be385c Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Fri, 27 Nov 2020 13:52:44 +0100 Subject: [PATCH 08/17] Fix copy-paste mistake --- zaza/openstack/charm_tests/hacluster/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 99272ed..517cde2 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -141,8 +141,8 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): @classmethod def setUpClass(cls): - """Run class setup for running hacluster scaleback tests.""" - super(HaclusterScalebackTest, cls).setUpClass() + """Run class setup for running hacluster tests.""" + super(HaclusterScaleBackAndForthTest, cls).setUpClass() test_config = cls.test_config['tests_options']['hacluster'] cls._principle_app_name = test_config['principle-app-name'] cls._hacluster_charm_name = test_config['hacluster-charm-name'] From 1c4d2e05aff4e56859afec4ce80b4e079a41175e Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 18 Feb 2021 09:51:00 +0100 Subject: [PATCH 09/17] Let test_930_scaleback remove 2 units instead of 1 --- zaza/openstack/charm_tests/hacluster/tests.py | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 28d0383..b1c0713 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -137,7 +137,10 @@ class HaclusterScalebackTest(HaclusterBaseTest): class HaclusterScaleBackAndForthTest(HaclusterBaseTest): - """hacluster tests scaling back and forth.""" + """hacluster tests scaling back and forth. + + Supersedes HaclusterScalebackTest. + """ @classmethod def setUpClass(cls): @@ -148,38 +151,46 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): cls._hacluster_charm_name = test_config['hacluster-charm-name'] def test_930_scaleback(self): - """Remove a unit, recalculate quorum and add a new one.""" + """Remove two units, recalculate quorum and re-add two units. + + NOTE(lourot): before lp:1400481 was fixed, quorum wasn't recalculated + when removing units. Within a cluster of 3 units, removing two units + and re-adding them led to a situation where the expected quorum became + 5 and was still considered unreached. + """ principle_units = sorted(zaza.model.get_status().applications[ self._principle_app_name]['units'].keys()) self.assertEqual(len(principle_units), 3) - doomed_principle_unit = principle_units[0] - other_principle_unit = principle_units[1] - doomed_hacluster_unit = juju_utils.get_subordinate_units( - [doomed_principle_unit], charm_name=self._hacluster_charm_name)[0] + doomed_principle_units = principle_units[:2] + other_principle_unit = principle_units[2] + doomed_hacluster_units = juju_utils.get_subordinate_units( + doomed_principle_units, charm_name=self._hacluster_charm_name) other_hacluster_unit = juju_utils.get_subordinate_units( [other_principle_unit], charm_name=self._hacluster_charm_name)[0] - logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) - zaza.model.run_action( - doomed_hacluster_unit, - 'pause', - raise_on_failure=True) - logging.info('OK') + for doomed_hacluster_unit in doomed_hacluster_units: + logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) + zaza.model.run_action( + doomed_hacluster_unit, + 'pause', + raise_on_failure=True) - logging.info('Removing {}'.format(doomed_principle_unit)) - zaza.model.destroy_unit( - self._principle_app_name, - doomed_principle_unit, - wait_disappear=True) - logging.info('OK') + for doomed_principle_unit in doomed_principle_units: + logging.info('Removing {}'.format(doomed_principle_unit)) + zaza.model.destroy_unit( + self._principle_app_name, + doomed_principle_unit, + wait_disappear=True) logging.info('Waiting for model to settle') zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'blocked') - zaza.model.block_until_unit_wl_status(other_principle_unit, 'blocked') + # NOTE(lourot): the principle unit (usually a keystone unit) isn't + # guaranteed to be blocked, so we don't validate that. zaza.model.block_until_all_units_idle() - logging.info('OK') logging.info('Updating corosync ring') + # NOTE(lourot): with Juju >= 2.8 this isn't actually necessary as it + # has already been done in hacluster's hanode departing hook: hacluster_app_name = zaza.model.get_unit_from_name( other_hacluster_unit).application zaza.model.run_action_on_leader( @@ -188,9 +199,9 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): action_params={'i-really-mean-it': True}, raise_on_failure=True) - logging.info('Adding an hacluster unit') - zaza.model.add_unit(self._principle_app_name, wait_appear=True) - logging.info('OK') + for article in ('an', 'another'): + logging.info('Adding {} hacluster unit'.format(article)) + zaza.model.add_unit(self._principle_app_name, wait_appear=True) logging.info('Waiting for model to settle') expected_states = {hacluster_app_name: { From f598eb93c29f0c7aec99ac22f1d5d5e5ec7cd7e1 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 18 Feb 2021 10:38:45 +0100 Subject: [PATCH 10/17] Improve comments --- zaza/openstack/charm_tests/hacluster/tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index b1c0713..710d3e7 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -185,14 +185,14 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): logging.info('Waiting for model to settle') zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'blocked') # NOTE(lourot): the principle unit (usually a keystone unit) isn't - # guaranteed to be blocked, so we don't validate that. + # guaranteed to be blocked, so we don't validate that here. zaza.model.block_until_all_units_idle() logging.info('Updating corosync ring') - # NOTE(lourot): with Juju >= 2.8 this isn't actually necessary as it - # has already been done in hacluster's hanode departing hook: hacluster_app_name = zaza.model.get_unit_from_name( other_hacluster_unit).application + # NOTE(lourot): with Juju >= 2.8 this isn't actually necessary as it + # has already been done in hacluster's hanode departing hook: zaza.model.run_action_on_leader( hacluster_app_name, 'update-ring', From c00aa18ffbd03dfad9b196b86a54b51e1ba3ef7e Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 18 Feb 2021 16:22:43 +0100 Subject: [PATCH 11/17] Assert that corosync considers all nodes to be online --- zaza/openstack/charm_tests/hacluster/tests.py | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 710d3e7..f720d20 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -153,20 +153,22 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): def test_930_scaleback(self): """Remove two units, recalculate quorum and re-add two units. - NOTE(lourot): before lp:1400481 was fixed, quorum wasn't recalculated - when removing units. Within a cluster of 3 units, removing two units - and re-adding them led to a situation where the expected quorum became - 5 and was still considered unreached. + NOTE(lourot): before lp:1400481 was fixed, the corosync ring wasn't + recalculated when removing units. So within a cluster of 3 units, + removing two units and re-adding them led to a situation where corosync + considers having 3 nodes online out of 5, instead of just 3 out of 3. + This test covers this scenario. """ principle_units = sorted(zaza.model.get_status().applications[ self._principle_app_name]['units'].keys()) self.assertEqual(len(principle_units), 3) doomed_principle_units = principle_units[:2] - other_principle_unit = principle_units[2] + surviving_principle_unit = principle_units[2] doomed_hacluster_units = juju_utils.get_subordinate_units( doomed_principle_units, charm_name=self._hacluster_charm_name) - other_hacluster_unit = juju_utils.get_subordinate_units( - [other_principle_unit], charm_name=self._hacluster_charm_name)[0] + surviving_hacluster_unit = juju_utils.get_subordinate_units( + [surviving_principle_unit], + charm_name=self._hacluster_charm_name)[0] for doomed_hacluster_unit in doomed_hacluster_units: logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) @@ -183,14 +185,15 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): wait_disappear=True) logging.info('Waiting for model to settle') - zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'blocked') + zaza.model.block_until_unit_wl_status(surviving_hacluster_unit, + 'blocked') # NOTE(lourot): the principle unit (usually a keystone unit) isn't # guaranteed to be blocked, so we don't validate that here. zaza.model.block_until_all_units_idle() logging.info('Updating corosync ring') hacluster_app_name = zaza.model.get_unit_from_name( - other_hacluster_unit).application + surviving_hacluster_unit).application # NOTE(lourot): with Juju >= 2.8 this isn't actually necessary as it # has already been done in hacluster's hanode departing hook: zaza.model.run_action_on_leader( @@ -200,7 +203,7 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): raise_on_failure=True) for article in ('an', 'another'): - logging.info('Adding {} hacluster unit'.format(article)) + logging.info('Re-adding {} hacluster unit'.format(article)) zaza.model.add_unit(self._principle_app_name, wait_appear=True) logging.info('Waiting for model to settle') @@ -208,4 +211,16 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): "workload-status": "active", "workload-status-message": "Unit is ready and clustered"}} zaza.model.wait_for_application_states(states=expected_states) - logging.debug('OK') + + # At this point if the corosync ring has been properly updated, there + # shouldn't be any trace of the deleted units anymore: + logging.info('Checking that corosync considers all nodes to be online') + cmd = 'sudo crm status' + result = zaza.model.run_on_unit(surviving_hacluster_unit, cmd) + code = result.get('Code') + if code != '0': + raise zaza.model.CommandRunFailed(cmd, result) + output = result.get('Stdout').strip() + logging.debug('Output received: {}'.format(output)) + self.assertNotIn('OFFLINE', output, + "corosync shouldn't list any offline node") From 8c819814fb01741575bee97ab74d202b00947123 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 18 Feb 2021 10:50:53 +0100 Subject: [PATCH 12/17] Don't wait for principle to get back to active --- zaza/openstack/charm_tests/hacluster/tests.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index f720d20..f340022 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -130,8 +130,7 @@ class HaclusterScalebackTest(HaclusterBaseTest): logging.info('Waiting for model to settle') zaza.model.block_until_unit_wl_status(other_hacluster_unit, 'active') # NOTE(lourot): the principle application sometimes remain blocked - # after scaling back up until lp:1400481 is solved. - # zaza.model.block_until_unit_wl_status(other_principle_unit, 'active') + # after scaling back up. zaza.model.block_until_all_units_idle() logging.debug('OK') @@ -207,10 +206,15 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): zaza.model.add_unit(self._principle_app_name, wait_appear=True) logging.info('Waiting for model to settle') - expected_states = {hacluster_app_name: { - "workload-status": "active", - "workload-status-message": "Unit is ready and clustered"}} - zaza.model.wait_for_application_states(states=expected_states) + # NOTE(lourot): the principle charm may remain blocked here. This seems + # to happen often when it is keystone and has a mysql-router as other + # subordinate charm. The keystone units seems to often remain blocked + # with 'Database not initialised'. This is not the hacluster charm's + # fault and this is why we don't validate here that the entire model + # goes back to active/idle. + zaza.model.block_until_unit_wl_status(surviving_hacluster_unit, + 'active') + zaza.model.block_until_all_units_idle() # At this point if the corosync ring has been properly updated, there # shouldn't be any trace of the deleted units anymore: From 20bd5dfb5908f79b6e2c1c19b5e6e893baad86ee Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Thu, 25 Feb 2021 13:05:30 +0100 Subject: [PATCH 13/17] Remove obsolete comment The automatic corosync cleanup on Juju >= 2.8 turned out not to work and has been removed from charm-hacluster. The 'update-ring' action is now always needed. --- zaza/openstack/charm_tests/hacluster/tests.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index f340022..ea613ec 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -193,8 +193,6 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): logging.info('Updating corosync ring') hacluster_app_name = zaza.model.get_unit_from_name( surviving_hacluster_unit).application - # NOTE(lourot): with Juju >= 2.8 this isn't actually necessary as it - # has already been done in hacluster's hanode departing hook: zaza.model.run_action_on_leader( hacluster_app_name, 'update-ring', From c80f5fdef8fe80564406b084961d5a6b4f7e03a0 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Wed, 3 Mar 2021 10:50:53 +0100 Subject: [PATCH 14/17] Make HaclusterScaleBackAndForthTest less stressful Removing and re-adding two units out of a cluster of three so fast seems to be too stressful for corosync/pacemaker, which makes the test fail in rare cases. Removing and re-adding one unit only is enough in order to validate the functionality. This also makes the test execution time much shorter. --- zaza/openstack/charm_tests/hacluster/tests.py | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index ea613ec..b365960 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -154,40 +154,39 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): NOTE(lourot): before lp:1400481 was fixed, the corosync ring wasn't recalculated when removing units. So within a cluster of 3 units, - removing two units and re-adding them led to a situation where corosync - considers having 3 nodes online out of 5, instead of just 3 out of 3. + removing a unit and re-adding one led to a situation where corosync + considers having 3 nodes online out of 4, instead of just 3 out of 3. This test covers this scenario. """ principle_units = sorted(zaza.model.get_status().applications[ self._principle_app_name]['units'].keys()) self.assertEqual(len(principle_units), 3) - doomed_principle_units = principle_units[:2] - surviving_principle_unit = principle_units[2] - doomed_hacluster_units = juju_utils.get_subordinate_units( - doomed_principle_units, charm_name=self._hacluster_charm_name) + surviving_principle_unit = principle_units[0] + doomed_principle_unit = principle_units[1] surviving_hacluster_unit = juju_utils.get_subordinate_units( [surviving_principle_unit], charm_name=self._hacluster_charm_name)[0] + doomed_hacluster_unit = juju_utils.get_subordinate_units( + [doomed_principle_unit], + charm_name=self._hacluster_charm_name)[0] - for doomed_hacluster_unit in doomed_hacluster_units: - logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) - zaza.model.run_action( - doomed_hacluster_unit, - 'pause', - raise_on_failure=True) + logging.info('Pausing unit {}'.format(doomed_hacluster_unit)) + zaza.model.run_action( + doomed_hacluster_unit, + 'pause', + raise_on_failure=True) - for doomed_principle_unit in doomed_principle_units: - logging.info('Removing {}'.format(doomed_principle_unit)) - zaza.model.destroy_unit( - self._principle_app_name, - doomed_principle_unit, - wait_disappear=True) + logging.info('Removing {}'.format(doomed_principle_unit)) + zaza.model.destroy_unit( + self._principle_app_name, + doomed_principle_unit, + wait_disappear=True) logging.info('Waiting for model to settle') zaza.model.block_until_unit_wl_status(surviving_hacluster_unit, 'blocked') - # NOTE(lourot): the principle unit (usually a keystone unit) isn't - # guaranteed to be blocked, so we don't validate that here. + # NOTE(lourot): the surviving principle units (usually keystone units) + # aren't guaranteed to be blocked, so we don't validate that here. zaza.model.block_until_all_units_idle() logging.info('Updating corosync ring') @@ -199,9 +198,8 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): action_params={'i-really-mean-it': True}, raise_on_failure=True) - for article in ('an', 'another'): - logging.info('Re-adding {} hacluster unit'.format(article)) - zaza.model.add_unit(self._principle_app_name, wait_appear=True) + logging.info('Re-adding an hacluster unit') + zaza.model.add_unit(self._principle_app_name, wait_appear=True) logging.info('Waiting for model to settle') # NOTE(lourot): the principle charm may remain blocked here. This seems @@ -215,7 +213,7 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): zaza.model.block_until_all_units_idle() # At this point if the corosync ring has been properly updated, there - # shouldn't be any trace of the deleted units anymore: + # shouldn't be any trace of the deleted unit anymore: logging.info('Checking that corosync considers all nodes to be online') cmd = 'sudo crm status' result = zaza.model.run_on_unit(surviving_hacluster_unit, cmd) From 0931ebb456574fb3b4b25344e3ed26861eca7429 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Wed, 3 Mar 2021 10:56:15 +0100 Subject: [PATCH 15/17] Fix docstring --- zaza/openstack/charm_tests/hacluster/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index b365960..66d4f45 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -150,7 +150,7 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): cls._hacluster_charm_name = test_config['hacluster-charm-name'] def test_930_scaleback(self): - """Remove two units, recalculate quorum and re-add two units. + """Remove one unit, recalculate quorum and re-add one unit. NOTE(lourot): before lp:1400481 was fixed, the corosync ring wasn't recalculated when removing units. So within a cluster of 3 units, From 93a48133b35e20d79251922c321d97b1ddf03e4e Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Fri, 5 Mar 2021 12:12:00 +0100 Subject: [PATCH 16/17] Add assertions to HaclusterScaleBackAndForthTest --- zaza/openstack/charm_tests/hacluster/tests.py | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 66d4f45..53c7392 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -189,6 +189,10 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): # aren't guaranteed to be blocked, so we don't validate that here. zaza.model.block_until_all_units_idle() + # At this point the corosync ring hasn't been updated yet, so it should + # still remember the deleted unit: + self.__assert_some_corosync_nodes_are_offline(surviving_hacluster_unit) + logging.info('Updating corosync ring') hacluster_app_name = zaza.model.get_unit_from_name( surviving_hacluster_unit).application @@ -198,6 +202,10 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): action_params={'i-really-mean-it': True}, raise_on_failure=True) + # At this point if the corosync ring has been properly updated, there + # shouldn't be any trace of the deleted unit anymore: + self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit) + logging.info('Re-adding an hacluster unit') zaza.model.add_unit(self._principle_app_name, wait_appear=True) @@ -212,15 +220,30 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): 'active') zaza.model.block_until_all_units_idle() - # At this point if the corosync ring has been properly updated, there - # shouldn't be any trace of the deleted unit anymore: + # At this point the corosync ring should still not contain any offline + # node: + self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit) + + def __assert_some_corosync_nodes_are_offline(self, hacluster_unit): + logging.info('Checking that corosync considers at least one node to ' + 'be offline') + output = self._get_crm_status(hacluster_unit) + self.assertIn('OFFLINE', output, + "corosync should list at least one offline node") + + def __assert_all_corosync_nodes_are_online(self, hacluster_unit): logging.info('Checking that corosync considers all nodes to be online') + output = self._get_crm_status(hacluster_unit) + self.assertNotIn('OFFLINE', output, + "corosync shouldn't list any offline node") + + @staticmethod + def _get_crm_status(hacluster_unit): cmd = 'sudo crm status' - result = zaza.model.run_on_unit(surviving_hacluster_unit, cmd) + result = zaza.model.run_on_unit(hacluster_unit, cmd) code = result.get('Code') if code != '0': raise zaza.model.CommandRunFailed(cmd, result) output = result.get('Stdout').strip() - logging.debug('Output received: {}'.format(output)) - self.assertNotIn('OFFLINE', output, - "corosync shouldn't list any offline node") + logging.debug('crm output received: {}'.format(output)) + return output From aafdc4070f0984d87dea05008de9a86e0beed920 Mon Sep 17 00:00:00 2001 From: Aurelien Lourot Date: Fri, 5 Mar 2021 18:08:54 +0100 Subject: [PATCH 17/17] Workaround for lp:1874719 --- zaza/openstack/charm_tests/hacluster/tests.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/zaza/openstack/charm_tests/hacluster/tests.py b/zaza/openstack/charm_tests/hacluster/tests.py index 53c7392..5f46565 100644 --- a/zaza/openstack/charm_tests/hacluster/tests.py +++ b/zaza/openstack/charm_tests/hacluster/tests.py @@ -220,8 +220,17 @@ class HaclusterScaleBackAndForthTest(HaclusterBaseTest): 'active') zaza.model.block_until_all_units_idle() - # At this point the corosync ring should still not contain any offline - # node: + # Because of lp:1874719 the corosync ring may show a mysterious offline + # 'node1' node. We clean up the ring by re-running the 'update-ring' + # action: + logging.info('Updating corosync ring - workaround for lp:1874719') + zaza.model.run_action_on_leader( + hacluster_app_name, + 'update-ring', + action_params={'i-really-mean-it': True}, + raise_on_failure=True) + + # At this point the corosync ring should not contain any offline node: self.__assert_all_corosync_nodes_are_online(surviving_hacluster_unit) def __assert_some_corosync_nodes_are_offline(self, hacluster_unit):