From d41f8b37240b794442c873327696ffe2ed9d4fbb Mon Sep 17 00:00:00 2001 From: Ionut Balutoiu Date: Wed, 20 Jan 2021 20:10:34 +0200 Subject: [PATCH] Update Ceph RBD Mirror tests The updated tests add the possibility of testing deployments with `image` RBD mirroring mode implemented as part of the Cinder Ceph Replication charm spec. --- .../charm_tests/ceph/rbd_mirror/tests.py | 308 ++++++++++++++---- 1 file changed, 242 insertions(+), 66 deletions(-) diff --git a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py index 6c2fa1b..75a3f42 100644 --- a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py +++ b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py @@ -16,6 +16,9 @@ import json import logging import re +import time + +import cinderclient.exceptions as cinder_exceptions import zaza.openstack.charm_tests.test_utils as test_utils @@ -39,16 +42,20 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model() cls.site_b_app_suffix = '-b' - def run_status_action(self, application_name=None, model_name=None): + def run_status_action(self, application_name=None, model_name=None, + pools=[]): """Run status action, decode and return response.""" + action_params = { + 'verbose': True, + 'format': 'json', + } + if len(pools) > 0: + action_params['pools'] = ','.join(pools) result = zaza.model.run_action_on_leader( application_name or self.application_name, 'status', model_name=model_name, - action_params={ - 'verbose': True, - 'format': 'json', - }) + action_params=action_params) return json.loads(result.results['output']) def get_pools(self): @@ -71,7 +78,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): def wait_for_mirror_state(self, state, application_name=None, model_name=None, check_entries_behind_master=False, - require_images_in=[]): + require_images_in=[], + pools=[]): """Wait until all images reach requested state. This function runs the ``status`` action and examines the data it @@ -90,6 +98,9 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): :type check_entries_behind_master: bool :param require_images_in: List of pools to require images in :type require_images_in: list of str + :param pools: List of pools to run status on. If this is empty, the + status action will run on all the pools. + :type pools: list of str :returns: True on success, never returns on failure """ rep = re.compile(r'.*entries_behind_master=(\d+)') @@ -97,7 +108,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): try: # encapsulate in try except to work around LP: #1820976 pool_status = self.run_status_action( - application_name=application_name, model_name=model_name) + application_name=application_name, model_name=model_name, + pools=pools) except KeyError: continue for pool, status in pool_status.items(): @@ -124,6 +136,119 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): # all images with state has expected state return True + def get_cinder_rbd_mirroring_mode(self, + cinder_ceph_app_name='cinder-ceph'): + """Get the RBD mirroring mode for the Cinder Ceph pool. + + :returns: A string representing the RBD mirroring mode. It can be + either 'pool' or 'image'. 
+        """
+        DEFAULT_RBD_MIRRORING_MODE = 'pool'
+
+        rbd_mirroring_mode_config = zaza.model.get_application_config(
+            cinder_ceph_app_name).get('rbd-mirroring-mode')
+        if rbd_mirroring_mode_config:
+            rbd_mirroring_mode = rbd_mirroring_mode_config.get(
+                'value', DEFAULT_RBD_MIRRORING_MODE).lower()
+        else:
+            rbd_mirroring_mode = DEFAULT_RBD_MIRRORING_MODE
+
+        return rbd_mirroring_mode
+
+    def create_cinder_volume(self, session, from_image=False):
+        """Create a Cinder volume, optionally from a Glance image.
+
+        :rtype: :class:`Volume`.
+        """
+        def get_glance_image(session):
+            glance = openstack.get_glance_session_client(session)
+            images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
+            if images:
+                return images[0]
+            logging.info("Failed to find {} image, falling back to {}".format(
+                CIRROS_IMAGE_NAME,
+                LTS_IMAGE_NAME))
+            return openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0]
+
+        def create_volume_type(cinder):
+            try:
+                vol_type = cinder.volume_types.find(name='repl')
+            except cinder_exceptions.NotFound:
+                vol_type = cinder.volume_types.create('repl')
+                vol_type.set_keys(metadata={
+                    'volume_backend_name': 'cinder-ceph',
+                    'replication_enabled': '<is> True',
+                })
+            return vol_type
+
+        # NOTE(fnordahl): for some reason create volume from image often fails
+        # when run just after deployment is finished. We should figure out
+        # why, resolve the underlying issue and then remove this.
+        #
+        # We do not use tenacity here as it will interfere with tenacity used
+        # in ``resource_reaches_status``
+        def create_volume(cinder, volume_params, retry=20):
+            if retry < 1:
+                return
+            volume = cinder.volumes.create(**volume_params)
+            try:
+                # Note(coreycb): stop_after_attempt is increased because using
+                # juju storage for ceph-osd backed by cinder on undercloud
+                # takes longer than the prior method of directory-backed OSD
+                # devices.
+ openstack.resource_reaches_status( + cinder.volumes, volume.id, msg='volume', + stop_after_attempt=20) + return volume + except AssertionError: + logging.info('retrying') + volume.delete() + return create_volume(cinder, volume_params, retry=retry - 1) + + volume_params = { + 'size': 8, + 'name': 'zaza', + } + if from_image: + volume_params['imageRef'] = get_glance_image(session).id + cinder = openstack.get_cinder_session_client(session) + if self.get_cinder_rbd_mirroring_mode() == 'image': + volume_params['volume_type'] = create_volume_type(cinder).id + + return create_volume(cinder, volume_params) + + def failover_cinder_volume_host(self, cinder_client, + backend_name='cinder-ceph', + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over', + timeout=300): + """Failover Cinder volume host.""" + host = 'cinder@{}'.format(backend_name) + logging.info( + 'Failover Cinder host %s to backend_id %s', + host, target_backend_id) + cinder_client.services.failover_host( + host=host, + backend_id=target_backend_id) + start = time.time() + while True: + elapsed = time.time() - start + if elapsed > timeout: + raise cinder_exceptions.TimeoutException( + obj=cinder_client.services, + action='failover_host') + service = cinder_client.services.list( + host=host, + binary='cinder-volume')[0] + if (service.status == target_status and + service.replication_status == target_replication_status): + break + time.sleep(5) + logging.info( + 'Successfully failed-over Cinder host %s to backend_id %s', + host, target_backend_id) + class CephRBDMirrorTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` tests.""" @@ -196,43 +321,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase): test. """ session = openstack.get_overcloud_keystone_session() - glance = openstack.get_glance_session_client(session) - cinder = openstack.get_cinder_session_client(session) - - images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) - if images: - image = images[0] - else: - logging.info("Failed to find {} image, falling back to {}".format( - CIRROS_IMAGE_NAME, - LTS_IMAGE_NAME)) - image = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] - - # NOTE(fnordahl): for some reason create volume from image often fails - # when run just after deployment is finished. We should figure out - # why, resolve the underlying issue and then remove this. - # - # We do not use tenacity here as it will interfere with tenacity used - # in ``resource_reaches_status`` - def create_volume_from_image(cinder, image, retry=20): - if retry < 1: - return - volume = cinder.volumes.create(8, name='zaza', imageRef=image.id) - try: - # Note(coreycb): stop_after_attempt is increased because using - # juju storage for ceph-osd backed by cinder on undercloud - # takes longer than the prior method of directory-backed OSD - # devices. 
-                openstack.resource_reaches_status(
-                    cinder.volumes, volume.id, msg='volume',
-                    stop_after_attempt=20)
-                return volume
-            except AssertionError:
-                logging.info('retrying')
-                volume.delete()
-            return create_volume_from_image(cinder, image, retry=retry - 1)
-        volume = create_volume_from_image(cinder, image)
-
+        volume = self.create_cinder_volume(session, from_image=True)
         site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash(
             zaza.model.get_lead_unit_name('ceph-mon',
                                           model_name=self.site_a_model),
@@ -258,85 +347,170 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
 class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase):
     """Encapsulate ``ceph-rbd-mirror`` controlled failover tests."""
 
+    def cinder_fail_over_fall_back(self):
+        """Validate controlled fail over and fall back via the Cinder API."""
+        session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(session)
+        volume = self.create_cinder_volume(session, from_image=True)
+        self.wait_for_mirror_state(
+            'up+replaying',
+            check_entries_behind_master=True,
+            application_name=self.application_name + self.site_b_app_suffix,
+            model_name=self.site_b_model,
+            pools=['cinder-ceph'])
+        self.failover_cinder_volume_host(
+            cinder_client=cinder)
+        self.assertEqual(cinder.volumes.get(volume.id).status, 'available')
+        self.failover_cinder_volume_host(
+            cinder_client=cinder,
+            target_backend_id='default',
+            target_status='enabled',
+            target_replication_status='enabled')
+        self.assertEqual(cinder.volumes.get(volume.id).status, 'available')
+
     def test_fail_over_fall_back(self):
         """Validate controlled fail over and fall back."""
         site_a_pools, site_b_pools = self.get_pools()
+        site_a_action_params = {}
+        site_b_action_params = {}
+        if self.get_cinder_rbd_mirroring_mode() == 'image':
+            site_a_pools.remove('cinder-ceph')
+            site_a_action_params['pools'] = ','.join(site_a_pools)
+            site_b_pools.remove('cinder-ceph')
+            site_b_action_params['pools'] = ','.join(site_b_pools)
         result = zaza.model.run_action_on_leader(
             'ceph-rbd-mirror',
             'demote',
             model_name=self.site_a_model,
-            action_params={})
+            action_params=site_a_action_params)
         logging.info(result.results)
         n_pools_demoted = len(result.results['output'].split('\n'))
         self.assertEqual(len(site_a_pools), n_pools_demoted)
-        self.wait_for_mirror_state('up+unknown', model_name=self.site_a_model)
+        self.wait_for_mirror_state(
+            'up+unknown',
+            model_name=self.site_a_model,
+            pools=site_a_pools)
         self.wait_for_mirror_state(
             'up+unknown',
             application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
+            model_name=self.site_b_model,
+            pools=site_b_pools)
         result = zaza.model.run_action_on_leader(
             'ceph-rbd-mirror' + self.site_b_app_suffix,
             'promote',
             model_name=self.site_b_model,
-            action_params={})
+            action_params=site_b_action_params)
         logging.info(result.results)
         n_pools_promoted = len(result.results['output'].split('\n'))
         self.assertEqual(len(site_b_pools), n_pools_promoted)
         self.wait_for_mirror_state(
             'up+replaying',
-            model_name=self.site_a_model)
+            model_name=self.site_a_model,
+            pools=site_a_pools)
         self.wait_for_mirror_state(
             'up+stopped',
             application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
+            model_name=self.site_b_model,
+            pools=site_b_pools)
         result = zaza.model.run_action_on_leader(
             'ceph-rbd-mirror' + self.site_b_app_suffix,
             'demote',
             model_name=self.site_b_model,
-            action_params={
-            })
+            action_params=site_b_action_params)
         logging.info(result.results)
         n_pools_demoted = len(result.results['output'].split('\n'))
         self.assertEqual(len(site_a_pools), n_pools_demoted)
         self.wait_for_mirror_state(
             'up+unknown',
-            model_name=self.site_a_model)
+            model_name=self.site_a_model,
+            pools=site_a_pools)
         self.wait_for_mirror_state(
             'up+unknown',
             application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
+            model_name=self.site_b_model,
+            pools=site_b_pools)
         result = zaza.model.run_action_on_leader(
             'ceph-rbd-mirror',
             'promote',
             model_name=self.site_a_model,
-            action_params={
-            })
+            action_params=site_a_action_params)
         logging.info(result.results)
         n_pools_promoted = len(result.results['output'].split('\n'))
         self.assertEqual(len(site_b_pools), n_pools_promoted)
         self.wait_for_mirror_state(
             'up+stopped',
-            model_name=self.site_a_model)
+            model_name=self.site_a_model,
+            pools=site_a_pools)
+        action_params = {
+            'i-really-mean-it': True,
+        }
+        if self.get_cinder_rbd_mirroring_mode() == 'image':
+            action_params['pools'] = site_b_action_params['pools']
         result = zaza.model.run_action_on_leader(
             'ceph-rbd-mirror' + self.site_b_app_suffix,
             'resync-pools',
             model_name=self.site_b_model,
-            action_params={
-                'i-really-mean-it': True,
-            })
+            action_params=action_params)
         logging.info(result.results)
         self.wait_for_mirror_state(
             'up+replaying',
             application_name=self.application_name + self.site_b_app_suffix,
             model_name=self.site_b_model,
-            require_images_in=['cinder-ceph', 'glance'])
+            require_images_in=['cinder-ceph', 'glance'],
+            pools=site_a_pools)
+        if self.get_cinder_rbd_mirroring_mode() == 'image':
+            self.cinder_fail_over_fall_back()
 
 
 class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase):
     """Encapsulate ``ceph-rbd-mirror`` destructive tests."""
 
+    def forced_failover_cinder_volume_host(self, cinder_client):
+        """Validate forced Cinder volume host fail over."""
+        def apply_cinder_workaround():
+            """Set minimal timeouts / retries for the Cinder Ceph backend.
+
+            This is needed because the failover via Cinder will try to do a
+            demotion of site-a, and with the default timeouts / retries,
+            the operation takes an unreasonable amount of time.
+ """ + cinder_configs = { + 'rados_connect_timeout': '1', + 'rados_connection_retries': '1', + 'rados_connection_interval': '0', + 'replication_connect_timeout': '1', + } + update_cinder_conf_cmd = ( + "import configparser; " + "config = configparser.ConfigParser(); " + "config.read('/etc/cinder/cinder.conf'); " + "{}" + "f = open('/etc/cinder/cinder.conf', 'w'); " + "config.write(f); " + "f.close()") + cmd = '' + for config in cinder_configs: + cmd += "config.set('cinder-ceph', '{0}', '{1}'); ".format( + config, cinder_configs[config]) + cmd = update_cinder_conf_cmd.format(cmd) + zaza.model.run_on_leader( + 'cinder-ceph', + 'python3 -c "{}"; systemctl restart cinder-volume'.format(cmd)) + + apply_cinder_workaround() + self.failover_cinder_volume_host(cinder_client) + + for volume in cinder_client.volumes.list(): + self.assertEqual(volume.status, 'available') + def test_kill_site_a_fail_over(self): """Validate fail over after uncontrolled shutdown of primary.""" + action_params = {} + if self.get_cinder_rbd_mirroring_mode() == 'image': + _, site_b_pools = self.get_pools() + site_b_pools.remove('cinder-ceph') + action_params['pools'] = ','.join(site_b_pools) + for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd': zaza.model.remove_application( application, @@ -346,14 +520,16 @@ class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase): 'ceph-rbd-mirror' + self.site_b_app_suffix, 'promote', model_name=self.site_b_model, - action_params={ - }) + action_params=action_params) self.assertEqual(result.status, 'failed') + action_params['force'] = True result = zaza.model.run_action_on_leader( 'ceph-rbd-mirror' + self.site_b_app_suffix, 'promote', model_name=self.site_b_model, - action_params={ - 'force': True, - }) + action_params=action_params) self.assertEqual(result.status, 'completed') + if self.get_cinder_rbd_mirroring_mode() == 'image': + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session) + self.forced_failover_cinder_volume_host(cinder)
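
For reference, a minimal usage sketch (not part of the patch) of how the new ``pools`` parameter reaches the charm's ``status`` action. It mirrors what ``run_status_action(pools=['cinder-ceph'])`` builds above, and assumes a deployed ``ceph-rbd-mirror`` application whose charm already accepts the ``pools`` action parameter (the companion charm change these tests target):

    # Hypothetical stand-alone snippet; uses only zaza helpers already
    # imported by the test module above.
    import json

    import zaza.model

    result = zaza.model.run_action_on_leader(
        'ceph-rbd-mirror',
        'status',
        action_params={
            'verbose': True,
            'format': 'json',
            # Comma-separated pool list; omit the key to report on all pools.
            'pools': 'cinder-ceph',
        })
    pool_status = json.loads(result.results['output'])
    for pool, status in pool_status.items():
        print('Mirror status for pool {}: {}'.format(pool, status))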