diff --git a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py index 6c2fa1b..d8d7967 100644 --- a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py +++ b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py @@ -17,6 +17,8 @@ import json import logging import re +import cinderclient.exceptions as cinder_exceptions + import zaza.openstack.charm_tests.test_utils as test_utils import zaza.model @@ -28,6 +30,129 @@ from zaza.openstack.charm_tests.glance.setup import ( CIRROS_IMAGE_NAME) +DEFAULT_CINDER_RBD_MIRRORING_MODE = 'pool' + + +def get_cinder_rbd_mirroring_mode(cinder_ceph_app_name='cinder-ceph'): + """Get the RBD mirroring mode for the Cinder Ceph pool. + + :param cinder_ceph_app_name: Cinder Ceph Juju application name. + :type cinder_ceph_app_name: str + :returns: A string representing the RBD mirroring mode. It can be + either 'pool' or 'image'. + :rtype: str + """ + rbd_mirroring_mode_config = zaza.model.get_application_config( + cinder_ceph_app_name).get('rbd-mirroring-mode') + if rbd_mirroring_mode_config: + rbd_mirroring_mode = rbd_mirroring_mode_config.get( + 'value', DEFAULT_CINDER_RBD_MIRRORING_MODE).lower() + else: + rbd_mirroring_mode = DEFAULT_CINDER_RBD_MIRRORING_MODE + + return rbd_mirroring_mode + + +def get_glance_image(glance): + """Get the Glance image object to be used by the Ceph tests. + + It looks for the Cirros Glance image, and it's returned if it's found. + If the Cirros image is not found, it will try and find the Ubuntu + LTS image. 
+ +    :param glance: Authenticated glanceclient +    :type glance: glanceclient.Client +    :returns: Glance image object +    :rtype: glanceclient.image +    """ +    images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) +    if images: +        return images[0] +    logging.info("Failed to find {} image, falling back to {}".format( +        CIRROS_IMAGE_NAME, +        LTS_IMAGE_NAME)) +    return openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] + + +def setup_cinder_repl_volume_type(cinder, type_name='repl', +                                  backend_name='cinder-ceph'): +    """Set up the Cinder volume replication type. + +    :param cinder: Authenticated cinderclient +    :type cinder: cinder.Client +    :param type_name: Cinder volume type name +    :type type_name: str +    :param backend_name: Cinder volume backend name with replication enabled. +    :type backend_name: str +    :returns: Cinder volume type object +    :rtype: cinderclient.VolumeType +    """ +    try: +        vol_type = cinder.volume_types.find(name=type_name) +    except cinder_exceptions.NotFound: +        vol_type = cinder.volume_types.create(type_name) + +    vol_type.set_keys(metadata={ +        'volume_backend_name': backend_name, +        'replication_enabled': ' True', +    }) +    return vol_type + + +# TODO: This function should be incorporated into +# 'zaza.openstack.utilities.openstack.create_volume' helper, once the below +# flakiness comments are addressed. +def create_cinder_volume(cinder, name='zaza', image_id=None, type_id=None): +    """Create a new Cinder volume. + +    :param cinder: Authenticated cinderclient. +    :type cinder: cinder.Client +    :param name: Volume name. +    :type name: str +    :param image_id: Glance image id, if the volume is created from image. +    :type image_id: str +    :param type_id: Cinder Volume type id, if the volume needs to use an +        explicit volume type. +    :type type_id: str +    :returns: Cinder volume +    :rtype: :class:`Volume`. +    """ +    # NOTE(fnordahl): for some reason create volume from image often fails +    # when run just after deployment is finished. 
We should figure out + # why, resolve the underlying issue and then remove this. + # + # We do not use tenacity here as it will interfere with tenacity used + # in ``resource_reaches_status`` + def create_volume(cinder, volume_params, retry=20): + if retry < 1: + return + volume = cinder.volumes.create(**volume_params) + try: + # Note(coreycb): stop_after_attempt is increased because using + # juju storage for ceph-osd backed by cinder on undercloud + # takes longer than the prior method of directory-backed OSD + # devices. + openstack.resource_reaches_status( + cinder.volumes, volume.id, msg='volume', + stop_after_attempt=20) + return volume + except AssertionError: + logging.info('retrying') + volume.delete() + return create_volume(cinder, volume_params, retry=retry - 1) + + volume_params = { + 'size': 8, + 'name': name, + } + if image_id: + volume_params['imageRef'] = image_id + if type_id: + volume_params['volume_type'] = type_id + + return create_volume(cinder, volume_params) + + class CephRBDMirrorBase(test_utils.OpenStackBaseTest): """Base class for ``ceph-rbd-mirror`` tests.""" @@ -35,20 +160,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): def setUpClass(cls): """Run setup for ``ceph-rbd-mirror`` tests.""" super().setUpClass() + cls.cinder_ceph_app_name = 'cinder-ceph' + cls.test_cinder_volume_name = 'test-cinder-ceph-volume' # get ready for multi-model Zaza cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model() cls.site_b_app_suffix = '-b' - def run_status_action(self, application_name=None, model_name=None): + def run_status_action(self, application_name=None, model_name=None, + pools=[]): """Run status action, decode and return response.""" + action_params = { + 'verbose': True, + 'format': 'json', + } + if len(pools) > 0: + action_params['pools'] = ','.join(pools) result = zaza.model.run_action_on_leader( application_name or self.application_name, 'status', model_name=model_name, - action_params={ - 'verbose': True, - 'format': 
'json', - }) + action_params=action_params) return json.loads(result.results['output']) def get_pools(self): @@ -68,10 +199,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): model_name=self.site_b_model) return sorted(site_a_pools.keys()), sorted(site_b_pools.keys()) + def get_failover_pools(self): + """Get the failover Ceph pools' names, from both sites. + + If the Cinder RBD mirroring mode is 'image', the 'cinder-ceph' pool + needs to be excluded, since Cinder orchestrates the failover then. + + :returns: Tuple with site-a pools and site-b pools. + :rtype: Tuple[List[str], List[str]] + """ + site_a_pools, site_b_pools = self.get_pools() + if get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) == 'image': + site_a_pools.remove(self.cinder_ceph_app_name) + site_b_pools.remove(self.cinder_ceph_app_name) + return site_a_pools, site_b_pools + def wait_for_mirror_state(self, state, application_name=None, model_name=None, check_entries_behind_master=False, - require_images_in=[]): + require_images_in=[], + pools=[]): """Wait until all images reach requested state. This function runs the ``status`` action and examines the data it @@ -90,6 +237,9 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): :type check_entries_behind_master: bool :param require_images_in: List of pools to require images in :type require_images_in: list of str + :param pools: List of pools to run status on. If this is empty, the + status action will run on all the pools. 
+ :type pools: list of str :returns: True on success, never returns on failure """ rep = re.compile(r'.*entries_behind_master=(\d+)') @@ -97,7 +247,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): try: # encapsulate in try except to work around LP: #1820976 pool_status = self.run_status_action( - application_name=application_name, model_name=model_name) + application_name=application_name, model_name=model_name, + pools=pools) except KeyError: continue for pool, status in pool_status.items(): @@ -124,6 +275,41 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): # all images with state has expected state return True + def setup_test_cinder_volume(self): + """Set up the test Cinder volume into the Ceph RBD mirror environment. + + If the volume already exists, then it's returned. + + Also, if the Cinder RBD mirroring mode is 'image', the volume will + use an explicit volume type with the appropriate replication flags. + Otherwise, it is just a simple Cinder volume using the default backend. + + :returns: Cinder volume + :rtype: :class:`Volume`. + """ + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) + + try: + return cinder.volumes.find(name=self.test_cinder_volume_name) + except cinder_exceptions.NotFound: + logging.info("Test Cinder volume doesn't exist. 
Creating it") + + glance = openstack.get_glance_session_client(session) + image = get_glance_image(glance) + kwargs = { + 'cinder': cinder, + 'name': self.test_cinder_volume_name, + 'image_id': image.id, + } + if get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) == 'image': + volume_type = setup_cinder_repl_volume_type( + cinder, + backend_name=self.cinder_ceph_app_name) + kwargs['type_id'] = volume_type.id + + return create_cinder_volume(**kwargs) + class CephRBDMirrorTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` tests.""" @@ -195,44 +381,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase): site B and subsequently comparing the contents we get a full end to end test. """ - session = openstack.get_overcloud_keystone_session() - glance = openstack.get_glance_session_client(session) - cinder = openstack.get_cinder_session_client(session) - - images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) - if images: - image = images[0] - else: - logging.info("Failed to find {} image, falling back to {}".format( - CIRROS_IMAGE_NAME, - LTS_IMAGE_NAME)) - image = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] - - # NOTE(fnordahl): for some reason create volume from image often fails - # when run just after deployment is finished. We should figure out - # why, resolve the underlying issue and then remove this. - # - # We do not use tenacity here as it will interfere with tenacity used - # in ``resource_reaches_status`` - def create_volume_from_image(cinder, image, retry=20): - if retry < 1: - return - volume = cinder.volumes.create(8, name='zaza', imageRef=image.id) - try: - # Note(coreycb): stop_after_attempt is increased because using - # juju storage for ceph-osd backed by cinder on undercloud - # takes longer than the prior method of directory-backed OSD - # devices. 
- openstack.resource_reaches_status( - cinder.volumes, volume.id, msg='volume', - stop_after_attempt=20) - return volume - except AssertionError: - logging.info('retrying') - volume.delete() - return create_volume_from_image(cinder, image, retry=retry - 1) - volume = create_volume_from_image(cinder, image) - + volume = self.setup_test_cinder_volume() site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash( zaza.model.get_lead_unit_name('ceph-mon', model_name=self.site_a_model), @@ -244,6 +393,8 @@ class CephRBDMirrorTest(CephRBDMirrorBase): check_entries_behind_master=True, application_name=self.application_name + self.site_b_app_suffix, model_name=self.site_b_model) + logging.info('Checking the Ceph RBD hashes of the primary and ' + 'the secondary Ceph images') site_b_hash = zaza.openstack.utilities.ceph.get_rbd_hash( zaza.model.get_lead_unit_name('ceph-mon' + self.site_b_app_suffix, model_name=self.site_b_model), @@ -258,102 +409,399 @@ class CephRBDMirrorTest(CephRBDMirrorBase): class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` controlled failover tests.""" - def test_fail_over_fall_back(self): - """Validate controlled fail over and fall back.""" - site_a_pools, site_b_pools = self.get_pools() + def execute_failover_juju_actions(self, + primary_site_app_name, + primary_site_model, + primary_site_pools, + secondary_site_app_name, + secondary_site_model, + secondary_site_pools): + """Execute the failover Juju actions. + + The failover / failback via Juju actions shares the same workflow. The + failback is just a failover with sites in reversed order. + + This function encapsulates the tasks to failover a primary site to + a secondary site: + 1. Demote primary site + 2. Validation of the primary site demotion + 3. Promote secondary site + 4. Validation of the secondary site promotion + + :param primary_site_app_name: Primary site Ceph RBD mirror app name. 
+ :type primary_site_app_name: str + :param primary_site_model: Primary site Juju model name. + :type primary_site_model: str + :param primary_site_pools: Primary site pools. + :type primary_site_pools: List[str] + :param secondary_site_app_name: Secondary site Ceph RBD mirror + app name. + :type secondary_site_app_name: str + :param secondary_site_model: Secondary site Juju model name. + :type secondary_site_model: str + :param secondary_site_pools: Secondary site pools. + :type secondary_site_pools: List[str] + """ + # Check if primary and secondary pools sizes are the same. + self.assertEqual(len(primary_site_pools), len(secondary_site_pools)) + + # Run the 'demote' Juju action against the primary site pools. + logging.info('Demoting {} from model {}.'.format( + primary_site_app_name, primary_site_model)) result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror', + primary_site_app_name, 'demote', - model_name=self.site_a_model, - action_params={}) + model_name=primary_site_model, + action_params={ + 'pools': ','.join(primary_site_pools) + }) logging.info(result.results) + self.assertEqual(int(result.results['Code']), 0) + + # Validate that the demoted pools count matches the total primary site + # pools count. n_pools_demoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_a_pools), n_pools_demoted) - self.wait_for_mirror_state('up+unknown', model_name=self.site_a_model) + self.assertEqual(len(primary_site_pools), n_pools_demoted) + + # At this point, both primary and secondary sites are demoted. Validate + # that the Ceph images, from both sites, report 'up+unknown', since + # there isn't a primary site at the moment. 
+        logging.info('Waiting until {} is demoted.'.format( +            primary_site_app_name)) self.wait_for_mirror_state( 'up+unknown', -            application_name=self.application_name + self.site_b_app_suffix, -            model_name=self.site_b_model) +            application_name=primary_site_app_name, +            model_name=primary_site_model, +            pools=primary_site_pools) +        self.wait_for_mirror_state( +            'up+unknown', +            application_name=secondary_site_app_name, +            model_name=secondary_site_model, +            pools=secondary_site_pools) + +        # Run the 'promote' Juju action against the secondary site. +        logging.info('Promoting {} from model {}.'.format( +            secondary_site_app_name, secondary_site_model)) result = zaza.model.run_action_on_leader( -            'ceph-rbd-mirror' + self.site_b_app_suffix, +            secondary_site_app_name, 'promote', -            model_name=self.site_b_model, -            action_params={}) +            model_name=secondary_site_model, +            action_params={ +                'pools': ','.join(secondary_site_pools) +            }) logging.info(result.results) +        self.assertEqual(int(result.results['Code']), 0) + +        # Validate that the promoted pools count matches the total secondary +        # site pools count. n_pools_promoted = len(result.results['output'].split('\n')) -        self.assertEqual(len(site_b_pools), n_pools_promoted) +        self.assertEqual(len(secondary_site_pools), n_pools_promoted) + +        # Validate that the Ceph images from the newly promoted site +        # report 'up+stopped' state (which is reported by primary Ceph images). +        logging.info('Waiting until {} is promoted.'.format( +            secondary_site_app_name)) +        self.wait_for_mirror_state( +            'up+stopped', +            application_name=secondary_site_app_name, +            model_name=secondary_site_model, +            pools=secondary_site_pools) + +        # Validate that the Ceph images from site-a report 'up+replaying' +        # (which is reported by secondary Ceph images). 
self.wait_for_mirror_state( 'up+replaying', - model_name=self.site_a_model) + check_entries_behind_master=True, + application_name=primary_site_app_name, + model_name=primary_site_model, + pools=primary_site_pools) + + def test_100_cinder_failover(self): + """Validate controlled failover via the Cinder API. + + This test only makes sense if Cinder RBD mirroring mode is 'image'. + It will return early, if this is not the case. + """ + cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode( + self.cinder_ceph_app_name) + if cinder_rbd_mirroring_mode != 'image': + logging.warning( + "Skipping 'test_100_cinder_failover' since Cinder RBD " + "mirroring mode is {}.".format(cinder_rbd_mirroring_mode)) + return + + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) + + # Check if the Cinder volume host is available with replication + # enabled. + host = 'cinder@{}'.format(self.cinder_ceph_app_name) + svc = cinder.services.list(host=host, binary='cinder-volume')[0] + self.assertEqual(svc.replication_status, 'enabled') + self.assertEqual(svc.status, 'enabled') + + # Setup the test Cinder volume + volume = self.setup_test_cinder_volume() + + # Check if the volume is properly mirrored self.wait_for_mirror_state( - 'up+stopped', + 'up+replaying', + check_entries_behind_master=True, application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model) - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, - 'demote', model_name=self.site_b_model, - action_params={ - }) - logging.info(result.results) - n_pools_demoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_a_pools), n_pools_demoted) - self.wait_for_mirror_state( - 'up+unknown', - model_name=self.site_a_model) - self.wait_for_mirror_state( - 'up+unknown', - application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model) + 
pools=[self.cinder_ceph_app_name]) + + # Execute the Cinder volume failover + openstack.failover_cinder_volume_host( + cinder=cinder, + backend_name=self.cinder_ceph_app_name, + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over') + + # Check if the test volume is still available after failover + self.assertEqual(cinder.volumes.get(volume.id).status, 'available') + + def test_101_cinder_failback(self): + """Validate controlled failback via the Cinder API. + + This test only makes sense if Cinder RBD mirroring mode is 'image'. + It will return early, if this is not the case. + + The test needs to be executed when the Cinder volume host is already + failed-over with the test volume on it. + """ + cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode( + self.cinder_ceph_app_name) + if cinder_rbd_mirroring_mode != 'image': + logging.warning( + "Skipping 'test_101_cinder_failback' since Cinder RBD " + "mirroring mode is {}.".format(cinder_rbd_mirroring_mode)) + return + + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) + + # Check if the Cinder volume host is already failed-over + host = 'cinder@{}'.format(self.cinder_ceph_app_name) + svc = cinder.services.list(host=host, binary='cinder-volume')[0] + self.assertEqual(svc.replication_status, 'failed-over') + self.assertEqual(svc.status, 'disabled') + + # Check if the test Cinder volume is already present. The method + # 'cinder.volumes.find' raises 404 if the volume is not found. 
+ volume = cinder.volumes.find(name=self.test_cinder_volume_name) + + # Execute the Cinder volume failback + openstack.failover_cinder_volume_host( + cinder=cinder, + backend_name=self.cinder_ceph_app_name, + target_backend_id='default', + target_status='enabled', + target_replication_status='enabled') + + # Check if the test volume is still available after failback + self.assertEqual(cinder.volumes.get(volume.id).status, 'available') + + def test_200_juju_failover(self): + """Validate controlled failover via Juju actions.""" + # Get the Ceph pools needed to failover + site_a_pools, site_b_pools = self.get_failover_pools() + + # Execute the failover Juju actions with the appropriate parameters. + site_b_app_name = self.application_name + self.site_b_app_suffix + self.execute_failover_juju_actions( + primary_site_app_name=self.application_name, + primary_site_model=self.site_a_model, + primary_site_pools=site_a_pools, + secondary_site_app_name=site_b_app_name, + secondary_site_model=self.site_b_model, + secondary_site_pools=site_b_pools) + + def test_201_juju_failback(self): + """Validate controlled failback via Juju actions.""" + # Get the Ceph pools needed to failback + site_a_pools, site_b_pools = self.get_failover_pools() + + # Execute the failover Juju actions with the appropriate parameters. + # The failback operation is just a failover with sites in reverse + # order. + site_b_app_name = self.application_name + self.site_b_app_suffix + self.execute_failover_juju_actions( + primary_site_app_name=site_b_app_name, + primary_site_model=self.site_b_model, + primary_site_pools=site_b_pools, + secondary_site_app_name=self.application_name, + secondary_site_model=self.site_a_model, + secondary_site_pools=site_a_pools) + + def test_203_juju_resync(self): + """Validate the 'resync-pools' Juju action. + + The 'resync-pools' Juju action is meant to flag Ceph images from the + secondary site to re-sync against the Ceph images from the primary + site. 
+ + This use case is useful when the Ceph secondary images are out of sync. + """ + # Get the Ceph pools needed to failback + _, site_b_pools = self.get_failover_pools() + + # Run the 'resync-pools' Juju action against the pools from site-b. + # This will make sure that the Ceph images from site-b are properly + # synced with the primary images from site-a. + site_b_app_name = self.application_name + self.site_b_app_suffix + logging.info('Re-syncing {} from model {}'.format( + site_b_app_name, self.site_b_model)) result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror', - 'promote', - model_name=self.site_a_model, - action_params={ - }) - logging.info(result.results) - n_pools_promoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_b_pools), n_pools_promoted) - self.wait_for_mirror_state( - 'up+stopped', - model_name=self.site_a_model) - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, + site_b_app_name, 'resync-pools', model_name=self.site_b_model, action_params={ + 'pools': ','.join(site_b_pools), 'i-really-mean-it': True, }) logging.info(result.results) + self.assertEqual(int(result.results['Code']), 0) + + # Validate that the Ceph images from site-b report 'up+replaying' + # (which is reported by secondary Ceph images). And check that images + # exist in Cinder and Glance pools. 
self.wait_for_mirror_state( 'up+replaying', -            application_name=self.application_name + self.site_b_app_suffix, +            check_entries_behind_master=True, +            application_name=site_b_app_name, model_name=self.site_b_model, -            require_images_in=['cinder-ceph', 'glance']) +            require_images_in=[self.cinder_ceph_app_name, 'glance'], +            pools=site_b_pools) class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` destructive tests.""" -    def test_kill_site_a_fail_over(self): -        """Validate fail over after uncontrolled shutdown of primary.""" -        for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd': +    def apply_cinder_ceph_workaround(self): +        """Set minimal timeouts / retries to the Cinder Ceph backend. + +        This is needed because the failover via Cinder API will try to do a +        demotion of the site-a. However, when site-a is down, and with the +        default timeouts / retries, the operation takes an unreasonable amount +        of time (or sometimes it never finishes). +        """ +        # These new config options need to be set under the Cinder Ceph backend +        # section in the main Cinder config file. +        # At the moment, we don't have the possibility of using Juju config to set +        # these options. And also, it's not even a good practice to have them +        # in production. +        # These should be set only to do the Ceph failover via Cinder API, and +        # they need to be removed after. +        configs = { +            'rados_connect_timeout': '1', +            'rados_connection_retries': '1', +            'rados_connection_interval': '0', +            'replication_connect_timeout': '1', +        } + +        # Small Python script that will be executed via Juju run to update +        # the Cinder config file. 
+ update_cinder_conf_script = ( + "import configparser; " + "config = configparser.ConfigParser(); " + "config.read('/etc/cinder/cinder.conf'); " + "{}" + "f = open('/etc/cinder/cinder.conf', 'w'); " + "config.write(f); " + "f.close()") + set_cmd = '' + for cfg_name in configs: + set_cmd += "config.set('{0}', '{1}', '{2}'); ".format( + self.cinder_ceph_app_name, cfg_name, configs[cfg_name]) + script = update_cinder_conf_script.format(set_cmd) + + # Run the workaround script via Juju run + zaza.model.run_on_leader( + self.cinder_ceph_app_name, + 'python3 -c "{}"; systemctl restart cinder-volume'.format(script)) + + def kill_primary_site(self): + """Simulate an unexpected primary site shutdown.""" + logging.info('Killing the Ceph primary site') + for application in ['ceph-rbd-mirror', 'ceph-mon', 'ceph-osd']: zaza.model.remove_application( application, model_name=self.site_a_model, forcefully_remove_machines=True) + + def test_100_forced_juju_failover(self): + """Validate Ceph failover via Juju when the primary site is down. + + * Kill the primary site + * Execute the forced failover via Juju actions + """ + # Get the site-b Ceph pools that need to be promoted + _, site_b_pools = self.get_failover_pools() + site_b_app_name = self.application_name + self.site_b_app_suffix + + # Simulate primary site unexpected shutdown + self.kill_primary_site() + + # Try and promote the site-b to primary. result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, + site_b_app_name, 'promote', model_name=self.site_b_model, action_params={ + 'pools': ','.join(site_b_pools), }) + self.assertEqual(int(result.results['Code']), 0) + + # The site-b 'promote' Juju action is expected to fail, because the + # primary site is down. self.assertEqual(result.status, 'failed') + + # Retry to promote site-b using the 'force' Juju action parameter. 
result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, + site_b_app_name, 'promote', model_name=self.site_b_model, action_params={ 'force': True, + 'pools': ','.join(site_b_pools), }) + self.assertEqual(int(result.results['Code']), 0) + + # Validate successful Juju action execution self.assertEqual(result.status, 'completed') + + def test_200_forced_cinder_failover(self): + """Validate Ceph failover via Cinder when the primary site is down. + + This test only makes sense if Cinder RBD mirroring mode is 'image'. + It will return early, if this is not the case. + + This assumes that the primary site is already killed. + """ + cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode( + self.cinder_ceph_app_name) + if cinder_rbd_mirroring_mode != 'image': + logging.warning( + "Skipping 'test_200_cinder_failover_without_primary_site' " + "since Cinder RBD mirroring mode is {}.".format( + cinder_rbd_mirroring_mode)) + return + + # Make sure that the Cinder Ceph backend workaround is applied. + self.apply_cinder_ceph_workaround() + + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) + openstack.failover_cinder_volume_host( + cinder=cinder, + backend_name=self.cinder_ceph_app_name, + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over') + + # Check that the Cinder volumes are still available after forced + # failover. 
+ for volume in cinder.volumes.list(): + self.assertEqual(volume.status, 'available') diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index d277656..4ed75b9 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -2489,6 +2489,40 @@ def attach_volume(nova, volume_id, instance_id): device='/dev/vdx') +def failover_cinder_volume_host(cinder, backend_name='cinder-ceph', + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over'): + """Failover Cinder volume host with replication enabled. + + :param cinder: Authenticated cinderclient + :type cinder: cinder.Client + :param backend_name: Cinder volume backend name with + replication enabled. + :type backend_name: str + :param target_backend_id: Failover target Cinder backend id. + :type target_backend_id: str + :param target_status: Target Cinder volume status after failover. + :type target_status: str + :param target_replication_status: Target Cinder volume replication + status after failover. + :type target_replication_status: str + :raises: AssertionError + """ + host = 'cinder@{}'.format(backend_name) + logging.info('Failover Cinder volume host %s to backend_id %s', + host, target_backend_id) + cinder.services.failover_host(host=host, backend_id=target_backend_id) + for attempt in tenacity.Retrying( + retry=tenacity.retry_if_exception_type(AssertionError), + stop=tenacity.stop_after_attempt(10), + wait=tenacity.wait_exponential(multiplier=1, min=2, max=10)): + with attempt: + svc = cinder.services.list(host=host, binary='cinder-volume')[0] + assert svc.status == target_status + assert svc.replication_status == target_replication_status + + def create_volume_backup(cinder, volume_id, name=None): """Create cinder volume backup.