From d41f8b37240b794442c873327696ffe2ed9d4fbb Mon Sep 17 00:00:00 2001 From: Ionut Balutoiu Date: Wed, 20 Jan 2021 20:10:34 +0200 Subject: [PATCH 1/2] Update Ceph RBD Mirror tests The updated tests add the possibility of testing deployments with `image` RBD mirroring mode implemented as part of the Cinder Ceph Replication charm spec. --- .../charm_tests/ceph/rbd_mirror/tests.py | 308 ++++++++++++++---- 1 file changed, 242 insertions(+), 66 deletions(-) diff --git a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py index 6c2fa1b..75a3f42 100644 --- a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py +++ b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py @@ -16,6 +16,9 @@ import json import logging import re +import time + +import cinderclient.exceptions as cinder_exceptions import zaza.openstack.charm_tests.test_utils as test_utils @@ -39,16 +42,20 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model() cls.site_b_app_suffix = '-b' - def run_status_action(self, application_name=None, model_name=None): + def run_status_action(self, application_name=None, model_name=None, + pools=[]): """Run status action, decode and return response.""" + action_params = { + 'verbose': True, + 'format': 'json', + } + if len(pools) > 0: + action_params['pools'] = ','.join(pools) result = zaza.model.run_action_on_leader( application_name or self.application_name, 'status', model_name=model_name, - action_params={ - 'verbose': True, - 'format': 'json', - }) + action_params=action_params) return json.loads(result.results['output']) def get_pools(self): @@ -71,7 +78,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): def wait_for_mirror_state(self, state, application_name=None, model_name=None, check_entries_behind_master=False, - require_images_in=[]): + require_images_in=[], + pools=[]): """Wait until all images reach requested state. 
This function runs the ``status`` action and examines the data it @@ -90,6 +98,9 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): :type check_entries_behind_master: bool :param require_images_in: List of pools to require images in :type require_images_in: list of str + :param pools: List of pools to run status on. If this is empty, the + status action will run on all the pools. + :type pools: list of str :returns: True on success, never returns on failure """ rep = re.compile(r'.*entries_behind_master=(\d+)') @@ -97,7 +108,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): try: # encapsulate in try except to work around LP: #1820976 pool_status = self.run_status_action( - application_name=application_name, model_name=model_name) + application_name=application_name, model_name=model_name, + pools=pools) except KeyError: continue for pool, status in pool_status.items(): @@ -124,6 +136,119 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): # all images with state has expected state return True + def get_cinder_rbd_mirroring_mode(self, + cinder_ceph_app_name='cinder-ceph'): + """Get the RBD mirroring mode for the Cinder Ceph pool. + + :returns: A string representing the RBD mirroring mode. It can be + either 'pool' or 'image'. + """ + DEFAULT_RBD_MIRRORING_MODE = 'pool' + + rbd_mirroring_mode_config = zaza.model.get_application_config( + cinder_ceph_app_name).get('rbd-mirroring-mode') + if rbd_mirroring_mode_config: + rbd_mirroring_mode = rbd_mirroring_mode_config.get( + 'value', DEFAULT_RBD_MIRRORING_MODE).lower() + else: + rbd_mirroring_mode = DEFAULT_RBD_MIRRORING_MODE + + return rbd_mirroring_mode + + def create_cinder_volume(self, session, from_image=False): + """Create Cinder Volume from image. + + :rtype: :class:`Volume`. 
+ """ + def get_glance_image(session): + glance = openstack.get_glance_session_client(session) + images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) + if images: + return images[0] + logging.info("Failed to find {} image, falling back to {}".format( + CIRROS_IMAGE_NAME, + LTS_IMAGE_NAME)) + return openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] + + def create_volume_type(cinder): + try: + vol_type = cinder.volume_types.find(name='repl') + except cinder_exceptions.NotFound: + vol_type = cinder.volume_types.create('repl') + vol_type.set_keys(metadata={ + 'volume_backend_name': 'cinder-ceph', + 'replication_enabled': ' True', + }) + return vol_type + + # NOTE(fnordahl): for some reason create volume from image often fails + # when run just after deployment is finished. We should figure out + # why, resolve the underlying issue and then remove this. + # + # We do not use tenacity here as it will interfere with tenacity used + # in ``resource_reaches_status`` + def create_volume(cinder, volume_params, retry=20): + if retry < 1: + return + volume = cinder.volumes.create(**volume_params) + try: + # Note(coreycb): stop_after_attempt is increased because using + # juju storage for ceph-osd backed by cinder on undercloud + # takes longer than the prior method of directory-backed OSD + # devices. 
+ openstack.resource_reaches_status( + cinder.volumes, volume.id, msg='volume', + stop_after_attempt=20) + return volume + except AssertionError: + logging.info('retrying') + volume.delete() + return create_volume(cinder, volume_params, retry=retry - 1) + + volume_params = { + 'size': 8, + 'name': 'zaza', + } + if from_image: + volume_params['imageRef'] = get_glance_image(session).id + cinder = openstack.get_cinder_session_client(session) + if self.get_cinder_rbd_mirroring_mode() == 'image': + volume_params['volume_type'] = create_volume_type(cinder).id + + return create_volume(cinder, volume_params) + + def failover_cinder_volume_host(self, cinder_client, + backend_name='cinder-ceph', + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over', + timeout=300): + """Failover Cinder volume host.""" + host = 'cinder@{}'.format(backend_name) + logging.info( + 'Failover Cinder host %s to backend_id %s', + host, target_backend_id) + cinder_client.services.failover_host( + host=host, + backend_id=target_backend_id) + start = time.time() + while True: + elapsed = time.time() - start + if elapsed > timeout: + raise cinder_exceptions.TimeoutException( + obj=cinder_client.services, + action='failover_host') + service = cinder_client.services.list( + host=host, + binary='cinder-volume')[0] + if (service.status == target_status and + service.replication_status == target_replication_status): + break + time.sleep(5) + logging.info( + 'Successfully failed-over Cinder host %s to backend_id %s', + host, target_backend_id) + class CephRBDMirrorTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` tests.""" @@ -196,43 +321,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase): test. 
""" session = openstack.get_overcloud_keystone_session() - glance = openstack.get_glance_session_client(session) - cinder = openstack.get_cinder_session_client(session) - - images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) - if images: - image = images[0] - else: - logging.info("Failed to find {} image, falling back to {}".format( - CIRROS_IMAGE_NAME, - LTS_IMAGE_NAME)) - image = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] - - # NOTE(fnordahl): for some reason create volume from image often fails - # when run just after deployment is finished. We should figure out - # why, resolve the underlying issue and then remove this. - # - # We do not use tenacity here as it will interfere with tenacity used - # in ``resource_reaches_status`` - def create_volume_from_image(cinder, image, retry=20): - if retry < 1: - return - volume = cinder.volumes.create(8, name='zaza', imageRef=image.id) - try: - # Note(coreycb): stop_after_attempt is increased because using - # juju storage for ceph-osd backed by cinder on undercloud - # takes longer than the prior method of directory-backed OSD - # devices. 
- openstack.resource_reaches_status( - cinder.volumes, volume.id, msg='volume', - stop_after_attempt=20) - return volume - except AssertionError: - logging.info('retrying') - volume.delete() - return create_volume_from_image(cinder, image, retry=retry - 1) - volume = create_volume_from_image(cinder, image) - + volume = self.create_cinder_volume(session, from_image=True) site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash( zaza.model.get_lead_unit_name('ceph-mon', model_name=self.site_a_model), @@ -258,85 +347,170 @@ class CephRBDMirrorTest(CephRBDMirrorBase): class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` controlled failover tests.""" + def cinder_fail_over_fall_back(self): + """Validate controlled fail over and fall back via the Cinder API.""" + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session) + volume = self.create_cinder_volume(session, from_image=True) + self.wait_for_mirror_state( + 'up+replaying', + check_entries_behind_master=True, + application_name=self.application_name + self.site_b_app_suffix, + model_name=self.site_b_model, + pools=['cinder-ceph']) + self.failover_cinder_volume_host( + cinder_client=cinder) + self.assertEqual(cinder.volumes.get(volume.id).status, 'available') + self.failover_cinder_volume_host( + cinder_client=cinder, + target_backend_id='default', + target_status='enabled', + target_replication_status='enabled') + self.assertEqual(cinder.volumes.get(volume.id).status, 'available') + def test_fail_over_fall_back(self): """Validate controlled fail over and fall back.""" site_a_pools, site_b_pools = self.get_pools() + site_a_action_params = {} + site_b_action_params = {} + if self.get_cinder_rbd_mirroring_mode() == 'image': + site_a_pools.remove('cinder-ceph') + site_a_action_params['pools'] = ','.join(site_a_pools) + site_b_pools.remove('cinder-ceph') + site_b_action_params['pools'] = ','.join(site_b_pools) result = 
zaza.model.run_action_on_leader( 'ceph-rbd-mirror', 'demote', model_name=self.site_a_model, - action_params={}) + action_params=site_a_action_params) logging.info(result.results) n_pools_demoted = len(result.results['output'].split('\n')) self.assertEqual(len(site_a_pools), n_pools_demoted) - self.wait_for_mirror_state('up+unknown', model_name=self.site_a_model) + self.wait_for_mirror_state( + 'up+unknown', + model_name=self.site_a_model, + pools=site_a_pools) self.wait_for_mirror_state( 'up+unknown', application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model) + model_name=self.site_b_model, + pools=site_b_pools) result = zaza.model.run_action_on_leader( 'ceph-rbd-mirror' + self.site_b_app_suffix, 'promote', model_name=self.site_b_model, - action_params={}) + action_params=site_b_action_params) logging.info(result.results) n_pools_promoted = len(result.results['output'].split('\n')) self.assertEqual(len(site_b_pools), n_pools_promoted) self.wait_for_mirror_state( 'up+replaying', - model_name=self.site_a_model) + model_name=self.site_a_model, + pools=site_a_pools) self.wait_for_mirror_state( 'up+stopped', application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model) + model_name=self.site_b_model, + pools=site_b_pools) result = zaza.model.run_action_on_leader( 'ceph-rbd-mirror' + self.site_b_app_suffix, 'demote', model_name=self.site_b_model, - action_params={ - }) + action_params=site_b_action_params) logging.info(result.results) n_pools_demoted = len(result.results['output'].split('\n')) self.assertEqual(len(site_a_pools), n_pools_demoted) self.wait_for_mirror_state( 'up+unknown', - model_name=self.site_a_model) + model_name=self.site_a_model, + pools=site_a_pools) self.wait_for_mirror_state( 'up+unknown', application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model) + model_name=self.site_b_model, + pools=site_b_pools) result = 
zaza.model.run_action_on_leader( 'ceph-rbd-mirror', 'promote', model_name=self.site_a_model, - action_params={ - }) + action_params=site_a_action_params) logging.info(result.results) n_pools_promoted = len(result.results['output'].split('\n')) self.assertEqual(len(site_b_pools), n_pools_promoted) self.wait_for_mirror_state( 'up+stopped', - model_name=self.site_a_model) + model_name=self.site_a_model, + pools=site_a_pools) + action_params = { + 'i-really-mean-it': True, + } + if self.get_cinder_rbd_mirroring_mode() == 'image': + action_params['pools'] = site_b_action_params['pools'] result = zaza.model.run_action_on_leader( 'ceph-rbd-mirror' + self.site_b_app_suffix, 'resync-pools', model_name=self.site_b_model, - action_params={ - 'i-really-mean-it': True, - }) + action_params=action_params) logging.info(result.results) self.wait_for_mirror_state( 'up+replaying', application_name=self.application_name + self.site_b_app_suffix, model_name=self.site_b_model, - require_images_in=['cinder-ceph', 'glance']) + require_images_in=['cinder-ceph', 'glance'], + pools=site_a_pools) + if self.get_cinder_rbd_mirroring_mode() == 'image': + self.cinder_fail_over_fall_back() class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` destructive tests.""" + def forced_failover_cinder_volume_host(self, cinder_client): + """Validate forced Cinder volume host fail over.""" + def apply_cinder_workaround(): + """Set minimal timeouts / retries to the Cinder Ceph backend. + + This is needed because the failover via Cinder will try to do a + demotion of the site-a, and with the default timeouts / retries, + the operation takes an unreasonably amount of time. 
+ """ + cinder_configs = { + 'rados_connect_timeout': '1', + 'rados_connection_retries': '1', + 'rados_connection_interval': '0', + 'replication_connect_timeout': '1', + } + update_cinder_conf_cmd = ( + "import configparser; " + "config = configparser.ConfigParser(); " + "config.read('/etc/cinder/cinder.conf'); " + "{}" + "f = open('/etc/cinder/cinder.conf', 'w'); " + "config.write(f); " + "f.close()") + cmd = '' + for config in cinder_configs: + cmd += "config.set('cinder-ceph', '{0}', '{1}'); ".format( + config, cinder_configs[config]) + cmd = update_cinder_conf_cmd.format(cmd) + zaza.model.run_on_leader( + 'cinder-ceph', + 'python3 -c "{}"; systemctl restart cinder-volume'.format(cmd)) + + apply_cinder_workaround() + self.failover_cinder_volume_host(cinder_client) + + for volume in cinder_client.volumes.list(): + self.assertEqual(volume.status, 'available') + def test_kill_site_a_fail_over(self): """Validate fail over after uncontrolled shutdown of primary.""" + action_params = {} + if self.get_cinder_rbd_mirroring_mode() == 'image': + _, site_b_pools = self.get_pools() + site_b_pools.remove('cinder-ceph') + action_params['pools'] = ','.join(site_b_pools) + for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd': zaza.model.remove_application( application, @@ -346,14 +520,16 @@ class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase): 'ceph-rbd-mirror' + self.site_b_app_suffix, 'promote', model_name=self.site_b_model, - action_params={ - }) + action_params=action_params) self.assertEqual(result.status, 'failed') + action_params['force'] = True result = zaza.model.run_action_on_leader( 'ceph-rbd-mirror' + self.site_b_app_suffix, 'promote', model_name=self.site_b_model, - action_params={ - 'force': True, - }) + action_params=action_params) self.assertEqual(result.status, 'completed') + if self.get_cinder_rbd_mirroring_mode() == 'image': + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session) + 
self.forced_failover_cinder_volume_host(cinder) From 2fefca5a406647a1efcd91e5f67441766f6976b7 Mon Sep 17 00:00:00 2001 From: Ionut Balutoiu Date: Tue, 2 Mar 2021 18:03:06 +0200 Subject: [PATCH 2/2] Code cleanup * Move general functions out of the testing class. This will make them easier to reuse. * Properly organize the tests, and add tons of docstrings and comments to have them as clear as possible. * Add `failover_cinder_volume_host` to the Zaza `utilities/openstack.py`, since this is a general purpose function. --- .../charm_tests/ceph/rbd_mirror/tests.py | 776 ++++++++++++------ zaza/openstack/utilities/openstack.py | 34 + 2 files changed, 558 insertions(+), 252 deletions(-) diff --git a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py index 75a3f42..d8d7967 100644 --- a/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py +++ b/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py @@ -16,7 +16,6 @@ import json import logging import re -import time import cinderclient.exceptions as cinder_exceptions @@ -31,6 +30,129 @@ from zaza.openstack.charm_tests.glance.setup import ( CIRROS_IMAGE_NAME) +DEFAULT_CINDER_RBD_MIRRORING_MODE = 'pool' + + +def get_cinder_rbd_mirroring_mode(cinder_ceph_app_name='cinder-ceph'): + """Get the RBD mirroring mode for the Cinder Ceph pool. + + :param cinder_ceph_app_name: Cinder Ceph Juju application name. + :type cinder_ceph_app_name: str + :returns: A string representing the RBD mirroring mode. It can be + either 'pool' or 'image'. 
+ :rtype: str + """ + rbd_mirroring_mode_config = zaza.model.get_application_config( + cinder_ceph_app_name).get('rbd-mirroring-mode') + if rbd_mirroring_mode_config: + rbd_mirroring_mode = rbd_mirroring_mode_config.get( + 'value', DEFAULT_CINDER_RBD_MIRRORING_MODE).lower() + else: + rbd_mirroring_mode = DEFAULT_CINDER_RBD_MIRRORING_MODE + + return rbd_mirroring_mode + + +def get_glance_image(glance): + """Get the Glance image object to be used by the Ceph tests. + + It looks for the Cirros Glance image, and it's returned if it's found. + If the Cirros image is not found, it will try and find the Ubuntu + LTS image. + + :param glance: Authenticated glanceclient + :type glance: glanceclient.Client + :returns: Glance image object + :rtype: glanceclient.image + """ + images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) + if images: + return images[0] + logging.info("Failed to find {} image, falling back to {}".format( + CIRROS_IMAGE_NAME, + LTS_IMAGE_NAME)) + return openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] + + +def setup_cinder_repl_volume_type(cinder, type_name='repl', + backend_name='cinder-ceph'): + """Set up the Cinder volume replication type. + + :param cinder: Authenticated cinderclient + :type cinder: cinder.Client + :param type_name: Cinder volume type name + :type type_name: str + :param backend_name: Cinder volume backend name with replication enabled. + :type backend_name: str + :returns: Cinder volume type object + :rtype: cinderclient.VolumeType + """ + try: + vol_type = cinder.volume_types.find(name=type_name) + except cinder_exceptions.NotFound: + vol_type = cinder.volume_types.create(type_name) + + vol_type.set_keys(metadata={ + 'volume_backend_name': backend_name, + 'replication_enabled': ' True', + }) + return vol_type + + +# TODO: This function should be incorporated into +# 'zaza.openstack.utilities.openstack.create_volume' helper, once the below +# flakiness comments are addressed. 
+def create_cinder_volume(cinder, name='zaza', image_id=None, type_id=None): + """Create a new Cinder volume. + + :param cinder: Authenticated cinderclient. + :type cinder: cinder.Client + :param name: Volume name. + :type name: str + :param image_id: Glance image id, if the volume is created from image. + :type image_id: str + :param type_id: Cinder Volume type id, if the volume needs to use an + explicit volume type. + :type type_id: boolean + :returns: Cinder volume + :rtype: :class:`Volume`. + """ + # NOTE(fnordahl): for some reason create volume from image often fails + # when run just after deployment is finished. We should figure out + # why, resolve the underlying issue and then remove this. + # + # We do not use tenacity here as it will interfere with tenacity used + # in ``resource_reaches_status`` + def create_volume(cinder, volume_params, retry=20): + if retry < 1: + return + volume = cinder.volumes.create(**volume_params) + try: + # Note(coreycb): stop_after_attempt is increased because using + # juju storage for ceph-osd backed by cinder on undercloud + # takes longer than the prior method of directory-backed OSD + # devices. 
+ openstack.resource_reaches_status( + cinder.volumes, volume.id, msg='volume', + stop_after_attempt=20) + return volume + except AssertionError: + logging.info('retrying') + volume.delete() + return create_volume(cinder, volume_params, retry=retry - 1) + + volume_params = { + 'size': 8, + 'name': name, + } + if image_id: + volume_params['imageRef'] = image_id + if type_id: + volume_params['volume_type'] = type_id + + return create_volume(cinder, volume_params) + + class CephRBDMirrorBase(test_utils.OpenStackBaseTest): """Base class for ``ceph-rbd-mirror`` tests.""" @@ -38,6 +160,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): def setUpClass(cls): """Run setup for ``ceph-rbd-mirror`` tests.""" super().setUpClass() + cls.cinder_ceph_app_name = 'cinder-ceph' + cls.test_cinder_volume_name = 'test-cinder-ceph-volume' # get ready for multi-model Zaza cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model() cls.site_b_app_suffix = '-b' @@ -75,6 +199,21 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): model_name=self.site_b_model) return sorted(site_a_pools.keys()), sorted(site_b_pools.keys()) + def get_failover_pools(self): + """Get the failover Ceph pools' names, from both sites. + + If the Cinder RBD mirroring mode is 'image', the 'cinder-ceph' pool + needs to be excluded, since Cinder orchestrates the failover then. + + :returns: Tuple with site-a pools and site-b pools. 
+ :rtype: Tuple[List[str], List[str]] + """ + site_a_pools, site_b_pools = self.get_pools() + if get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) == 'image': + site_a_pools.remove(self.cinder_ceph_app_name) + site_b_pools.remove(self.cinder_ceph_app_name) + return site_a_pools, site_b_pools + def wait_for_mirror_state(self, state, application_name=None, model_name=None, check_entries_behind_master=False, @@ -136,118 +275,40 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest): # all images with state has expected state return True - def get_cinder_rbd_mirroring_mode(self, - cinder_ceph_app_name='cinder-ceph'): - """Get the RBD mirroring mode for the Cinder Ceph pool. + def setup_test_cinder_volume(self): + """Set up the test Cinder volume into the Ceph RBD mirror environment. - :returns: A string representing the RBD mirroring mode. It can be - either 'pool' or 'image'. - """ - DEFAULT_RBD_MIRRORING_MODE = 'pool' + If the volume already exists, then it's returned. - rbd_mirroring_mode_config = zaza.model.get_application_config( - cinder_ceph_app_name).get('rbd-mirroring-mode') - if rbd_mirroring_mode_config: - rbd_mirroring_mode = rbd_mirroring_mode_config.get( - 'value', DEFAULT_RBD_MIRRORING_MODE).lower() - else: - rbd_mirroring_mode = DEFAULT_RBD_MIRRORING_MODE - - return rbd_mirroring_mode - - def create_cinder_volume(self, session, from_image=False): - """Create Cinder Volume from image. + Also, if the Cinder RBD mirroring mode is 'image', the volume will + use an explicit volume type with the appropriate replication flags. + Otherwise, it is just a simple Cinder volume using the default backend. + :returns: Cinder volume :rtype: :class:`Volume`. 
""" - def get_glance_image(session): - glance = openstack.get_glance_session_client(session) - images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME) - if images: - return images[0] - logging.info("Failed to find {} image, falling back to {}".format( - CIRROS_IMAGE_NAME, - LTS_IMAGE_NAME)) - return openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0] + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) - def create_volume_type(cinder): - try: - vol_type = cinder.volume_types.find(name='repl') - except cinder_exceptions.NotFound: - vol_type = cinder.volume_types.create('repl') - vol_type.set_keys(metadata={ - 'volume_backend_name': 'cinder-ceph', - 'replication_enabled': ' True', - }) - return vol_type + try: + return cinder.volumes.find(name=self.test_cinder_volume_name) + except cinder_exceptions.NotFound: + logging.info("Test Cinder volume doesn't exist. Creating it") - # NOTE(fnordahl): for some reason create volume from image often fails - # when run just after deployment is finished. We should figure out - # why, resolve the underlying issue and then remove this. - # - # We do not use tenacity here as it will interfere with tenacity used - # in ``resource_reaches_status`` - def create_volume(cinder, volume_params, retry=20): - if retry < 1: - return - volume = cinder.volumes.create(**volume_params) - try: - # Note(coreycb): stop_after_attempt is increased because using - # juju storage for ceph-osd backed by cinder on undercloud - # takes longer than the prior method of directory-backed OSD - # devices. 
- openstack.resource_reaches_status( - cinder.volumes, volume.id, msg='volume', - stop_after_attempt=20) - return volume - except AssertionError: - logging.info('retrying') - volume.delete() - return create_volume(cinder, volume_params, retry=retry - 1) - - volume_params = { - 'size': 8, - 'name': 'zaza', + glance = openstack.get_glance_session_client(session) + image = get_glance_image(glance) + kwargs = { + 'cinder': cinder, + 'name': self.test_cinder_volume_name, + 'image_id': image.id, } - if from_image: - volume_params['imageRef'] = get_glance_image(session).id - cinder = openstack.get_cinder_session_client(session) - if self.get_cinder_rbd_mirroring_mode() == 'image': - volume_params['volume_type'] = create_volume_type(cinder).id + if get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) == 'image': + volume_type = setup_cinder_repl_volume_type( + cinder, + backend_name=self.cinder_ceph_app_name) + kwargs['type_id'] = volume_type.id - return create_volume(cinder, volume_params) - - def failover_cinder_volume_host(self, cinder_client, - backend_name='cinder-ceph', - target_backend_id='ceph', - target_status='disabled', - target_replication_status='failed-over', - timeout=300): - """Failover Cinder volume host.""" - host = 'cinder@{}'.format(backend_name) - logging.info( - 'Failover Cinder host %s to backend_id %s', - host, target_backend_id) - cinder_client.services.failover_host( - host=host, - backend_id=target_backend_id) - start = time.time() - while True: - elapsed = time.time() - start - if elapsed > timeout: - raise cinder_exceptions.TimeoutException( - obj=cinder_client.services, - action='failover_host') - service = cinder_client.services.list( - host=host, - binary='cinder-volume')[0] - if (service.status == target_status and - service.replication_status == target_replication_status): - break - time.sleep(5) - logging.info( - 'Successfully failed-over Cinder host %s to backend_id %s', - host, target_backend_id) + return 
create_cinder_volume(**kwargs) class CephRBDMirrorTest(CephRBDMirrorBase): @@ -320,8 +381,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase): site B and subsequently comparing the contents we get a full end to end test. """ - session = openstack.get_overcloud_keystone_session() - volume = self.create_cinder_volume(session, from_image=True) + volume = self.setup_test_cinder_volume() site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash( zaza.model.get_lead_unit_name('ceph-mon', model_name=self.site_a_model), @@ -333,6 +393,8 @@ class CephRBDMirrorTest(CephRBDMirrorBase): check_entries_behind_master=True, application_name=self.application_name + self.site_b_app_suffix, model_name=self.site_b_model) + logging.info('Checking the Ceph RBD hashes of the primary and ' + 'the secondary Ceph images') site_b_hash = zaza.openstack.utilities.ceph.get_rbd_hash( zaza.model.get_lead_unit_name('ceph-mon' + self.site_b_app_suffix, model_name=self.site_b_model), @@ -347,189 +409,399 @@ class CephRBDMirrorTest(CephRBDMirrorBase): class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` controlled failover tests.""" - def cinder_fail_over_fall_back(self): - """Validate controlled fail over and fall back via the Cinder API.""" + def execute_failover_juju_actions(self, + primary_site_app_name, + primary_site_model, + primary_site_pools, + secondary_site_app_name, + secondary_site_model, + secondary_site_pools): + """Execute the failover Juju actions. + + The failover / failback via Juju actions shares the same workflow. The + failback is just a failover with sites in reversed order. + + This function encapsulates the tasks to failover a primary site to + a secondary site: + 1. Demote primary site + 2. Validation of the primary site demotion + 3. Promote secondary site + 4. Validation of the secondary site promotion + + :param primary_site_app_name: Primary site Ceph RBD mirror app name. 
+ :type primary_site_app_name: str + :param primary_site_model: Primary site Juju model name. + :type primary_site_model: str + :param primary_site_pools: Primary site pools. + :type primary_site_pools: List[str] + :param secondary_site_app_name: Secondary site Ceph RBD mirror + app name. + :type secondary_site_app_name: str + :param secondary_site_model: Secondary site Juju model name. + :type secondary_site_model: str + :param secondary_site_pools: Secondary site pools. + :type secondary_site_pools: List[str] + """ + # Check if primary and secondary pools sizes are the same. + self.assertEqual(len(primary_site_pools), len(secondary_site_pools)) + + # Run the 'demote' Juju action against the primary site pools. + logging.info('Demoting {} from model {}.'.format( + primary_site_app_name, primary_site_model)) + result = zaza.model.run_action_on_leader( + primary_site_app_name, + 'demote', + model_name=primary_site_model, + action_params={ + 'pools': ','.join(primary_site_pools) + }) + logging.info(result.results) + self.assertEqual(int(result.results['Code']), 0) + + # Validate that the demoted pools count matches the total primary site + # pools count. + n_pools_demoted = len(result.results['output'].split('\n')) + self.assertEqual(len(primary_site_pools), n_pools_demoted) + + # At this point, both primary and secondary sites are demoted. Validate + # that the Ceph images, from both sites, report 'up+unknown', since + # there isn't a primary site at the moment. + logging.info('Waiting until {} is demoted.'.format( + primary_site_app_name)) + self.wait_for_mirror_state( + 'up+unknown', + application_name=primary_site_app_name, + model_name=primary_site_model, + pools=primary_site_pools) + self.wait_for_mirror_state( + 'up+unknown', + application_name=secondary_site_app_name, + model_name=secondary_site_model, + pools=secondary_site_pools) + + # Run the 'promote' Juju against the secondary site. 
+ logging.info('Promoting {} from model {}.'.format( + secondary_site_app_name, secondary_site_model)) + result = zaza.model.run_action_on_leader( + secondary_site_app_name, + 'promote', + model_name=secondary_site_model, + action_params={ + 'pools': ','.join(secondary_site_pools) + }) + logging.info(result.results) + self.assertEqual(int(result.results['Code']), 0) + + # Validate that the promoted pools count matches the total secondary + # site pools count. + n_pools_promoted = len(result.results['output'].split('\n')) + self.assertEqual(len(secondary_site_pools), n_pools_promoted) + + # Validate that the Ceph images from the newly promoted site + # report 'up+stopped' state (which is reported by primary Ceph images). + logging.info('Waiting until {} is promoted.'.format( + secondary_site_app_name)) + self.wait_for_mirror_state( + 'up+stopped', + application_name=secondary_site_app_name, + model_name=secondary_site_model, + pools=secondary_site_pools) + + # Validate that the Ceph images from site-a report 'up+replaying' + # (which is reported by secondary Ceph images). + self.wait_for_mirror_state( + 'up+replaying', + check_entries_behind_master=True, + application_name=primary_site_app_name, + model_name=primary_site_model, + pools=primary_site_pools) + + def test_100_cinder_failover(self): + """Validate controlled failover via the Cinder API. + + This test only makes sense if Cinder RBD mirroring mode is 'image'. + It will return early, if this is not the case. 
+ """ + cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode( + self.cinder_ceph_app_name) + if cinder_rbd_mirroring_mode != 'image': + logging.warning( + "Skipping 'test_100_cinder_failover' since Cinder RBD " + "mirroring mode is {}.".format(cinder_rbd_mirroring_mode)) + return + session = openstack.get_overcloud_keystone_session() - cinder = openstack.get_cinder_session_client(session) - volume = self.create_cinder_volume(session, from_image=True) + cinder = openstack.get_cinder_session_client(session, version=3) + + # Check if the Cinder volume host is available with replication + # enabled. + host = 'cinder@{}'.format(self.cinder_ceph_app_name) + svc = cinder.services.list(host=host, binary='cinder-volume')[0] + self.assertEqual(svc.replication_status, 'enabled') + self.assertEqual(svc.status, 'enabled') + + # Setup the test Cinder volume + volume = self.setup_test_cinder_volume() + + # Check if the volume is properly mirrored self.wait_for_mirror_state( 'up+replaying', check_entries_behind_master=True, application_name=self.application_name + self.site_b_app_suffix, model_name=self.site_b_model, - pools=['cinder-ceph']) - self.failover_cinder_volume_host( - cinder_client=cinder) + pools=[self.cinder_ceph_app_name]) + + # Execute the Cinder volume failover + openstack.failover_cinder_volume_host( + cinder=cinder, + backend_name=self.cinder_ceph_app_name, + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over') + + # Check if the test volume is still available after failover self.assertEqual(cinder.volumes.get(volume.id).status, 'available') - self.failover_cinder_volume_host( - cinder_client=cinder, + + def test_101_cinder_failback(self): + """Validate controlled failback via the Cinder API. + + This test only makes sense if Cinder RBD mirroring mode is 'image'. + It will return early, if this is not the case. 
+
+        The test needs to be executed when the Cinder volume host is already
+        failed-over with the test volume on it.
+        """
+        cinder_rbd_mirroring_mode = self.get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if cinder_rbd_mirroring_mode != 'image':
+            logging.warning(
+                "Skipping 'test_101_cinder_failback' since Cinder RBD "
+                "mirroring mode is {}.".format(cinder_rbd_mirroring_mode))
+            return
+
+        session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(session, version=3)
+
+        # Check if the Cinder volume host is already failed-over
+        host = 'cinder@{}'.format(self.cinder_ceph_app_name)
+        svc = cinder.services.list(host=host, binary='cinder-volume')[0]
+        self.assertEqual(svc.replication_status, 'failed-over')
+        self.assertEqual(svc.status, 'disabled')
+
+        # Check if the test Cinder volume is already present. The method
+        # 'cinder.volumes.find' raises 404 if the volume is not found.
+        volume = cinder.volumes.find(name=self.test_cinder_volume_name)
+
+        # Execute the Cinder volume failback
+        openstack.failover_cinder_volume_host(
+            cinder=cinder,
+            backend_name=self.cinder_ceph_app_name,
             target_backend_id='default',
             target_status='enabled',
             target_replication_status='enabled')
+
+        # Check if the test volume is still available after failback
         self.assertEqual(cinder.volumes.get(volume.id).status, 'available')
 
-    def test_fail_over_fall_back(self):
-        """Validate controlled fail over and fall back."""
-        site_a_pools, site_b_pools = self.get_pools()
-        site_a_action_params = {}
-        site_b_action_params = {}
-        if self.get_cinder_rbd_mirroring_mode() == 'image':
-            site_a_pools.remove('cinder-ceph')
-            site_a_action_params['pools'] = ','.join(site_a_pools)
-            site_b_pools.remove('cinder-ceph')
-            site_b_action_params['pools'] = ','.join(site_b_pools)
+    def test_200_juju_failover(self):
+        """Validate controlled failover via Juju actions."""
+        # Get the Ceph pools needed to failover
+        site_a_pools, site_b_pools = 
self.get_failover_pools() + + # Execute the failover Juju actions with the appropriate parameters. + site_b_app_name = self.application_name + self.site_b_app_suffix + self.execute_failover_juju_actions( + primary_site_app_name=self.application_name, + primary_site_model=self.site_a_model, + primary_site_pools=site_a_pools, + secondary_site_app_name=site_b_app_name, + secondary_site_model=self.site_b_model, + secondary_site_pools=site_b_pools) + + def test_201_juju_failback(self): + """Validate controlled failback via Juju actions.""" + # Get the Ceph pools needed to failback + site_a_pools, site_b_pools = self.get_failover_pools() + + # Execute the failover Juju actions with the appropriate parameters. + # The failback operation is just a failover with sites in reverse + # order. + site_b_app_name = self.application_name + self.site_b_app_suffix + self.execute_failover_juju_actions( + primary_site_app_name=site_b_app_name, + primary_site_model=self.site_b_model, + primary_site_pools=site_b_pools, + secondary_site_app_name=self.application_name, + secondary_site_model=self.site_a_model, + secondary_site_pools=site_a_pools) + + def test_203_juju_resync(self): + """Validate the 'resync-pools' Juju action. + + The 'resync-pools' Juju action is meant to flag Ceph images from the + secondary site to re-sync against the Ceph images from the primary + site. + + This use case is useful when the Ceph secondary images are out of sync. + """ + # Get the Ceph pools needed to failback + _, site_b_pools = self.get_failover_pools() + + # Run the 'resync-pools' Juju action against the pools from site-b. + # This will make sure that the Ceph images from site-b are properly + # synced with the primary images from site-a. 
+ site_b_app_name = self.application_name + self.site_b_app_suffix + logging.info('Re-syncing {} from model {}'.format( + site_b_app_name, self.site_b_model)) result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror', - 'demote', - model_name=self.site_a_model, - action_params=site_a_action_params) - logging.info(result.results) - n_pools_demoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_a_pools), n_pools_demoted) - self.wait_for_mirror_state( - 'up+unknown', - model_name=self.site_a_model, - pools=site_a_pools) - self.wait_for_mirror_state( - 'up+unknown', - application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model, - pools=site_b_pools) - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, - 'promote', - model_name=self.site_b_model, - action_params=site_b_action_params) - logging.info(result.results) - n_pools_promoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_b_pools), n_pools_promoted) - self.wait_for_mirror_state( - 'up+replaying', - model_name=self.site_a_model, - pools=site_a_pools) - self.wait_for_mirror_state( - 'up+stopped', - application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model, - pools=site_b_pools) - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, - 'demote', - model_name=self.site_b_model, - action_params=site_b_action_params) - logging.info(result.results) - n_pools_demoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_a_pools), n_pools_demoted) - self.wait_for_mirror_state( - 'up+unknown', - model_name=self.site_a_model, - pools=site_a_pools) - self.wait_for_mirror_state( - 'up+unknown', - application_name=self.application_name + self.site_b_app_suffix, - model_name=self.site_b_model, - pools=site_b_pools) - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror', - 'promote', - 
model_name=self.site_a_model, - action_params=site_a_action_params) - logging.info(result.results) - n_pools_promoted = len(result.results['output'].split('\n')) - self.assertEqual(len(site_b_pools), n_pools_promoted) - self.wait_for_mirror_state( - 'up+stopped', - model_name=self.site_a_model, - pools=site_a_pools) - action_params = { - 'i-really-mean-it': True, - } - if self.get_cinder_rbd_mirroring_mode() == 'image': - action_params['pools'] = site_b_action_params['pools'] - result = zaza.model.run_action_on_leader( - 'ceph-rbd-mirror' + self.site_b_app_suffix, + site_b_app_name, 'resync-pools', model_name=self.site_b_model, - action_params=action_params) + action_params={ + 'pools': ','.join(site_b_pools), + 'i-really-mean-it': True, + }) logging.info(result.results) + self.assertEqual(int(result.results['Code']), 0) + + # Validate that the Ceph images from site-b report 'up+replaying' + # (which is reported by secondary Ceph images). And check that images + # exist in Cinder and Glance pools. self.wait_for_mirror_state( 'up+replaying', - application_name=self.application_name + self.site_b_app_suffix, + check_entries_behind_master=True, + application_name=site_b_app_name, model_name=self.site_b_model, - require_images_in=['cinder-ceph', 'glance'], - pools=site_a_pools) - if self.get_cinder_rbd_mirroring_mode() == 'image': - self.cinder_fail_over_fall_back() + require_images_in=[self.cinder_ceph_app_name, 'glance'], + pools=site_b_pools) class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase): """Encapsulate ``ceph-rbd-mirror`` destructive tests.""" - def forced_failover_cinder_volume_host(self, cinder_client): - """Validate forced Cinder volume host fail over.""" - def apply_cinder_workaround(): - """Set minimal timeouts / retries to the Cinder Ceph backend. + def apply_cinder_ceph_workaround(self): + """Set minimal timeouts / retries to the Cinder Ceph backend. 
-            This is needed because the failover via Cinder will try to do a
-            demotion of the site-a, and with the default timeouts / retries,
-            the operation takes an unreasonably amount of time.
-            """
-            cinder_configs = {
-                'rados_connect_timeout': '1',
-                'rados_connection_retries': '1',
-                'rados_connection_interval': '0',
-                'replication_connect_timeout': '1',
-            }
-            update_cinder_conf_cmd = (
-                "import configparser; "
-                "config = configparser.ConfigParser(); "
-                "config.read('/etc/cinder/cinder.conf'); "
-                "{}"
-                "f = open('/etc/cinder/cinder.conf', 'w'); "
-                "config.write(f); "
-                "f.close()")
-            cmd = ''
-            for config in cinder_configs:
-                cmd += "config.set('cinder-ceph', '{0}', '{1}'); ".format(
-                    config, cinder_configs[config])
-            cmd = update_cinder_conf_cmd.format(cmd)
-            zaza.model.run_on_leader(
-                'cinder-ceph',
-                'python3 -c "{}"; systemctl restart cinder-volume'.format(cmd))
+        This is needed because the failover via Cinder API will try to do a
+        demotion of the site-a. However, when site-a is down, and with the
+        default timeouts / retries, the operation takes an unreasonable amount
+        of time (or sometimes it never finishes).
+        """
+        # These new config options need to be set under the Cinder Ceph backend
+        # section in the main Cinder config file.
+        # At the moment, we don't have the possibility of using Juju config
+        # to set these options. And also, it's not even a good practice to
+        # have them in production.
+        # These should be set only to do the Ceph failover via Cinder API, and
+        # they need to be removed after.
+        configs = {
+            'rados_connect_timeout': '1',
+            'rados_connection_retries': '1',
+            'rados_connection_interval': '0',
+            'replication_connect_timeout': '1',
+        }
 
-        apply_cinder_workaround()
-        self.failover_cinder_volume_host(cinder_client)
+        # Small Python script that will be executed via Juju run to update
+        # the Cinder config file.
+ update_cinder_conf_script = ( + "import configparser; " + "config = configparser.ConfigParser(); " + "config.read('/etc/cinder/cinder.conf'); " + "{}" + "f = open('/etc/cinder/cinder.conf', 'w'); " + "config.write(f); " + "f.close()") + set_cmd = '' + for cfg_name in configs: + set_cmd += "config.set('{0}', '{1}', '{2}'); ".format( + self.cinder_ceph_app_name, cfg_name, configs[cfg_name]) + script = update_cinder_conf_script.format(set_cmd) - for volume in cinder_client.volumes.list(): - self.assertEqual(volume.status, 'available') + # Run the workaround script via Juju run + zaza.model.run_on_leader( + self.cinder_ceph_app_name, + 'python3 -c "{}"; systemctl restart cinder-volume'.format(script)) - def test_kill_site_a_fail_over(self): - """Validate fail over after uncontrolled shutdown of primary.""" - action_params = {} - if self.get_cinder_rbd_mirroring_mode() == 'image': - _, site_b_pools = self.get_pools() - site_b_pools.remove('cinder-ceph') - action_params['pools'] = ','.join(site_b_pools) - - for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd': + def kill_primary_site(self): + """Simulate an unexpected primary site shutdown.""" + logging.info('Killing the Ceph primary site') + for application in ['ceph-rbd-mirror', 'ceph-mon', 'ceph-osd']: zaza.model.remove_application( application, model_name=self.site_a_model, forcefully_remove_machines=True) + + def test_100_forced_juju_failover(self): + """Validate Ceph failover via Juju when the primary site is down. + + * Kill the primary site + * Execute the forced failover via Juju actions + """ + # Get the site-b Ceph pools that need to be promoted + _, site_b_pools = self.get_failover_pools() + site_b_app_name = self.application_name + self.site_b_app_suffix + + # Simulate primary site unexpected shutdown + self.kill_primary_site() + + # Try and promote the site-b to primary. 
         result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            site_b_app_name,
             'promote',
             model_name=self.site_b_model,
-            action_params=action_params)
+            action_params={
+                'pools': ','.join(site_b_pools),
+            })
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # The site-b 'promote' Juju action is expected to fail, because the
+        # primary site is down.
         self.assertEqual(result.status, 'failed')
-        action_params['force'] = True
+
+        # Retry to promote site-b using the 'force' Juju action parameter.
         result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            site_b_app_name,
             'promote',
             model_name=self.site_b_model,
-            action_params=action_params)
+            action_params={
+                'force': True,
+                'pools': ','.join(site_b_pools),
+            })
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # Validate successful Juju action execution
         self.assertEqual(result.status, 'completed')
-        if self.get_cinder_rbd_mirroring_mode() == 'image':
-            session = openstack.get_overcloud_keystone_session()
-            cinder = openstack.get_cinder_session_client(session)
-            self.forced_failover_cinder_volume_host(cinder)
+
+    def test_200_forced_cinder_failover(self):
+        """Validate Ceph failover via Cinder when the primary site is down.
+
+        This test only makes sense if Cinder RBD mirroring mode is 'image'.
+        It will return early, if this is not the case.
+
+        This assumes that the primary site is already killed.
+        """
+        cinder_rbd_mirroring_mode = self.get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if cinder_rbd_mirroring_mode != 'image':
+            logging.warning(
+                "Skipping 'test_200_forced_cinder_failover' since "
+                "Cinder RBD mirroring mode is {}.".format(
+                    cinder_rbd_mirroring_mode))
+            return
+
+        # Make sure that the Cinder Ceph backend workaround is applied.
+ self.apply_cinder_ceph_workaround() + + session = openstack.get_overcloud_keystone_session() + cinder = openstack.get_cinder_session_client(session, version=3) + openstack.failover_cinder_volume_host( + cinder=cinder, + backend_name=self.cinder_ceph_app_name, + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over') + + # Check that the Cinder volumes are still available after forced + # failover. + for volume in cinder.volumes.list(): + self.assertEqual(volume.status, 'available') diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index d277656..4ed75b9 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -2489,6 +2489,40 @@ def attach_volume(nova, volume_id, instance_id): device='/dev/vdx') +def failover_cinder_volume_host(cinder, backend_name='cinder-ceph', + target_backend_id='ceph', + target_status='disabled', + target_replication_status='failed-over'): + """Failover Cinder volume host with replication enabled. + + :param cinder: Authenticated cinderclient + :type cinder: cinder.Client + :param backend_name: Cinder volume backend name with + replication enabled. + :type backend_name: str + :param target_backend_id: Failover target Cinder backend id. + :type target_backend_id: str + :param target_status: Target Cinder volume status after failover. + :type target_status: str + :param target_replication_status: Target Cinder volume replication + status after failover. 
+ :type target_replication_status: str + :raises: AssertionError + """ + host = 'cinder@{}'.format(backend_name) + logging.info('Failover Cinder volume host %s to backend_id %s', + host, target_backend_id) + cinder.services.failover_host(host=host, backend_id=target_backend_id) + for attempt in tenacity.Retrying( + retry=tenacity.retry_if_exception_type(AssertionError), + stop=tenacity.stop_after_attempt(10), + wait=tenacity.wait_exponential(multiplier=1, min=2, max=10)): + with attempt: + svc = cinder.services.list(host=host, binary='cinder-volume')[0] + assert svc.status == target_status + assert svc.replication_status == target_replication_status + + def create_volume_backup(cinder, volume_id, name=None): """Create cinder volume backup.