Merge pull request #486 from ionutbalutoiu/update-ceph-rbd-mirror-tests

Update Ceph RBD Mirror tests
2021-03-12 09:19:54 +01:00
parent 0cbe7fe563 2fefca5a40
commit c9d8800034
2 changed files with 579 additions and 97 deletions
@@ -17,6 +17,8 @@ import json
 import logging
 import re

+import cinderclient.exceptions as cinder_exceptions
+
 import zaza.openstack.charm_tests.test_utils as test_utils

 import zaza.model
@@ -28,6 +30,129 @@ from zaza.openstack.charm_tests.glance.setup import (
    CIRROS_IMAGE_NAME)


+DEFAULT_CINDER_RBD_MIRRORING_MODE = 'pool'
+
+
+def get_cinder_rbd_mirroring_mode(cinder_ceph_app_name='cinder-ceph'):
+    """Get the RBD mirroring mode for the Cinder Ceph pool.
+
+    The mode is read from the 'rbd-mirroring-mode' option of the given Juju
+    application. When the option is missing, or its value is unset or empty,
+    ``DEFAULT_CINDER_RBD_MIRRORING_MODE`` is used instead.
+
+    :param cinder_ceph_app_name: Cinder Ceph Juju application name.
+    :type cinder_ceph_app_name: str
+    :returns: A lower-cased string representing the RBD mirroring mode. It
+              can be either 'pool' or 'image'.
+    :rtype: str
+    """
+    app_config = zaza.model.get_application_config(cinder_ceph_app_name)
+    option = app_config.get('rbd-mirroring-mode') or {}
+    # The option entry may have no 'value' key or an explicit None value;
+    # both must fall back to the default instead of failing on ``.lower()``.
+    mode = option.get('value') or DEFAULT_CINDER_RBD_MIRRORING_MODE
+    return mode.lower()
+
+
+def get_glance_image(glance):
+    """Return the Glance image object to be used by the Ceph tests.
+
+    The Cirros image is preferred. When no image with that name is found,
+    the first image matching the Ubuntu LTS image name is returned instead.
+
+    :param glance: Authenticated glanceclient
+    :type glance: glanceclient.Client
+    :returns: Glance image object
+    :rtype: glanceclient.image
+    """
+    cirros_images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
+    if not cirros_images:
+        logging.info("Failed to find {} image, falling back to {}".format(
+            CIRROS_IMAGE_NAME,
+            LTS_IMAGE_NAME))
+        lts_images = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)
+        return lts_images[0]
+    return cirros_images[0]
+
+
+def setup_cinder_repl_volume_type(cinder, type_name='repl',
+                                  backend_name='cinder-ceph'):
+    """Ensure the replicated Cinder volume type exists and is configured.
+
+    The volume type is looked up by name and created when it is not found.
+    Its extra specs are then (re)set to point at the given backend with
+    replication enabled.
+
+    :param cinder: Authenticated cinderclient
+    :type cinder: cinder.Client
+    :param type_name: Cinder volume type name
+    :type type_name: str
+    :param backend_name: Cinder volume backend name with replication enabled.
+    :type backend_name: str
+    :returns: Cinder volume type object
+    :rtype: cinderclient.VolumeType
+    """
+    replication_keys = {
+        'volume_backend_name': backend_name,
+        'replication_enabled': '<is> True',
+    }
+    try:
+        repl_type = cinder.volume_types.find(name=type_name)
+    except cinder_exceptions.NotFound:
+        repl_type = cinder.volume_types.create(type_name)
+
+    repl_type.set_keys(metadata=replication_keys)
+    return repl_type
+
+
+# TODO: This function should be incorporated into
+# 'zaza.openstack.utilities.openstack.create_volume' helper, once the below
+# flakiness comments are addressed.
+def create_cinder_volume(cinder, name='zaza', image_id=None, type_id=None):
+    """Create a new Cinder volume of size 8 and wait for it to be ready.
+
+    The volume is created and then checked with
+    ``openstack.resource_reaches_status``. When that check fails, the volume
+    is deleted and the creation is attempted again, up to 20 times in total.
+
+    :param cinder: Authenticated cinderclient.
+    :type cinder: cinder.Client
+    :param name: Volume name.
+    :type name: str
+    :param image_id: Glance image id, if the volume is created from image.
+    :type image_id: str
+    :param type_id: Cinder Volume type id, if the volume needs to use an
+                    explicit volume type.
+    :type type_id: str
+    :returns: Cinder volume, or None when every attempt failed.
+    :rtype: :class:`Volume`.
+    """
+    volume_params = {
+        'size': 8,
+        'name': name,
+    }
+    if image_id:
+        volume_params['imageRef'] = image_id
+    if type_id:
+        volume_params['volume_type'] = type_id
+
+    # NOTE(fnordahl): for some reason create volume from image often fails
+    # when run just after deployment is finished.  We should figure out
+    # why, resolve the underlying issue and then remove this.
+    #
+    # We do not use tenacity here as it will interfere with tenacity used
+    # in ``resource_reaches_status``
+    for _ in range(20):
+        volume = cinder.volumes.create(**volume_params)
+        try:
+            # Note(coreycb): stop_after_attempt is increased because using
+            # juju storage for ceph-osd backed by cinder on undercloud
+            # takes longer than the prior method of directory-backed OSD
+            # devices.
+            openstack.resource_reaches_status(
+                cinder.volumes, volume.id, msg='volume',
+                stop_after_attempt=20)
+        except AssertionError:
+            logging.info('retrying')
+            volume.delete()
+        else:
+            return volume
+    return None
+
+
 class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
    """Base class for ``ceph-rbd-mirror`` tests."""

@@ -35,20 +160,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
    def setUpClass(cls):
        """Run setup for ``ceph-rbd-mirror`` tests."""
        super().setUpClass()
+        cls.cinder_ceph_app_name = 'cinder-ceph'
+        cls.test_cinder_volume_name = 'test-cinder-ceph-volume'
        # get ready for multi-model Zaza
        cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model()
        cls.site_b_app_suffix = '-b'

-    def run_status_action(self, application_name=None, model_name=None):
+    def run_status_action(self, application_name=None, model_name=None,
+                          pools=[]):
        """Run status action, decode and return response."""
+        action_params = {
+            'verbose': True,
+            'format': 'json',
+        }
+        if len(pools) > 0:
+            action_params['pools'] = ','.join(pools)
        result = zaza.model.run_action_on_leader(
            application_name or self.application_name,
            'status',
            model_name=model_name,
-            action_params={
-                'verbose': True,
-                'format': 'json',
-            })
+            action_params=action_params)
        return json.loads(result.results['output'])

    def get_pools(self):
@@ -68,10 +199,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
            model_name=self.site_b_model)
        return sorted(site_a_pools.keys()), sorted(site_b_pools.keys())

+    def get_failover_pools(self):
+        """Return the names of the Ceph pools to fail over, for both sites.
+
+        When the Cinder RBD mirroring mode is 'image', the 'cinder-ceph'
+        pool is dropped from both lists, since Cinder orchestrates the
+        failover of that pool itself.
+
+        :returns: Tuple with site-a pools and site-b pools.
+        :rtype: Tuple[List[str], List[str]]
+        """
+        site_a_pools, site_b_pools = self.get_pools()
+        mirroring_mode = get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if mirroring_mode != 'image':
+            return site_a_pools, site_b_pools
+
+        for site_pools in (site_a_pools, site_b_pools):
+            site_pools.remove(self.cinder_ceph_app_name)
+        return site_a_pools, site_b_pools
+
    def wait_for_mirror_state(self, state, application_name=None,
                              model_name=None,
                              check_entries_behind_master=False,
-                              require_images_in=[]):
+                              require_images_in=[],
+                              pools=[]):
        """Wait until all images reach requested state.

        This function runs the ``status`` action and examines the data it
@@ -90,6 +237,9 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
        :type check_entries_behind_master: bool
        :param require_images_in: List of pools to require images in
        :type require_images_in: list of str
+        :param pools: List of pools to run status on. If this is empty, the
+                      status action will run on all the pools.
+        :type pools: list of str
        :returns: True on success, never returns on failure
        """
        rep = re.compile(r'.*entries_behind_master=(\d+)')
@@ -97,7 +247,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
            try:
                # encapsulate in try except to work around LP: #1820976
                pool_status = self.run_status_action(
-                    application_name=application_name, model_name=model_name)
+                    application_name=application_name, model_name=model_name,
+                    pools=pools)
            except KeyError:
                continue
            for pool, status in pool_status.items():
@@ -124,6 +275,41 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
                # all images with state has expected state
                return True

+    def setup_test_cinder_volume(self):
+        """Return the test Cinder volume, creating it when it is missing.
+
+        An existing volume named ``self.test_cinder_volume_name`` is returned
+        as is. Otherwise a new volume is created from the Glance test image.
+
+        If the Cinder RBD mirroring mode is 'image', the new volume uses an
+        explicit volume type with the appropriate replication flags.
+        Otherwise, no volume type is passed and it is a plain Cinder volume.
+
+        :returns: Cinder volume
+        :rtype: :class:`Volume`.
+        """
+        session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(session, version=3)
+
+        try:
+            return cinder.volumes.find(name=self.test_cinder_volume_name)
+        except cinder_exceptions.NotFound:
+            logging.info("Test Cinder volume doesn't exist. Creating it")
+
+        glance = openstack.get_glance_session_client(session)
+        image = get_glance_image(glance)
+
+        # None matches the default of ``create_cinder_volume``, i.e. no
+        # explicit volume type is requested.
+        type_id = None
+        mirroring_mode = get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if mirroring_mode == 'image':
+            repl_type = setup_cinder_repl_volume_type(
+                cinder,
+                backend_name=self.cinder_ceph_app_name)
+            type_id = repl_type.id
+
+        return create_cinder_volume(
+            cinder=cinder,
+            name=self.test_cinder_volume_name,
+            image_id=image.id,
+            type_id=type_id)
+

 class CephRBDMirrorTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` tests."""
@@ -195,44 +381,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
        site B and subsequently comparing the contents we get a full end to end
        test.
        """
-        session = openstack.get_overcloud_keystone_session()
-        glance = openstack.get_glance_session_client(session)
-        cinder = openstack.get_cinder_session_client(session)
-
-        images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
-        if images:
-            image = images[0]
-        else:
-            logging.info("Failed to find {} image, falling back to {}".format(
-                CIRROS_IMAGE_NAME,
-                LTS_IMAGE_NAME))
-            image = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0]
-
-        # NOTE(fnordahl): for some reason create volume from image often fails
-        # when run just after deployment is finished.  We should figure out
-        # why, resolve the underlying issue and then remove this.
-        #
-        # We do not use tenacity here as it will interfere with tenacity used
-        # in ``resource_reaches_status``
-        def create_volume_from_image(cinder, image, retry=20):
-            if retry < 1:
-                return
-            volume = cinder.volumes.create(8, name='zaza', imageRef=image.id)
-            try:
-                # Note(coreycb): stop_after_attempt is increased because using
-                # juju storage for ceph-osd backed by cinder on undercloud
-                # takes longer than the prior method of directory-backed OSD
-                # devices.
-                openstack.resource_reaches_status(
-                    cinder.volumes, volume.id, msg='volume',
-                    stop_after_attempt=20)
-                return volume
-            except AssertionError:
-                logging.info('retrying')
-                volume.delete()
-                return create_volume_from_image(cinder, image, retry=retry - 1)
-        volume = create_volume_from_image(cinder, image)
-
+        volume = self.setup_test_cinder_volume()
        site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash(
            zaza.model.get_lead_unit_name('ceph-mon',
                                          model_name=self.site_a_model),
@@ -244,6 +393,8 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
            check_entries_behind_master=True,
            application_name=self.application_name + self.site_b_app_suffix,
            model_name=self.site_b_model)
+        logging.info('Checking the Ceph RBD hashes of the primary and '
+                     'the secondary Ceph images')
        site_b_hash = zaza.openstack.utilities.ceph.get_rbd_hash(
            zaza.model.get_lead_unit_name('ceph-mon' + self.site_b_app_suffix,
                                          model_name=self.site_b_model),
@@ -258,102 +409,399 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
 class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` controlled failover tests."""

-    def test_fail_over_fall_back(self):
-        """Validate controlled fail over and fall back."""
-        site_a_pools, site_b_pools = self.get_pools()
+    def execute_failover_juju_actions(self,
+                                      primary_site_app_name,
+                                      primary_site_model,
+                                      primary_site_pools,
+                                      secondary_site_app_name,
+                                      secondary_site_model,
+                                      secondary_site_pools):
+        """Execute the failover Juju actions.
+
+        The failover / failback via Juju actions shares the same workflow. The
+        failback is just a failover with sites in reversed order.
+
+        This function encapsulates the tasks to failover a primary site to
+        a secondary site:
+        1. Demote primary site
+        2. Validation of the primary site demotion
+        3. Promote secondary site
+        4. Validation of the secondary site promotion
+
+        :param primary_site_app_name: Primary site Ceph RBD mirror app name.
+        :type primary_site_app_name: str
+        :param primary_site_model: Primary site Juju model name.
+        :type primary_site_model: str
+        :param primary_site_pools: Primary site pools.
+        :type primary_site_pools: List[str]
+        :param secondary_site_app_name: Secondary site Ceph RBD mirror
+                                        app name.
+        :type secondary_site_app_name: str
+        :param secondary_site_model: Secondary site Juju model name.
+        :type secondary_site_model: str
+        :param secondary_site_pools: Secondary site pools.
+        :type secondary_site_pools: List[str]
+        """
+        # Check if primary and secondary pools sizes are the same.
+        self.assertEqual(len(primary_site_pools), len(secondary_site_pools))
+
+        # Run the 'demote' Juju action against the primary site pools.
+        logging.info('Demoting {} from model {}.'.format(
+            primary_site_app_name, primary_site_model))
        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror',
+            primary_site_app_name,
            'demote',
-            model_name=self.site_a_model,
-            action_params={})
+            model_name=primary_site_model,
+            action_params={
+                'pools': ','.join(primary_site_pools)
+            })
        logging.info(result.results)
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # Validate that the demoted pools count matches the total primary site
+        # pools count.
        n_pools_demoted = len(result.results['output'].split('\n'))
-        self.assertEqual(len(site_a_pools), n_pools_demoted)
-        self.wait_for_mirror_state('up+unknown', model_name=self.site_a_model)
+        self.assertEqual(len(primary_site_pools), n_pools_demoted)
+
+        # At this point, both primary and secondary sites are demoted. Validate
+        # that the Ceph images, from both sites, report 'up+unknown', since
+        # there isn't a primary site at the moment.
+        logging.info('Waiting until {} is demoted.'.format(
+            primary_site_app_name))
        self.wait_for_mirror_state(
            'up+unknown',
-            application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
+            application_name=primary_site_app_name,
+            model_name=primary_site_model,
+            pools=primary_site_pools)
+        self.wait_for_mirror_state(
+            'up+unknown',
+            application_name=secondary_site_app_name,
+            model_name=secondary_site_model,
+            pools=secondary_site_pools)
+
+        # Run the 'promote' Juju against the secondary site.
+        logging.info('Promoting {} from model {}.'.format(
+            secondary_site_app_name, secondary_site_model))
        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            secondary_site_app_name,
            'promote',
-            model_name=self.site_b_model,
-            action_params={})
+            model_name=secondary_site_model,
+            action_params={
+                'pools': ','.join(secondary_site_pools)
+            })
        logging.info(result.results)
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # Validate that the promoted pools count matches the total secondary
+        # site pools count.
        n_pools_promoted = len(result.results['output'].split('\n'))
-        self.assertEqual(len(site_b_pools), n_pools_promoted)
+        self.assertEqual(len(secondary_site_pools), n_pools_promoted)
+
+        # Validate that the Ceph images from the newly promoted site
+        # report 'up+stopped' state (which is reported by primary Ceph images).
+        logging.info('Waiting until {} is promoted.'.format(
+            secondary_site_app_name))
+        self.wait_for_mirror_state(
+            'up+stopped',
+            application_name=secondary_site_app_name,
+            model_name=secondary_site_model,
+            pools=secondary_site_pools)
+
+        # Validate that the Ceph images from site-a report 'up+replaying'
+        # (which is reported by secondary Ceph images).
        self.wait_for_mirror_state(
            'up+replaying',
-            model_name=self.site_a_model)
+            check_entries_behind_master=True,
+            application_name=primary_site_app_name,
+            model_name=primary_site_model,
+            pools=primary_site_pools)
+
+    def test_100_cinder_failover(self):
+        """Validate controlled failover via the Cinder API.
+
+        This test only makes sense if Cinder RBD mirroring mode is 'image'.
+        It will return early, if this is not the case.
+        """
+        cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if cinder_rbd_mirroring_mode != 'image':
+            logging.warning(
+                "Skipping 'test_100_cinder_failover' since Cinder RBD "
+                "mirroring mode is {}.".format(cinder_rbd_mirroring_mode))
+            return
+
+        session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(session, version=3)
+
+        # Check if the Cinder volume host is available with replication
+        # enabled.
+        host = 'cinder@{}'.format(self.cinder_ceph_app_name)
+        svc = cinder.services.list(host=host, binary='cinder-volume')[0]
+        self.assertEqual(svc.replication_status, 'enabled')
+        self.assertEqual(svc.status, 'enabled')
+
+        # Setup the test Cinder volume
+        volume = self.setup_test_cinder_volume()
+
+        # Check if the volume is properly mirrored
        self.wait_for_mirror_state(
-            'up+stopped',
+            'up+replaying',
+            check_entries_behind_master=True,
            application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
-        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
-            'demote',
            model_name=self.site_b_model,
-            action_params={
-            })
-        logging.info(result.results)
-        n_pools_demoted = len(result.results['output'].split('\n'))
-        self.assertEqual(len(site_a_pools), n_pools_demoted)
-        self.wait_for_mirror_state(
-            'up+unknown',
-            model_name=self.site_a_model)
-        self.wait_for_mirror_state(
-            'up+unknown',
-            application_name=self.application_name + self.site_b_app_suffix,
-            model_name=self.site_b_model)
+            pools=[self.cinder_ceph_app_name])
+
+        # Execute the Cinder volume failover
+        openstack.failover_cinder_volume_host(
+            cinder=cinder,
+            backend_name=self.cinder_ceph_app_name,
+            target_backend_id='ceph',
+            target_status='disabled',
+            target_replication_status='failed-over')
+
+        # Check if the test volume is still available after failover
+        self.assertEqual(cinder.volumes.get(volume.id).status, 'available')
+
+    def test_101_cinder_failback(self):
+        """Validate controlled failback via the Cinder API.
+
+        This test only makes sense if Cinder RBD mirroring mode is 'image'.
+        It logs a warning and returns early, if this is not the case.
+
+        The test needs to be executed when the Cinder volume host is already
+        failed-over with the test volume on it.
+        """
+        mirroring_mode = get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if mirroring_mode != 'image':
+            logging.warning(
+                "Skipping 'test_101_cinder_failback' since Cinder RBD "
+                "mirroring mode is {}.".format(mirroring_mode))
+            return
+
+        keystone_session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(
+            keystone_session, version=3)
+
+        # The Cinder volume host must already be in the failed-over state.
+        host = 'cinder@{}'.format(self.cinder_ceph_app_name)
+        services = cinder.services.list(host=host, binary='cinder-volume')
+        volume_service = services[0]
+        self.assertEqual(volume_service.replication_status, 'failed-over')
+        self.assertEqual(volume_service.status, 'disabled')
+
+        # The test Cinder volume must already exist; 'cinder.volumes.find'
+        # raises a NotFound error otherwise, which fails the test.
+        test_volume = cinder.volumes.find(name=self.test_cinder_volume_name)
+
+        # Fail back to the default backend.
+        openstack.failover_cinder_volume_host(
+            cinder=cinder,
+            backend_name=self.cinder_ceph_app_name,
+            target_backend_id='default',
+            target_status='enabled',
+            target_replication_status='enabled')
+
+        # The test volume must still be available after the failback.
+        refreshed_volume = cinder.volumes.get(test_volume.id)
+        self.assertEqual(refreshed_volume.status, 'available')
+
+    def test_200_juju_failover(self):
+        """Validate controlled failover via Juju actions."""
+        # Pools that take part in the failover, per site.
+        pools_a, pools_b = self.get_failover_pools()
+
+        # Site-a is the primary being demoted, site-b the secondary being
+        # promoted.
+        failover_params = {
+            'primary_site_app_name': self.application_name,
+            'primary_site_model': self.site_a_model,
+            'primary_site_pools': pools_a,
+            'secondary_site_app_name': (
+                self.application_name + self.site_b_app_suffix),
+            'secondary_site_model': self.site_b_model,
+            'secondary_site_pools': pools_b,
+        }
+        self.execute_failover_juju_actions(**failover_params)
+
+    def test_201_juju_failback(self):
+        """Validate controlled failback via Juju actions."""
+        # Pools that take part in the failback, per site.
+        pools_a, pools_b = self.get_failover_pools()
+
+        # A failback is a failover with the sites swapped: site-b is now the
+        # primary being demoted, and site-a the secondary being promoted.
+        failback_params = {
+            'primary_site_app_name': (
+                self.application_name + self.site_b_app_suffix),
+            'primary_site_model': self.site_b_model,
+            'primary_site_pools': pools_b,
+            'secondary_site_app_name': self.application_name,
+            'secondary_site_model': self.site_a_model,
+            'secondary_site_pools': pools_a,
+        }
+        self.execute_failover_juju_actions(**failback_params)
+
+    def test_203_juju_resync(self):
+        """Validate the 'resync-pools' Juju action.
+
+        The 'resync-pools' Juju action is meant to flag Ceph images from the
+        secondary site to re-sync against the Ceph images from the primary
+        site.
+
+        This use case is useful when the Ceph secondary images are out of sync.
+        """
+        # Get the Ceph pools needed to failback
+        _, site_b_pools = self.get_failover_pools()
+
+        # Run the 'resync-pools' Juju action against the pools from site-b.
+        # This will make sure that the Ceph images from site-b are properly
+        # synced with the primary images from site-a.
+        site_b_app_name = self.application_name + self.site_b_app_suffix
+        logging.info('Re-syncing {} from model {}'.format(
+            site_b_app_name, self.site_b_model))
        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror',
-            'promote',
-            model_name=self.site_a_model,
-            action_params={
-            })
-        logging.info(result.results)
-        n_pools_promoted = len(result.results['output'].split('\n'))
-        self.assertEqual(len(site_b_pools), n_pools_promoted)
-        self.wait_for_mirror_state(
-            'up+stopped',
-            model_name=self.site_a_model)
-        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            site_b_app_name,
            'resync-pools',
            model_name=self.site_b_model,
            action_params={
+                'pools': ','.join(site_b_pools),
                'i-really-mean-it': True,
            })
        logging.info(result.results)
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # Validate that the Ceph images from site-b report 'up+replaying'
+        # (which is reported by secondary Ceph images). And check that images
+        # exist in Cinder and Glance pools.
        self.wait_for_mirror_state(
            'up+replaying',
-            application_name=self.application_name + self.site_b_app_suffix,
+            check_entries_behind_master=True,
+            application_name=site_b_app_name,
            model_name=self.site_b_model,
-            require_images_in=['cinder-ceph', 'glance'])
+            require_images_in=[self.cinder_ceph_app_name, 'glance'],
+            pools=site_b_pools)


 class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` destructive tests."""

-    def test_kill_site_a_fail_over(self):
-        """Validate fail over after uncontrolled shutdown of primary."""
-        for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd':
+    def apply_cinder_ceph_workaround(self):
+        """Set minimal timeouts / retries on the Cinder Ceph backend.
+
+        This is needed because the failover via Cinder API will try to do a
+        demotion of the site-a. However, when site-a is down, and with the
+        default timeouts / retries, the operation takes an unreasonable
+        amount of time (or sometimes it never finishes).
+
+        The options are written into the backend section of
+        '/etc/cinder/cinder.conf' on the leader unit of the Cinder Ceph
+        application, and 'cinder-volume' is restarted afterwards.
+        """
+        # These config options need to be set under the Cinder Ceph backend
+        # section in the main Cinder config file.
+        # At the moment, we don't have the possibility of using Juju config
+        # to set these options. And also, it's not even a good practice to
+        # have them in production.
+        # These should be set only to do the Ceph failover via Cinder API,
+        # and they need to be removed after.
+        configs = {
+            'rados_connect_timeout': '1',
+            'rados_connection_retries': '1',
+            'rados_connection_interval': '0',
+            'replication_connect_timeout': '1',
+        }
+
+        # One 'config.set(...)' statement per option, all targeting the
+        # section named after the Cinder Ceph application.
+        set_cmd = ''.join(
+            "config.set('{0}', '{1}', '{2}'); ".format(
+                self.cinder_ceph_app_name, option, value)
+            for option, value in configs.items())
+
+        # Small Python script that will be executed via Juju run to update
+        # the Cinder config file. It has to stay a ';'-separated one-liner
+        # since it is passed to 'python3 -c'.
+        update_cinder_conf_script = (
+            "import configparser; "
+            "config = configparser.ConfigParser(); "
+            "config.read('/etc/cinder/cinder.conf'); "
+            "{}"
+            "f = open('/etc/cinder/cinder.conf', 'w'); "
+            "config.write(f); "
+            "f.close()")
+        script = update_cinder_conf_script.format(set_cmd)
+
+        # Run the workaround script via Juju run
+        command = 'python3 -c "{}"; systemctl restart cinder-volume'.format(
+            script)
+        zaza.model.run_on_leader(self.cinder_ceph_app_name, command)
+
+    def kill_primary_site(self):
+        """Simulate an unexpected primary site shutdown."""
+        logging.info('Killing the Ceph primary site')
+        for application in ['ceph-rbd-mirror', 'ceph-mon', 'ceph-osd']:
            zaza.model.remove_application(
                application,
                model_name=self.site_a_model,
                forcefully_remove_machines=True)
+
+    def test_100_forced_juju_failover(self):
+        """Validate Ceph failover via Juju when the primary site is down.
+
+        * Kill the primary site
+        * Execute the forced failover via Juju actions
+        """
+        # Get the site-b Ceph pools that need to be promoted
+        _, site_b_pools = self.get_failover_pools()
+        site_b_app_name = self.application_name + self.site_b_app_suffix
+
+        # Simulate primary site unexpected shutdown
+        self.kill_primary_site()
+
+        # Try and promote the site-b to primary.
        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            site_b_app_name,
            'promote',
            model_name=self.site_b_model,
            action_params={
+                'pools': ','.join(site_b_pools),
            })
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # The site-b 'promote' Juju action is expected to fail, because the
+        # primary site is down.
        self.assertEqual(result.status, 'failed')
+
+        # Retry to promote site-b using the 'force' Juju action parameter.
        result = zaza.model.run_action_on_leader(
-            'ceph-rbd-mirror' + self.site_b_app_suffix,
+            site_b_app_name,
            'promote',
            model_name=self.site_b_model,
            action_params={
                'force': True,
+                'pools': ','.join(site_b_pools),
            })
+        self.assertEqual(int(result.results['Code']), 0)
+
+        # Validate successful Juju action execution
        self.assertEqual(result.status, 'completed')
+
+    def test_200_forced_cinder_failover(self):
+        """Validate Ceph failover via Cinder when the primary site is down.
+
+        This test only makes sense if Cinder RBD mirroring mode is 'image'.
+        It logs a warning and returns early, if this is not the case.
+
+        This assumes that the primary site is already killed.
+        """
+        cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
+            self.cinder_ceph_app_name)
+        if cinder_rbd_mirroring_mode != 'image':
+            # The message must name this very test, so that the skip can be
+            # matched against the test run output.
+            logging.warning(
+                "Skipping 'test_200_forced_cinder_failover' "
+                "since Cinder RBD mirroring mode is {}.".format(
+                    cinder_rbd_mirroring_mode))
+            return
+
+        # Make sure that the Cinder Ceph backend workaround is applied.
+        self.apply_cinder_ceph_workaround()
+
+        session = openstack.get_overcloud_keystone_session()
+        cinder = openstack.get_cinder_session_client(session, version=3)
+        openstack.failover_cinder_volume_host(
+            cinder=cinder,
+            backend_name=self.cinder_ceph_app_name,
+            target_backend_id='ceph',
+            target_status='disabled',
+            target_replication_status='failed-over')
+
+        # Check that the Cinder volumes are still available after forced
+        # failover.
+        for volume in cinder.volumes.list():
+            self.assertEqual(volume.status, 'available')
@@ -2489,6 +2489,40 @@ def attach_volume(nova, volume_id, instance_id):
                                             device='/dev/vdx')


+def failover_cinder_volume_host(cinder, backend_name='cinder-ceph',
+                                target_backend_id='ceph',
+                                target_status='disabled',
+                                target_replication_status='failed-over'):
+    """Failover Cinder volume host with replication enabled.
+
+    After requesting the failover, the 'cinder-volume' service of the host
+    is polled (up to 10 attempts, with exponential back-off) until it
+    reports the expected status and replication status.
+
+    :param cinder: Authenticated cinderclient
+    :type cinder: cinder.Client
+    :param backend_name: Cinder volume backend name with
+                         replication enabled.
+    :type backend_name: str
+    :param target_backend_id: Failover target Cinder backend id.
+    :type target_backend_id: str
+    :param target_status: Expected status of the Cinder volume service
+                          after failover.
+    :type target_status: str
+    :param target_replication_status: Expected replication status of the
+                                      Cinder volume service after failover.
+    :type target_replication_status: str
+    :raises: AssertionError if the service does not reach the expected
+             state within the allowed attempts.
+    """
+    host = 'cinder@{}'.format(backend_name)
+    logging.info('Failover Cinder volume host %s to backend_id %s',
+                 host, target_backend_id)
+    cinder.services.failover_host(host=host, backend_id=target_backend_id)
+    # reraise=True makes the last AssertionError propagate once the attempts
+    # are exhausted, instead of tenacity's own RetryError wrapper.
+    for attempt in tenacity.Retrying(
+            retry=tenacity.retry_if_exception_type(AssertionError),
+            stop=tenacity.stop_after_attempt(10),
+            wait=tenacity.wait_exponential(multiplier=1, min=2, max=10),
+            reraise=True):
+        with attempt:
+            svc = cinder.services.list(host=host, binary='cinder-volume')[0]
+            # Raise explicitly rather than using 'assert', so the checks are
+            # not stripped when Python runs with optimizations enabled.
+            if svc.status != target_status:
+                raise AssertionError(
+                    'Cinder volume service {} status is {}, expected '
+                    '{}'.format(host, svc.status, target_status))
+            if svc.replication_status != target_replication_status:
+                raise AssertionError(
+                    'Cinder volume service {} replication status is {}, '
+                    'expected {}'.format(
+                        host, svc.replication_status,
+                        target_replication_status))
+
+
+
+
 def create_volume_backup(cinder, volume_id, name=None):
    """Create cinder volume backup.