Merge pull request #486 from ionutbalutoiu/update-ceph-rbd-mirror-tests
Update Ceph RBD Mirror tests
This commit is contained in:
@@ -17,6 +17,8 @@ import json
|
||||
import logging
|
||||
import re
|
||||
|
||||
import cinderclient.exceptions as cinder_exceptions
|
||||
|
||||
import zaza.openstack.charm_tests.test_utils as test_utils
|
||||
|
||||
import zaza.model
|
||||
@@ -28,6 +30,129 @@ from zaza.openstack.charm_tests.glance.setup import (
|
||||
CIRROS_IMAGE_NAME)
|
||||
|
||||
|
||||
DEFAULT_CINDER_RBD_MIRRORING_MODE = 'pool'
|
||||
|
||||
|
||||
def get_cinder_rbd_mirroring_mode(cinder_ceph_app_name='cinder-ceph'):
    """Get the RBD mirroring mode for the Cinder Ceph pool.

    The mode is read from the ``rbd-mirroring-mode`` option of the given
    Juju application. When the option is missing, or carries no value,
    ``DEFAULT_CINDER_RBD_MIRRORING_MODE`` is used instead.

    :param cinder_ceph_app_name: Cinder Ceph Juju application name.
    :type cinder_ceph_app_name: str
    :returns: A string representing the RBD mirroring mode. It can be
              either 'pool' or 'image'.
    :rtype: str
    """
    app_config = zaza.model.get_application_config(cinder_ceph_app_name)
    mode_option = app_config.get('rbd-mirroring-mode') or {}
    # ``or`` also covers an option whose 'value' is present but None or
    # empty, which would otherwise fail on (or leak through) ``.lower()``.
    mode = mode_option.get('value') or DEFAULT_CINDER_RBD_MIRRORING_MODE
    return mode.lower()
|
||||
|
||||
|
||||
def get_glance_image(glance):
    """Return the Glance image the Ceph tests should create volumes from.

    The Cirros image is preferred. When no image with that name exists,
    the first image matching the Ubuntu LTS image name is used instead.

    :param glance: Authenticated glanceclient
    :type glance: glanceclient.Client
    :returns: Glance image object
    :rtype: glanceclient.image
    """
    cirros_images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
    if not cirros_images:
        fallback_msg = "Failed to find {} image, falling back to {}".format(
            CIRROS_IMAGE_NAME,
            LTS_IMAGE_NAME)
        logging.info(fallback_msg)
        lts_images = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)
        return lts_images[0]
    return cirros_images[0]
|
||||
|
||||
|
||||
def setup_cinder_repl_volume_type(cinder, type_name='repl',
                                  backend_name='cinder-ceph'):
    """Ensure a replication-enabled Cinder volume type exists.

    An existing volume type named ``type_name`` is reused; otherwise a new
    one is created. In both cases its extra specs are (re)set so that it
    targets ``backend_name`` with replication enabled.

    :param cinder: Authenticated cinderclient
    :type cinder: cinder.Client
    :param type_name: Cinder volume type name
    :type type_name: str
    :param backend_name: Cinder volume backend name with replication enabled.
    :type backend_name: str
    :returns: Cinder volume type object
    :rtype: cinderclient.VolumeType
    """
    replication_keys = {
        'volume_backend_name': backend_name,
        'replication_enabled': '<is> True',
    }
    try:
        repl_type = cinder.volume_types.find(name=type_name)
    except cinder_exceptions.NotFound:
        repl_type = cinder.volume_types.create(type_name)

    repl_type.set_keys(metadata=replication_keys)
    return repl_type
|
||||
|
||||
|
||||
# TODO: This function should be incorporated into
|
||||
# 'zaza.openstack.utilities.openstack.create_volume' helper, once the below
|
||||
# flakiness comments are addressed.
|
||||
def create_cinder_volume(cinder, name='zaza', image_id=None, type_id=None):
    """Create a new Cinder volume.

    An 8 GB volume is requested and then polled via
    ``openstack.resource_reaches_status``. When that check raises
    ``AssertionError`` the volume is deleted and the creation is retried,
    up to 20 attempts in total.

    :param cinder: Authenticated cinderclient.
    :type cinder: cinder.Client
    :param name: Volume name.
    :type name: str
    :param image_id: Glance image id, if the volume is created from image.
    :type image_id: str
    :param type_id: Cinder Volume type id, if the volume needs to use an
                    explicit volume type.
    :type type_id: str
    :returns: Cinder volume, or None when every creation attempt failed.
    :rtype: :class:`Volume` or None
    """
    volume_params = {
        'size': 8,
        'name': name,
    }
    if image_id:
        volume_params['imageRef'] = image_id
    if type_id:
        volume_params['volume_type'] = type_id

    # NOTE(fnordahl): for some reason create volume from image often fails
    # when run just after deployment is finished. We should figure out
    # why, resolve the underlying issue and then remove this.
    #
    # We do not use tenacity here as it will interfere with tenacity used
    # in ``resource_reaches_status``
    max_attempts = 20
    for _ in range(max_attempts):
        volume = cinder.volumes.create(**volume_params)
        try:
            # Note(coreycb): stop_after_attempt is increased because using
            # juju storage for ceph-osd backed by cinder on undercloud
            # takes longer than the prior method of directory-backed OSD
            # devices.
            openstack.resource_reaches_status(
                cinder.volumes, volume.id, msg='volume',
                stop_after_attempt=20)
        except AssertionError:
            logging.info('retrying')
            # Drop the failed volume so the next attempt starts clean.
            volume.delete()
        else:
            return volume

    # Make the exhausted-retries case visible instead of silently handing
    # None back to the caller.
    logging.error('Giving up creating volume %s after %d attempts',
                  name, max_attempts)
    return None
|
||||
|
||||
|
||||
class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
"""Base class for ``ceph-rbd-mirror`` tests."""
|
||||
|
||||
@@ -35,20 +160,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
def setUpClass(cls):
    """Run setup for ``ceph-rbd-mirror`` tests."""
    super().setUpClass()
    # Names shared by the Cinder related helpers and tests.
    cls.test_cinder_volume_name = 'test-cinder-ceph-volume'
    cls.cinder_ceph_app_name = 'cinder-ceph'
    # get ready for multi-model Zaza: for now both sites live in the same
    # Juju model and site B applications are told apart by a name suffix.
    current_model = zaza.model.get_juju_model()
    cls.site_a_model = current_model
    cls.site_b_model = current_model
    cls.site_b_app_suffix = '-b'
|
||||
|
||||
def run_status_action(self, application_name=None, model_name=None):
|
||||
def run_status_action(self, application_name=None, model_name=None,
|
||||
pools=[]):
|
||||
"""Run status action, decode and return response."""
|
||||
action_params = {
|
||||
'verbose': True,
|
||||
'format': 'json',
|
||||
}
|
||||
if len(pools) > 0:
|
||||
action_params['pools'] = ','.join(pools)
|
||||
result = zaza.model.run_action_on_leader(
|
||||
application_name or self.application_name,
|
||||
'status',
|
||||
model_name=model_name,
|
||||
action_params={
|
||||
'verbose': True,
|
||||
'format': 'json',
|
||||
})
|
||||
action_params=action_params)
|
||||
return json.loads(result.results['output'])
|
||||
|
||||
def get_pools(self):
|
||||
@@ -68,10 +199,26 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
model_name=self.site_b_model)
|
||||
return sorted(site_a_pools.keys()), sorted(site_b_pools.keys())
|
||||
|
||||
def get_failover_pools(self):
    """Return the names of the Ceph pools to fail over, for both sites.

    With the Cinder RBD mirroring mode set to 'image' the 'cinder-ceph'
    pool is left out, because Cinder orchestrates that failover itself.

    :returns: Tuple with site-a pools and site-b pools.
    :rtype: Tuple[List[str], List[str]]
    """
    pools_a, pools_b = self.get_pools()
    mirroring_mode = get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name)
    if mirroring_mode != 'image':
        return pools_a, pools_b

    # The Cinder pool is named after the Cinder Ceph application.
    for site_pools in (pools_a, pools_b):
        site_pools.remove(self.cinder_ceph_app_name)
    return pools_a, pools_b
|
||||
|
||||
def wait_for_mirror_state(self, state, application_name=None,
|
||||
model_name=None,
|
||||
check_entries_behind_master=False,
|
||||
require_images_in=[]):
|
||||
require_images_in=[],
|
||||
pools=[]):
|
||||
"""Wait until all images reach requested state.
|
||||
|
||||
This function runs the ``status`` action and examines the data it
|
||||
@@ -90,6 +237,9 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
:type check_entries_behind_master: bool
|
||||
:param require_images_in: List of pools to require images in
|
||||
:type require_images_in: list of str
|
||||
:param pools: List of pools to run status on. If this is empty, the
|
||||
status action will run on all the pools.
|
||||
:type pools: list of str
|
||||
:returns: True on success, never returns on failure
|
||||
"""
|
||||
rep = re.compile(r'.*entries_behind_master=(\d+)')
|
||||
@@ -97,7 +247,8 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
try:
|
||||
# encapsulate in try except to work around LP: #1820976
|
||||
pool_status = self.run_status_action(
|
||||
application_name=application_name, model_name=model_name)
|
||||
application_name=application_name, model_name=model_name,
|
||||
pools=pools)
|
||||
except KeyError:
|
||||
continue
|
||||
for pool, status in pool_status.items():
|
||||
@@ -124,6 +275,41 @@ class CephRBDMirrorBase(test_utils.OpenStackBaseTest):
|
||||
# all images with state has expected state
|
||||
return True
|
||||
|
||||
def setup_test_cinder_volume(self):
    """Return the test Cinder volume, creating it when it is missing.

    A volume named ``self.test_cinder_volume_name`` is looked up first and
    returned as-is when found.

    Otherwise a new volume is created from the Glance test image. When the
    Cinder RBD mirroring mode is 'image', the volume gets an explicit
    volume type carrying the replication flags; in any other mode no
    volume type is passed.

    :returns: Cinder volume
    :rtype: :class:`Volume`.
    """
    keystone_session = openstack.get_overcloud_keystone_session()
    cinder = openstack.get_cinder_session_client(keystone_session, version=3)

    try:
        existing = cinder.volumes.find(name=self.test_cinder_volume_name)
    except cinder_exceptions.NotFound:
        logging.info("Test Cinder volume doesn't exist. Creating it")
    else:
        return existing

    glance = openstack.get_glance_session_client(keystone_session)
    image_id = get_glance_image(glance).id

    type_id = None
    mirroring_mode = get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name)
    if mirroring_mode == 'image':
        repl_type = setup_cinder_repl_volume_type(
            cinder,
            backend_name=self.cinder_ceph_app_name)
        type_id = repl_type.id

    return create_cinder_volume(
        cinder=cinder,
        name=self.test_cinder_volume_name,
        image_id=image_id,
        type_id=type_id)
|
||||
|
||||
|
||||
class CephRBDMirrorTest(CephRBDMirrorBase):
|
||||
"""Encapsulate ``ceph-rbd-mirror`` tests."""
|
||||
@@ -195,44 +381,7 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
|
||||
site B and subsequently comparing the contents we get a full end to end
|
||||
test.
|
||||
"""
|
||||
session = openstack.get_overcloud_keystone_session()
|
||||
glance = openstack.get_glance_session_client(session)
|
||||
cinder = openstack.get_cinder_session_client(session)
|
||||
|
||||
images = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
|
||||
if images:
|
||||
image = images[0]
|
||||
else:
|
||||
logging.info("Failed to find {} image, falling back to {}".format(
|
||||
CIRROS_IMAGE_NAME,
|
||||
LTS_IMAGE_NAME))
|
||||
image = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)[0]
|
||||
|
||||
# NOTE(fnordahl): for some reason create volume from image often fails
|
||||
# when run just after deployment is finished. We should figure out
|
||||
# why, resolve the underlying issue and then remove this.
|
||||
#
|
||||
# We do not use tenacity here as it will interfere with tenacity used
|
||||
# in ``resource_reaches_status``
|
||||
def create_volume_from_image(cinder, image, retry=20):
|
||||
if retry < 1:
|
||||
return
|
||||
volume = cinder.volumes.create(8, name='zaza', imageRef=image.id)
|
||||
try:
|
||||
# Note(coreycb): stop_after_attempt is increased because using
|
||||
# juju storage for ceph-osd backed by cinder on undercloud
|
||||
# takes longer than the prior method of directory-backed OSD
|
||||
# devices.
|
||||
openstack.resource_reaches_status(
|
||||
cinder.volumes, volume.id, msg='volume',
|
||||
stop_after_attempt=20)
|
||||
return volume
|
||||
except AssertionError:
|
||||
logging.info('retrying')
|
||||
volume.delete()
|
||||
return create_volume_from_image(cinder, image, retry=retry - 1)
|
||||
volume = create_volume_from_image(cinder, image)
|
||||
|
||||
volume = self.setup_test_cinder_volume()
|
||||
site_a_hash = zaza.openstack.utilities.ceph.get_rbd_hash(
|
||||
zaza.model.get_lead_unit_name('ceph-mon',
|
||||
model_name=self.site_a_model),
|
||||
@@ -244,6 +393,8 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
|
||||
check_entries_behind_master=True,
|
||||
application_name=self.application_name + self.site_b_app_suffix,
|
||||
model_name=self.site_b_model)
|
||||
logging.info('Checking the Ceph RBD hashes of the primary and '
|
||||
'the secondary Ceph images')
|
||||
site_b_hash = zaza.openstack.utilities.ceph.get_rbd_hash(
|
||||
zaza.model.get_lead_unit_name('ceph-mon' + self.site_b_app_suffix,
|
||||
model_name=self.site_b_model),
|
||||
@@ -258,102 +409,399 @@ class CephRBDMirrorTest(CephRBDMirrorBase):
|
||||
class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase):
|
||||
"""Encapsulate ``ceph-rbd-mirror`` controlled failover tests."""
|
||||
|
||||
def test_fail_over_fall_back(self):
|
||||
"""Validate controlled fail over and fall back."""
|
||||
site_a_pools, site_b_pools = self.get_pools()
|
||||
def execute_failover_juju_actions(self,
|
||||
primary_site_app_name,
|
||||
primary_site_model,
|
||||
primary_site_pools,
|
||||
secondary_site_app_name,
|
||||
secondary_site_model,
|
||||
secondary_site_pools):
|
||||
"""Execute the failover Juju actions.
|
||||
|
||||
The failover / failback via Juju actions shares the same workflow. The
|
||||
failback is just a failover with sites in reversed order.
|
||||
|
||||
This function encapsulates the tasks to failover a primary site to
|
||||
a secondary site:
|
||||
1. Demote primary site
|
||||
2. Validation of the primary site demotion
|
||||
3. Promote secondary site
|
||||
4. Validation of the secondary site promotion
|
||||
|
||||
:param primary_site_app_name: Primary site Ceph RBD mirror app name.
|
||||
:type primary_site_app_name: str
|
||||
:param primary_site_model: Primary site Juju model name.
|
||||
:type primary_site_model: str
|
||||
:param primary_site_pools: Primary site pools.
|
||||
:type primary_site_pools: List[str]
|
||||
:param secondary_site_app_name: Secondary site Ceph RBD mirror
|
||||
app name.
|
||||
:type secondary_site_app_name: str
|
||||
:param secondary_site_model: Secondary site Juju model name.
|
||||
:type secondary_site_model: str
|
||||
:param secondary_site_pools: Secondary site pools.
|
||||
:type secondary_site_pools: List[str]
|
||||
"""
|
||||
# Check if primary and secondary pools sizes are the same.
|
||||
self.assertEqual(len(primary_site_pools), len(secondary_site_pools))
|
||||
|
||||
# Run the 'demote' Juju action against the primary site pools.
|
||||
logging.info('Demoting {} from model {}.'.format(
|
||||
primary_site_app_name, primary_site_model))
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror',
|
||||
primary_site_app_name,
|
||||
'demote',
|
||||
model_name=self.site_a_model,
|
||||
action_params={})
|
||||
model_name=primary_site_model,
|
||||
action_params={
|
||||
'pools': ','.join(primary_site_pools)
|
||||
})
|
||||
logging.info(result.results)
|
||||
self.assertEqual(int(result.results['Code']), 0)
|
||||
|
||||
# Validate that the demoted pools count matches the total primary site
|
||||
# pools count.
|
||||
n_pools_demoted = len(result.results['output'].split('\n'))
|
||||
self.assertEqual(len(site_a_pools), n_pools_demoted)
|
||||
self.wait_for_mirror_state('up+unknown', model_name=self.site_a_model)
|
||||
self.assertEqual(len(primary_site_pools), n_pools_demoted)
|
||||
|
||||
# At this point, both primary and secondary sites are demoted. Validate
|
||||
# that the Ceph images, from both sites, report 'up+unknown', since
|
||||
# there isn't a primary site at the moment.
|
||||
logging.info('Waiting until {} is demoted.'.format(
|
||||
primary_site_app_name))
|
||||
self.wait_for_mirror_state(
|
||||
'up+unknown',
|
||||
application_name=self.application_name + self.site_b_app_suffix,
|
||||
model_name=self.site_b_model)
|
||||
application_name=primary_site_app_name,
|
||||
model_name=primary_site_model,
|
||||
pools=primary_site_pools)
|
||||
self.wait_for_mirror_state(
|
||||
'up+unknown',
|
||||
application_name=secondary_site_app_name,
|
||||
model_name=secondary_site_model,
|
||||
pools=secondary_site_pools)
|
||||
|
||||
# Run the 'promote' Juju action against the secondary site.
|
||||
logging.info('Promoting {} from model {}.'.format(
|
||||
secondary_site_app_name, secondary_site_model))
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror' + self.site_b_app_suffix,
|
||||
secondary_site_app_name,
|
||||
'promote',
|
||||
model_name=self.site_b_model,
|
||||
action_params={})
|
||||
model_name=secondary_site_model,
|
||||
action_params={
|
||||
'pools': ','.join(secondary_site_pools)
|
||||
})
|
||||
logging.info(result.results)
|
||||
self.assertEqual(int(result.results['Code']), 0)
|
||||
|
||||
# Validate that the promoted pools count matches the total secondary
|
||||
# site pools count.
|
||||
n_pools_promoted = len(result.results['output'].split('\n'))
|
||||
self.assertEqual(len(site_b_pools), n_pools_promoted)
|
||||
self.assertEqual(len(secondary_site_pools), n_pools_promoted)
|
||||
|
||||
# Validate that the Ceph images from the newly promoted site
|
||||
# report 'up+stopped' state (which is reported by primary Ceph images).
|
||||
logging.info('Waiting until {} is promoted.'.format(
|
||||
secondary_site_app_name))
|
||||
self.wait_for_mirror_state(
|
||||
'up+stopped',
|
||||
application_name=secondary_site_app_name,
|
||||
model_name=secondary_site_model,
|
||||
pools=secondary_site_pools)
|
||||
|
||||
# Validate that the Ceph images from site-a report 'up+replaying'
|
||||
# (which is reported by secondary Ceph images).
|
||||
self.wait_for_mirror_state(
|
||||
'up+replaying',
|
||||
model_name=self.site_a_model)
|
||||
check_entries_behind_master=True,
|
||||
application_name=primary_site_app_name,
|
||||
model_name=primary_site_model,
|
||||
pools=primary_site_pools)
|
||||
|
||||
def test_100_cinder_failover(self):
|
||||
"""Validate controlled failover via the Cinder API.
|
||||
|
||||
This test only makes sense if Cinder RBD mirroring mode is 'image'.
|
||||
It will return early, if this is not the case.
|
||||
"""
|
||||
cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
|
||||
self.cinder_ceph_app_name)
|
||||
if cinder_rbd_mirroring_mode != 'image':
|
||||
logging.warning(
|
||||
"Skipping 'test_100_cinder_failover' since Cinder RBD "
|
||||
"mirroring mode is {}.".format(cinder_rbd_mirroring_mode))
|
||||
return
|
||||
|
||||
session = openstack.get_overcloud_keystone_session()
|
||||
cinder = openstack.get_cinder_session_client(session, version=3)
|
||||
|
||||
# Check if the Cinder volume host is available with replication
|
||||
# enabled.
|
||||
host = 'cinder@{}'.format(self.cinder_ceph_app_name)
|
||||
svc = cinder.services.list(host=host, binary='cinder-volume')[0]
|
||||
self.assertEqual(svc.replication_status, 'enabled')
|
||||
self.assertEqual(svc.status, 'enabled')
|
||||
|
||||
# Setup the test Cinder volume
|
||||
volume = self.setup_test_cinder_volume()
|
||||
|
||||
# Check if the volume is properly mirrored
|
||||
self.wait_for_mirror_state(
|
||||
'up+stopped',
|
||||
'up+replaying',
|
||||
check_entries_behind_master=True,
|
||||
application_name=self.application_name + self.site_b_app_suffix,
|
||||
model_name=self.site_b_model)
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror' + self.site_b_app_suffix,
|
||||
'demote',
|
||||
model_name=self.site_b_model,
|
||||
action_params={
|
||||
})
|
||||
logging.info(result.results)
|
||||
n_pools_demoted = len(result.results['output'].split('\n'))
|
||||
self.assertEqual(len(site_a_pools), n_pools_demoted)
|
||||
self.wait_for_mirror_state(
|
||||
'up+unknown',
|
||||
model_name=self.site_a_model)
|
||||
self.wait_for_mirror_state(
|
||||
'up+unknown',
|
||||
application_name=self.application_name + self.site_b_app_suffix,
|
||||
model_name=self.site_b_model)
|
||||
pools=[self.cinder_ceph_app_name])
|
||||
|
||||
# Execute the Cinder volume failover
|
||||
openstack.failover_cinder_volume_host(
|
||||
cinder=cinder,
|
||||
backend_name=self.cinder_ceph_app_name,
|
||||
target_backend_id='ceph',
|
||||
target_status='disabled',
|
||||
target_replication_status='failed-over')
|
||||
|
||||
# Check if the test volume is still available after failover
|
||||
self.assertEqual(cinder.volumes.get(volume.id).status, 'available')
|
||||
|
||||
def test_101_cinder_failback(self):
    """Validate controlled failback via the Cinder API.

    The test is only meaningful when the Cinder RBD mirroring mode is
    'image'; for any other mode a warning is logged and the test returns
    early.

    It must run while the Cinder volume host is already failed-over and
    the test volume exists on it.
    """
    mirroring_mode = get_cinder_rbd_mirroring_mode(
        self.cinder_ceph_app_name)
    if mirroring_mode != 'image':
        logging.warning(
            "Skipping 'test_101_cinder_failback' since Cinder RBD "
            "mirroring mode is {}.".format(mirroring_mode))
        return

    keystone_session = openstack.get_overcloud_keystone_session()
    cinder = openstack.get_cinder_session_client(keystone_session, version=3)

    # The volume service has to report the failed-over state before a
    # failback makes sense.
    volume_host = 'cinder@{}'.format(self.cinder_ceph_app_name)
    services = cinder.services.list(host=volume_host, binary='cinder-volume')
    volume_service = services[0]
    self.assertEqual(volume_service.replication_status, 'failed-over')
    self.assertEqual(volume_service.status, 'disabled')

    # The lookup doubles as an existence check: 'cinder.volumes.find'
    # raises 404 if the volume is not found.
    test_volume = cinder.volumes.find(name=self.test_cinder_volume_name)

    # Fail back to the 'default' backend and wait for the service to be
    # enabled again.
    openstack.failover_cinder_volume_host(
        cinder=cinder,
        backend_name=self.cinder_ceph_app_name,
        target_backend_id='default',
        target_status='enabled',
        target_replication_status='enabled')

    # The test volume must still be usable after the failback.
    refreshed_volume = cinder.volumes.get(test_volume.id)
    self.assertEqual(refreshed_volume.status, 'available')
|
||||
|
||||
def test_200_juju_failover(self):
    """Validate controlled failover via Juju actions."""
    # Pools that take part in the failover, per site.
    pools_a, pools_b = self.get_failover_pools()

    # Site A is handed over as the primary (to be demoted) and site B as
    # the secondary (to be promoted).
    failover_args = {
        'primary_site_app_name': self.application_name,
        'primary_site_model': self.site_a_model,
        'primary_site_pools': pools_a,
        'secondary_site_app_name': (
            self.application_name + self.site_b_app_suffix),
        'secondary_site_model': self.site_b_model,
        'secondary_site_pools': pools_b,
    }
    self.execute_failover_juju_actions(**failover_args)
|
||||
|
||||
def test_201_juju_failback(self):
    """Validate controlled failback via Juju actions."""
    # Pools that take part in the failback, per site.
    pools_a, pools_b = self.get_failover_pools()

    # A failback is a failover with the roles swapped: site B is handed
    # over as the primary (to be demoted) and site A as the secondary
    # (to be promoted).
    failback_args = {
        'primary_site_app_name': (
            self.application_name + self.site_b_app_suffix),
        'primary_site_model': self.site_b_model,
        'primary_site_pools': pools_b,
        'secondary_site_app_name': self.application_name,
        'secondary_site_model': self.site_a_model,
        'secondary_site_pools': pools_a,
    }
    self.execute_failover_juju_actions(**failback_args)
|
||||
|
||||
def test_203_juju_resync(self):
|
||||
"""Validate the 'resync-pools' Juju action.
|
||||
|
||||
The 'resync-pools' Juju action is meant to flag Ceph images from the
|
||||
secondary site to re-sync against the Ceph images from the primary
|
||||
site.
|
||||
|
||||
This use case is useful when the Ceph secondary images are out of sync.
|
||||
"""
|
||||
# Get the Ceph pools needed to resync
|
||||
_, site_b_pools = self.get_failover_pools()
|
||||
|
||||
# Run the 'resync-pools' Juju action against the pools from site-b.
|
||||
# This will make sure that the Ceph images from site-b are properly
|
||||
# synced with the primary images from site-a.
|
||||
site_b_app_name = self.application_name + self.site_b_app_suffix
|
||||
logging.info('Re-syncing {} from model {}'.format(
|
||||
site_b_app_name, self.site_b_model))
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror',
|
||||
'promote',
|
||||
model_name=self.site_a_model,
|
||||
action_params={
|
||||
})
|
||||
logging.info(result.results)
|
||||
n_pools_promoted = len(result.results['output'].split('\n'))
|
||||
self.assertEqual(len(site_b_pools), n_pools_promoted)
|
||||
self.wait_for_mirror_state(
|
||||
'up+stopped',
|
||||
model_name=self.site_a_model)
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror' + self.site_b_app_suffix,
|
||||
site_b_app_name,
|
||||
'resync-pools',
|
||||
model_name=self.site_b_model,
|
||||
action_params={
|
||||
'pools': ','.join(site_b_pools),
|
||||
'i-really-mean-it': True,
|
||||
})
|
||||
logging.info(result.results)
|
||||
self.assertEqual(int(result.results['Code']), 0)
|
||||
|
||||
# Validate that the Ceph images from site-b report 'up+replaying'
|
||||
# (which is reported by secondary Ceph images). And check that images
|
||||
# exist in Cinder and Glance pools.
|
||||
self.wait_for_mirror_state(
|
||||
'up+replaying',
|
||||
application_name=self.application_name + self.site_b_app_suffix,
|
||||
check_entries_behind_master=True,
|
||||
application_name=site_b_app_name,
|
||||
model_name=self.site_b_model,
|
||||
require_images_in=['cinder-ceph', 'glance'])
|
||||
require_images_in=[self.cinder_ceph_app_name, 'glance'],
|
||||
pools=site_b_pools)
|
||||
|
||||
|
||||
class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase):
|
||||
"""Encapsulate ``ceph-rbd-mirror`` destructive tests."""
|
||||
|
||||
def test_kill_site_a_fail_over(self):
|
||||
"""Validate fail over after uncontrolled shutdown of primary."""
|
||||
for application in 'ceph-rbd-mirror', 'ceph-mon', 'ceph-osd':
|
||||
def apply_cinder_ceph_workaround(self):
    """Set minimal timeouts / retries on the Cinder Ceph backend.

    This is needed because the failover via the Cinder API will try to
    demote site-a. However, when site-a is down, and with the default
    timeouts / retries, the operation takes an unreasonable amount of
    time (or sometimes it never finishes).
    """
    # These config options need to be set under the Cinder Ceph backend
    # section of the main Cinder config file.
    # At the moment, we don't have the possibility of using Juju config
    # to set these options. It is also not good practice to have them in
    # production.
    # They should be set only to do the Ceph failover via the Cinder API,
    # and they need to be removed afterwards.
    backend_section = self.cinder_ceph_app_name
    configs = {
        'rados_connect_timeout': '1',
        'rados_connection_retries': '1',
        'rados_connection_interval': '0',
        'replication_connect_timeout': '1',
    }

    # One ``config.set(...)`` statement per option, in dict order.
    set_statements = ''.join(
        "config.set('{0}', '{1}', '{2}'); ".format(
            backend_section, option, value)
        for option, value in configs.items())

    # Small Python one-liner, executed via Juju run, that rewrites the
    # Cinder config file with the options above applied.
    script_template = (
        "import configparser; "
        "config = configparser.ConfigParser(); "
        "config.read('/etc/cinder/cinder.conf'); "
        "{}"
        "f = open('/etc/cinder/cinder.conf', 'w'); "
        "config.write(f); "
        "f.close()")
    script = script_template.format(set_statements)

    # Apply the workaround on the leader unit and restart cinder-volume
    # so the new options are picked up.
    command = 'python3 -c "{}"; systemctl restart cinder-volume'.format(
        script)
    zaza.model.run_on_leader(self.cinder_ceph_app_name, command)
|
||||
|
||||
def kill_primary_site(self):
    """Simulate an unexpected primary site shutdown.

    The site-a Ceph applications are removed from ``self.site_a_model``
    with forced removal of their machines.
    """
    logging.info('Killing the Ceph primary site')
    primary_site_apps = ('ceph-rbd-mirror', 'ceph-mon', 'ceph-osd')
    for app_name in primary_site_apps:
        zaza.model.remove_application(
            app_name,
            model_name=self.site_a_model,
            forcefully_remove_machines=True)
|
||||
|
||||
def test_100_forced_juju_failover(self):
|
||||
"""Validate Ceph failover via Juju when the primary site is down.
|
||||
|
||||
* Kill the primary site
|
||||
* Execute the forced failover via Juju actions
|
||||
"""
|
||||
# Get the site-b Ceph pools that need to be promoted
|
||||
_, site_b_pools = self.get_failover_pools()
|
||||
site_b_app_name = self.application_name + self.site_b_app_suffix
|
||||
|
||||
# Simulate primary site unexpected shutdown
|
||||
self.kill_primary_site()
|
||||
|
||||
# Try and promote the site-b to primary.
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror' + self.site_b_app_suffix,
|
||||
site_b_app_name,
|
||||
'promote',
|
||||
model_name=self.site_b_model,
|
||||
action_params={
|
||||
'pools': ','.join(site_b_pools),
|
||||
})
|
||||
self.assertEqual(int(result.results['Code']), 0)
|
||||
|
||||
# The site-b 'promote' Juju action is expected to fail, because the
|
||||
# primary site is down.
|
||||
self.assertEqual(result.status, 'failed')
|
||||
|
||||
# Retry to promote site-b using the 'force' Juju action parameter.
|
||||
result = zaza.model.run_action_on_leader(
|
||||
'ceph-rbd-mirror' + self.site_b_app_suffix,
|
||||
site_b_app_name,
|
||||
'promote',
|
||||
model_name=self.site_b_model,
|
||||
action_params={
|
||||
'force': True,
|
||||
'pools': ','.join(site_b_pools),
|
||||
})
|
||||
self.assertEqual(int(result.results['Code']), 0)
|
||||
|
||||
# Validate successful Juju action execution
|
||||
self.assertEqual(result.status, 'completed')
|
||||
|
||||
def test_200_forced_cinder_failover(self):
    """Validate Ceph failover via Cinder when the primary site is down.

    This test only makes sense if Cinder RBD mirroring mode is 'image'.
    It will log a warning and return early, if this is not the case.

    This assumes that the primary site is already killed.
    """
    cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
        self.cinder_ceph_app_name)
    if cinder_rbd_mirroring_mode != 'image':
        # The message names this test method, so the log line can be
        # matched to the test that was skipped.
        logging.warning(
            "Skipping 'test_200_forced_cinder_failover' "
            "since Cinder RBD mirroring mode is {}.".format(
                cinder_rbd_mirroring_mode))
        return

    # Make sure that the Cinder Ceph backend workaround is applied, so
    # the failover does not hang on the unreachable primary site.
    self.apply_cinder_ceph_workaround()

    session = openstack.get_overcloud_keystone_session()
    cinder = openstack.get_cinder_session_client(session, version=3)
    openstack.failover_cinder_volume_host(
        cinder=cinder,
        backend_name=self.cinder_ceph_app_name,
        target_backend_id='ceph',
        target_status='disabled',
        target_replication_status='failed-over')

    # Check that the Cinder volumes are still available after forced
    # failover.
    for volume in cinder.volumes.list():
        self.assertEqual(volume.status, 'available')
|
||||
|
||||
@@ -2489,6 +2489,40 @@ def attach_volume(nova, volume_id, instance_id):
|
||||
device='/dev/vdx')
|
||||
|
||||
|
||||
def failover_cinder_volume_host(cinder, backend_name='cinder-ceph',
                                target_backend_id='ceph',
                                target_status='disabled',
                                target_replication_status='failed-over'):
    """Failover Cinder volume host with replication enabled.

    The failover is requested for host ``cinder@<backend_name>`` and the
    matching ``cinder-volume`` service is then polled (up to 10 attempts
    with exponential back-off) until it reports the target states.

    :param cinder: Authenticated cinderclient
    :type cinder: cinder.Client
    :param backend_name: Cinder volume backend name with
                         replication enabled.
    :type backend_name: str
    :param target_backend_id: Failover target Cinder backend id.
    :type target_backend_id: str
    :param target_status: Target Cinder volume service status after
                          failover.
    :type target_status: str
    :param target_replication_status: Target Cinder volume service
                                      replication status after failover.
    :type target_replication_status: str
    :raises: tenacity.RetryError (wrapping the last AssertionError) when
             the service does not reach the target states in time.
    """
    host = 'cinder@{}'.format(backend_name)
    logging.info('Failover Cinder volume host %s to backend_id %s',
                 host, target_backend_id)
    cinder.services.failover_host(host=host, backend_id=target_backend_id)
    for attempt in tenacity.Retrying(
            retry=tenacity.retry_if_exception_type(AssertionError),
            stop=tenacity.stop_after_attempt(10),
            wait=tenacity.wait_exponential(multiplier=1, min=2, max=10)):
        with attempt:
            svc = cinder.services.list(host=host, binary='cinder-volume')[0]
            # Raise explicitly rather than use ``assert``: assert
            # statements are stripped under ``python -O``, which would
            # turn this wait loop into a no-op.
            if svc.status != target_status:
                raise AssertionError(
                    'Cinder volume service {} status is {}, '
                    'expected {}'.format(host, svc.status, target_status))
            if svc.replication_status != target_replication_status:
                raise AssertionError(
                    'Cinder volume service {} replication status is {}, '
                    'expected {}'.format(
                        host, svc.replication_status,
                        target_replication_status))
|
||||
|
||||
|
||||
def create_volume_backup(cinder, volume_id, name=None):
|
||||
"""Create cinder volume backup.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user