Files
zaza-openstack-tests/zaza/openstack/charm_tests/ceph/rbd_mirror/tests.py
T
Luciano Lo Giudice fd160e8205 Fix function calls
2024-08-14 14:41:52 -03:00

857 lines
34 KiB
Python

# Copyright 2019 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Encapsulate ``ceph-rbd-mirror`` testing."""
import json
import logging
import re
import time
import unittest
import cinderclient.exceptions as cinder_exceptions
import zaza.openstack.charm_tests.test_utils as test_utils
import zaza.model
import zaza.openstack.utilities.ceph
import zaza.openstack.utilities.openstack as openstack
from zaza.openstack.charm_tests.glance.setup import (
LTS_IMAGE_NAME,
CIRROS_IMAGE_NAME)
DEFAULT_CINDER_RBD_MIRRORING_MODE = 'pool'


def get_cinder_rbd_mirroring_mode(cinder_ceph_app_name='cinder-ceph'):
    """Get the RBD mirroring mode for the Cinder Ceph pool.

    :param cinder_ceph_app_name: Cinder Ceph Juju application name.
    :type cinder_ceph_app_name: str
    :returns: A string representing the RBD mirroring mode. It can be
              either 'pool' or 'image'.
    :rtype: str
    """
    config_option = zaza.model.get_application_config(
        cinder_ceph_app_name).get('rbd-mirroring-mode')
    if not config_option:
        # Option not exposed by the charm; fall back to the default.
        return DEFAULT_CINDER_RBD_MIRRORING_MODE
    return config_option.get(
        'value', DEFAULT_CINDER_RBD_MIRRORING_MODE).lower()
def get_glance_image(glance):
    """Get the Glance image object to be used by the Ceph tests.

    It looks for the Cirros Glance image, and it's returned if it's found.
    If the Cirros image is not found, it will try and find the Ubuntu
    LTS image.

    :param glance: Authenticated glanceclient
    :type glance: glanceclient.Client
    :returns: Glance image object
    :rtype: glanceclient.image
    """
    candidates = openstack.get_images_by_name(glance, CIRROS_IMAGE_NAME)
    if not candidates:
        logging.info("Failed to find {} image, falling back to {}".format(
            CIRROS_IMAGE_NAME,
            LTS_IMAGE_NAME))
        candidates = openstack.get_images_by_name(glance, LTS_IMAGE_NAME)
    # IndexError here means neither image exists; let it propagate, as the
    # original behaviour did.
    return candidates[0]
def setup_cinder_repl_volume_type(cinder, type_name='repl',
                                  backend_name='cinder-ceph'):
    """Set up the Cinder volume replication type.

    :param cinder: Authenticated cinderclient
    :type cinder: cinder.Client
    :param type_name: Cinder volume type name
    :type type_name: str
    :param backend_name: Cinder volume backend name with replication enabled.
    :type backend_name: str
    :returns: Cinder volume type object
    :rtype: cinderclient.VolumeType
    """
    try:
        # Reuse the volume type when it already exists.
        return cinder.volume_types.find(name=type_name)
    except cinder_exceptions.NotFound:
        pass
    vol_type = cinder.volume_types.create(type_name)
    vol_type.set_keys(metadata={
        'volume_backend_name': backend_name,
        'replication_enabled': '<is> True',
    })
    return vol_type
# TODO: This function should be incorporated into
# 'zaza.openstack.utilities.openstack.create_volume' helper, once the below
# flakiness comments are addressed.
def create_cinder_volume(cinder, name='zaza', image_id=None, type_id=None):
    """Create a new Cinder volume.

    :param cinder: Authenticated cinderclient.
    :type cinder: cinder.Client
    :param name: Volume name.
    :type name: str
    :param image_id: Glance image id, if the volume is created from image.
    :type image_id: str
    :param type_id: Cinder Volume type id, if the volume needs to use an
                    explicit volume type.
    :type type_id: str
    :returns: Cinder volume, or None when all attempts fail.
    :rtype: :class:`Volume`.
    """
    # NOTE(fnordahl): for some reason create volume from image often fails
    # when run just after deployment is finished. We should figure out
    # why, resolve the underlying issue and then remove this.
    #
    # We do not use tenacity here as it will interfere with tenacity used
    # in ``resource_reaches_status``
    volume_params = {
        'size': 8,
        'name': name,
    }
    if image_id:
        volume_params['imageRef'] = image_id
    if type_id:
        volume_params['volume_type'] = type_id
    # Up to 20 create attempts, matching the previous recursive retry.
    for _ in range(20):
        volume = cinder.volumes.create(**volume_params)
        try:
            # Note(coreycb): stop_after_attempt is increased because using
            # juju storage for ceph-osd backed by cinder on undercloud
            # takes longer than the prior method of directory-backed OSD
            # devices.
            openstack.resource_reaches_status(
                cinder.volumes, volume.id, msg='volume',
                stop_after_attempt=20)
            return volume
        except AssertionError:
            logging.info('retrying')
            volume.delete()
    # All attempts exhausted; mirror the original implicit None return.
    return None
def setup_rbd_mirror():
    """Set up an RBD pool in case Cinder isn't present."""
    # Create a pool on the primary cluster, then ask the rbd-mirror charm to
    # rescan so it picks the new pool up.
    leader_actions = (
        ('ceph-mon', 'create-pool', {'name': 'zaza-boot', 'app-name': 'rbd'}),
        ('ceph-rbd-mirror', 'refresh-pools', {}),
    )
    for application, action, params in leader_actions:
        zaza.model.run_action_on_leader(
            application,
            action,
            action_params=params,
        )
class CephRBDMirrorBase(test_utils.BaseCharmTest):
    """Base class for ``ceph-rbd-mirror`` tests."""

    @classmethod
    def setUpClass(cls):
        """Run setup for ``ceph-rbd-mirror`` tests."""
        super().setUpClass()
        cls.cinder_ceph_app_name = 'cinder-ceph'
        cls.test_cinder_volume_name = 'test-cinder-ceph-volume'
        # get ready for multi-model Zaza
        cls.site_a_model = cls.site_b_model = zaza.model.get_juju_model()
        cls.site_b_app_suffix = '-b'

    def test_if_cinder_present(self):
        """Test if the cinder-ceph application is present.

        NOTE: the ``test_`` prefix also makes test runners collect this
        method as a test case; in that context it passes trivially.

        :returns: True when ``cinder-ceph`` is deployed, False otherwise.
        :rtype: bool
        """
        try:
            zaza.model.get_application(self.cinder_ceph_app_name)
            return True
        except KeyError:
            return False

    def skip_test_if_cinder_not_present(self, caller):
        """Skip a test if Cinder isn't present.

        :param caller: Name of the calling test, used in the skip message.
        :type caller: str
        :raises: unittest.SkipTest when ``cinder-ceph`` is not deployed.
        """
        if not self.test_if_cinder_present():
            raise unittest.SkipTest('Skipping %s due to lack of Cinder'
                                    % caller)

    def run_status_action(self, application_name=None, model_name=None,
                          pools=None):
        """Run status action, decode and return response.

        :param application_name: Application to run the action on; defaults
                                 to ``self.application_name``.
        :type application_name: Optional[str]
        :param model_name: Model to run in.
        :type model_name: Optional[str]
        :param pools: Pools to restrict the status action to. None or an
                      empty list means all pools.
        :type pools: Optional[List[str]]
        :returns: Decoded JSON action output, or None when the action fails.
        :rtype: Optional[dict]
        """
        action_params = {
            'verbose': True,
            'format': 'json',
        }
        # NOTE: default is None rather than a mutable ``[]`` to avoid the
        # shared mutable default argument pitfall.
        if pools:
            action_params['pools'] = ','.join(pools)
        result = zaza.model.run_action_on_leader(
            application_name or self.application_name,
            'status',
            model_name=model_name,
            action_params=action_params)
        if result.status == "failed":
            logging.error("status action failed: %s", result.message)
            return None
        return json.loads(result.results['output'])

    def get_pools(self):
        """Retrieve list of pools from both sites.

        :returns: Tuple with sorted list of pool names on each side.
        :rtype: tuple
        """
        site_a_pools = zaza.openstack.utilities.ceph.get_ceph_pools(
            zaza.model.get_lead_unit_name(
                'ceph-mon', model_name=self.site_a_model),
            model_name=self.site_a_model)
        site_b_pools = zaza.openstack.utilities.ceph.get_ceph_pools(
            zaza.model.get_lead_unit_name(
                'ceph-mon' + self.site_b_app_suffix,
                model_name=self.site_b_model),
            model_name=self.site_b_model)
        return sorted(site_a_pools.keys()), sorted(site_b_pools.keys())

    def get_failover_pools(self):
        """Get the failover Ceph pools' names, from both sites.

        If the Cinder RBD mirroring mode is 'image', the 'cinder-ceph' pool
        needs to be excluded, since Cinder orchestrates the failover then.

        Also remove .mgr pools as they're not failed over.

        :returns: Tuple with site-a pools and site-b pools.
        :rtype: Tuple[List[str], List[str]]
        """
        site_a_pools, site_b_pools = self.get_pools()
        if (self.test_if_cinder_present() and
                get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) ==
                'image'):
            site_a_pools.remove(self.cinder_ceph_app_name)
            site_b_pools.remove(self.cinder_ceph_app_name)
        for pools in (site_a_pools, site_b_pools):
            # Guard the removal: a bare list.remove() raises ValueError on
            # deployments where the .mgr pool does not exist.
            if ".mgr" in pools:
                pools.remove(".mgr")
        return site_a_pools, site_b_pools

    def wait_for_mirror_state(self, state, application_name=None,
                              model_name=None,
                              check_entries_behind_master=False,
                              require_images_in=None,
                              pools=None):
        """Wait until all images reach requested state.

        This function runs the ``status`` action and examines the data it
        returns.

        :param state: State to expect all images to be in
        :type state: str
        :param application_name: Application to run action on
        :type application_name: str
        :param model_name: Model to run in
        :type model_name: str
        :param check_entries_behind_master: Wait for
                                            ``entries_behind_primary`` to
                                            drop to an acceptable level.
                                            Only makes sense when used with
                                            state ``up+replaying``.
        :type check_entries_behind_master: bool
        :param require_images_in: List of pools to require images in
        :type require_images_in: Optional[List[str]]
        :param pools: List of pools to run status on. If this is None or
                      empty, the status action will run on all the pools.
        :type pools: Optional[List[str]]
        :returns: True on success, never returns on failure
        """
        # Normalize the None sentinel (avoids a mutable default argument).
        require_images_in = require_images_in or []
        rep = re.compile(r'.*"entries_behind_primary":(\d+),')
        while True:
            pool_status = self.run_status_action(
                application_name=application_name, model_name=model_name,
                pools=pools)
            if pool_status is None:
                logging.debug("status action failed, retrying")
                time.sleep(5)  # don't spam juju run-action
                continue
            for pool, status in pool_status.items():
                images = status.get('images', [])
                logging.debug("checking pool %s, images: %s", pool, images)
                if not len(images) and pool in require_images_in:
                    break
                for image in images:
                    if image['state'] and image['state'] != state:
                        break
                    if check_entries_behind_master:
                        m = rep.match(image['description'])
                        # NOTE(fnordahl): Tactical fix for upstream Ceph
                        # Luminous bug https://tracker.ceph.com/issues/23516
                        if m and int(m.group(1)) > 42:
                            logging.info('entries_behind_primary:{}'
                                         .format(m.group(1)))
                            break
                else:
                    # not found here, check next pool
                    continue
                # found here, pass on to outer loop
                break
            else:
                # all images with state has expected state
                return True
            time.sleep(5)  # don't spam juju run-action

    def setup_test_cinder_volume(self):
        """Set up the test Cinder volume into the Ceph RBD mirror environment.

        If the volume already exists, then it's returned.

        Also, if the Cinder RBD mirroring mode is 'image', the volume will
        use an explicit volume type with the appropriate replication flags.
        Otherwise, it is just a simple Cinder volume using the default
        backend.

        :returns: Cinder volume
        :rtype: :class:`Volume`.
        """
        session = openstack.get_overcloud_keystone_session()
        cinder = openstack.get_cinder_session_client(session, version=3)
        try:
            return cinder.volumes.find(name=self.test_cinder_volume_name)
        except cinder_exceptions.NotFound:
            logging.info("Test Cinder volume doesn't exist. Creating it")
        glance = openstack.get_glance_session_client(session)
        image = get_glance_image(glance)
        kwargs = {
            'cinder': cinder,
            'name': self.test_cinder_volume_name,
            'image_id': image.id,
        }
        if get_cinder_rbd_mirroring_mode(self.cinder_ceph_app_name) == \
                'image':
            volume_type = setup_cinder_repl_volume_type(
                cinder,
                backend_name=self.cinder_ceph_app_name)
            kwargs['type_id'] = volume_type.id
        return create_cinder_volume(**kwargs)
class CephRBDMirrorTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` tests."""

    def test_pause_resume(self):
        """Run pause and resume tests."""
        services = ['rbd-mirror']
        self.pause_resume(services)

    def test_pool_broker_synced(self):
        """Validate that pools created with broker protocol are synced.

        The functional test bundle includes the ``cinder``, ``cinder-ceph``
        and ``glance`` charms.  The ``cinder-ceph`` and ``glance`` charms
        will create pools using the ceph charms broker protocol at deploy
        time.
        """
        pools_a, pools_b = self.get_pools()
        self.assertEqual(pools_a, pools_b)

    def test_pool_manual_synced(self):
        """Validate that manually created pools are synced after refresh.

        The ``ceph-rbd-mirror`` charm does not get notified when the
        operator creates a pool manually without using the ceph charms
        broker protocol.  To alleviate this the charm has a
        ``refresh-pools`` action the operator can call to have it discover
        such pools.  Validate its operation.
        """
        # Create a pool directly in the Ceph cluster, bypassing the broker
        # protocol, then ask the ceph-rbd-mirror unit on site_a to rescan.
        zaza.model.run_action_on_leader(
            'ceph-mon',
            'create-pool',
            model_name=self.site_a_model,
            action_params={
                'name': 'zaza',
                'app-name': 'rbd',
            })
        zaza.model.run_action_on_leader(
            'ceph-rbd-mirror',
            'refresh-pools',
            model_name=self.site_a_model,
            action_params={})
        both_models = (self.site_a_model, self.site_b_model)
        # wait for execution to start
        for model in both_models:
            zaza.model.wait_for_agent_status(model_name=model)
        # wait for execution to finish
        for model in both_models:
            zaza.model.wait_for_application_states(model_name=model)
        # make sure everything is idle before we test
        for model in both_models:
            zaza.model.block_until_all_units_idle(model_name=model)
        # validate result
        pools_a, pools_b = self.get_pools()
        self.assertEqual(pools_a, pools_b)

    def test_cinder_volume_mirrored(self):
        """Validate that a volume created through Cinder is mirrored.

        For RBD Mirroring to work clients must enable the correct set of
        features when creating images.

        The RBD image feature settings are announced by the ``ceph-mon``
        charm over the client relation when it has units related on its
        ``rbd-mirror`` endpoint.

        By creating a volume through cinder on site A, checking for presence
        on site B and subsequently comparing the contents we get a full end
        to end test.
        """
        self.skip_test_if_cinder_not_present('test_cinder_volume_mirrored')
        volume = self.setup_test_cinder_volume()
        rbd_image_name = 'volume-{}'.format(volume.id)

        def rbd_hash(ceph_mon_app, model):
            # Hash the RBD image backing the volume on the given site.
            return zaza.openstack.utilities.ceph.get_rbd_hash(
                zaza.model.get_lead_unit_name(ceph_mon_app,
                                              model_name=model),
                'cinder-ceph',
                rbd_image_name,
                model_name=model)

        site_a_hash = rbd_hash('ceph-mon', self.site_a_model)
        self.wait_for_mirror_state(
            'up+replaying',
            check_entries_behind_master=True,
            application_name=self.application_name + self.site_b_app_suffix,
            model_name=self.site_b_model)
        logging.info('Checking the Ceph RBD hashes of the primary and '
                     'the secondary Ceph images')
        site_b_hash = rbd_hash('ceph-mon' + self.site_b_app_suffix,
                               self.site_b_model)
        logging.info(site_a_hash)
        logging.info(site_b_hash)
        self.assertEqual(site_a_hash, site_b_hash)
class CephRBDMirrorControlledFailoverTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` controlled failover tests."""

    def _run_pools_action(self, application_name, model_name, action_name,
                          pools, extra_params=None):
        """Run a pool-scoped Juju action on an application leader.

        Logs the raw result and asserts that the action exited successfully.
        Shared by the demote / promote / resync flows.

        :param application_name: Application to run the action on.
        :type application_name: str
        :param model_name: Juju model the application lives in.
        :type model_name: str
        :param action_name: Name of the Juju action (e.g. 'demote').
        :type action_name: str
        :param pools: Pool names to pass to the action.
        :type pools: List[str]
        :param extra_params: Additional action parameters, if any.
        :type extra_params: Optional[dict]
        :returns: Raw result dict from ``zaza.model.run_on_leader``.
        :rtype: dict
        """
        action_params = {
            'pools': ','.join(pools),
        }
        if extra_params:
            action_params.update(extra_params)
        result = zaza.model.run_on_leader(
            application_name,
            action_name,
            model_name=model_name,
            action_params=action_params)
        logging.info(result)
        self.assertEqual(int(result.get('Code')), 0)
        return result

    def execute_failover_juju_actions(self,
                                      primary_site_app_name,
                                      primary_site_model,
                                      primary_site_pools,
                                      secondary_site_app_name,
                                      secondary_site_model,
                                      secondary_site_pools):
        """Execute the failover Juju actions.

        The failover / failback via Juju actions shares the same workflow.
        The failback is just a failover with sites in reversed order.
        This function encapsulates the tasks to failover a primary site to
        a secondary site:
        1. Demote primary site
        2. Validation of the primary site demotion
        3. Promote secondary site
        4. Validation of the secondary site promotion

        :param primary_site_app_name: Primary site Ceph RBD mirror app name.
        :type primary_site_app_name: str
        :param primary_site_model: Primary site Juju model name.
        :type primary_site_model: str
        :param primary_site_pools: Primary site pools.
        :type primary_site_pools: List[str]
        :param secondary_site_app_name: Secondary site Ceph RBD mirror
                                        app name.
        :type secondary_site_app_name: str
        :param secondary_site_model: Secondary site Juju model name.
        :type secondary_site_model: str
        :param secondary_site_pools: Secondary site pools.
        :type secondary_site_pools: List[str]
        """
        # Check if primary and secondary pools sizes are the same.
        self.assertEqual(len(primary_site_pools), len(secondary_site_pools))

        # Run the 'demote' Juju action against the primary site pools.
        logging.info('Demoting {} from model {}.'.format(
            primary_site_app_name, primary_site_model))
        result = self._run_pools_action(
            primary_site_app_name, primary_site_model, 'demote',
            primary_site_pools)
        # Validate that the demoted pools count matches the total primary
        # site pools count.
        n_pools_demoted = len(result.get('Stdout').split('\n'))
        self.assertEqual(len(primary_site_pools), n_pools_demoted)

        # At this point, both primary and secondary sites are demoted.
        # Validate that the Ceph images, from both sites, report
        # 'up+unknown', since there isn't a primary site at the moment.
        logging.info('Waiting until {} is demoted.'.format(
            primary_site_app_name))
        self.wait_for_mirror_state(
            'up+unknown',
            application_name=primary_site_app_name,
            model_name=primary_site_model,
            pools=primary_site_pools)
        self.wait_for_mirror_state(
            'up+unknown',
            application_name=secondary_site_app_name,
            model_name=secondary_site_model,
            pools=secondary_site_pools)

        # Run the 'promote' Juju action against the secondary site.
        logging.info('Promoting {} from model {}.'.format(
            secondary_site_app_name, secondary_site_model))
        result = self._run_pools_action(
            secondary_site_app_name, secondary_site_model, 'promote',
            secondary_site_pools)
        # Validate that the promoted pools count matches the total secondary
        # site pools count.
        n_pools_promoted = len(result.get('Stdout').split('\n'))
        self.assertEqual(len(secondary_site_pools), n_pools_promoted)

        # Validate that the Ceph images from the newly promoted site
        # report 'up+stopped' state (which is reported by primary Ceph
        # images).
        logging.info('Waiting until {} is promoted.'.format(
            secondary_site_app_name))
        self.wait_for_mirror_state(
            'up+stopped',
            application_name=secondary_site_app_name,
            model_name=secondary_site_model,
            pools=secondary_site_pools)

        # Validate that the Ceph images from site-a report 'up+replaying'
        # (which is reported by secondary Ceph images).
        self.wait_for_mirror_state(
            'up+replaying',
            check_entries_behind_master=True,
            application_name=primary_site_app_name,
            model_name=primary_site_model,
            pools=primary_site_pools)

    def test_100_cinder_failover(self):
        """Validate controlled failover via the Cinder API.

        This test only makes sense if Cinder RBD mirroring mode is 'image'.
        It will return early, if this is not the case.
        """
        self.skip_test_if_cinder_not_present('test_100_cinder_failover')
        cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
            self.cinder_ceph_app_name)
        if cinder_rbd_mirroring_mode != 'image':
            logging.warning(
                "Skipping 'test_100_cinder_failover' since Cinder RBD "
                "mirroring mode is {}.".format(cinder_rbd_mirroring_mode))
            return

        session = openstack.get_overcloud_keystone_session()
        cinder = openstack.get_cinder_session_client(session, version=3)

        # Check if the Cinder volume host is available with replication
        # enabled.
        host = 'cinder@{}'.format(self.cinder_ceph_app_name)
        svc = cinder.services.list(host=host, binary='cinder-volume')[0]
        self.assertEqual(svc.replication_status, 'enabled')
        self.assertEqual(svc.status, 'enabled')

        # Setup the test Cinder volume
        volume = self.setup_test_cinder_volume()

        # Check if the volume is properly mirrored
        self.wait_for_mirror_state(
            'up+replaying',
            check_entries_behind_master=True,
            application_name=self.application_name + self.site_b_app_suffix,
            model_name=self.site_b_model,
            pools=[self.cinder_ceph_app_name])

        # Execute the Cinder volume failover
        openstack.failover_cinder_volume_host(
            cinder=cinder,
            backend_name=self.cinder_ceph_app_name,
            target_backend_id='ceph',
            target_status='disabled',
            target_replication_status='failed-over')

        # Check if the test volume is still available after failover
        self.assertEqual(cinder.volumes.get(volume.id).status, 'available')

    def test_101_cinder_failback(self):
        """Validate controlled failback via the Cinder API.

        This test only makes sense if Cinder RBD mirroring mode is 'image'.
        It will return early, if this is not the case.

        The test needs to be executed when the Cinder volume host is already
        failed-over with the test volume on it.
        """
        self.skip_test_if_cinder_not_present('test_101_cinder_failback')
        cinder_rbd_mirroring_mode = get_cinder_rbd_mirroring_mode(
            self.cinder_ceph_app_name)
        if cinder_rbd_mirroring_mode != 'image':
            logging.warning(
                "Skipping 'test_101_cinder_failback' since Cinder RBD "
                "mirroring mode is {}.".format(cinder_rbd_mirroring_mode))
            return

        session = openstack.get_overcloud_keystone_session()
        cinder = openstack.get_cinder_session_client(session, version=3)

        # Check if the Cinder volume host is already failed-over
        host = 'cinder@{}'.format(self.cinder_ceph_app_name)
        svc = cinder.services.list(host=host, binary='cinder-volume')[0]
        self.assertEqual(svc.replication_status, 'failed-over')
        self.assertEqual(svc.status, 'disabled')

        # Check if the test Cinder volume is already present. The method
        # 'cinder.volumes.find' raises 404 if the volume is not found.
        volume = cinder.volumes.find(name=self.test_cinder_volume_name)

        # Execute the Cinder volume failback
        openstack.failover_cinder_volume_host(
            cinder=cinder,
            backend_name=self.cinder_ceph_app_name,
            target_backend_id='default',
            target_status='enabled',
            target_replication_status='enabled')

        # Check if the test volume is still available after failback
        self.assertEqual(cinder.volumes.get(volume.id).status, 'available')

    def test_200_juju_failover(self):
        """Validate controlled failover via Juju actions."""
        # Get the Ceph pools needed to failover
        site_a_pools, site_b_pools = self.get_failover_pools()

        # Execute the failover Juju actions with the appropriate parameters.
        site_b_app_name = self.application_name + self.site_b_app_suffix
        self.execute_failover_juju_actions(
            primary_site_app_name=self.application_name,
            primary_site_model=self.site_a_model,
            primary_site_pools=site_a_pools,
            secondary_site_app_name=site_b_app_name,
            secondary_site_model=self.site_b_model,
            secondary_site_pools=site_b_pools)

    def test_201_juju_failback(self):
        """Validate controlled failback via Juju actions."""
        # Get the Ceph pools needed to failback
        site_a_pools, site_b_pools = self.get_failover_pools()

        # Execute the failover Juju actions with the appropriate parameters.
        # The failback operation is just a failover with sites in reverse
        # order.
        site_b_app_name = self.application_name + self.site_b_app_suffix
        self.execute_failover_juju_actions(
            primary_site_app_name=site_b_app_name,
            primary_site_model=self.site_b_model,
            primary_site_pools=site_b_pools,
            secondary_site_app_name=self.application_name,
            secondary_site_model=self.site_a_model,
            secondary_site_pools=site_a_pools)

    def test_203_juju_resync(self):
        """Validate the 'resync-pools' Juju action.

        The 'resync-pools' Juju action is meant to flag Ceph images from the
        secondary site to re-sync against the Ceph images from the primary
        site.

        This use case is useful when the Ceph secondary images are out of
        sync.
        """
        # Get the Ceph pools needed to failback
        _, site_b_pools = self.get_failover_pools()

        # Run the 'resync-pools' Juju action against the pools from site-b.
        # This will make sure that the Ceph images from site-b are properly
        # synced with the primary images from site-a.
        site_b_app_name = self.application_name + self.site_b_app_suffix
        logging.info('Re-syncing {} from model {}'.format(
            site_b_app_name, self.site_b_model))
        self._run_pools_action(
            site_b_app_name, self.site_b_model, 'resync-pools',
            site_b_pools,
            extra_params={'i-really-mean-it': True})

        # Validate that the Ceph images from site-b report 'up+replaying'
        # (which is reported by secondary Ceph images). And check that images
        # exist in Cinder and Glance pools.
        self.wait_for_mirror_state(
            'up+replaying',
            check_entries_behind_master=True,
            application_name=site_b_app_name,
            model_name=self.site_b_model,
            require_images_in=[self.cinder_ceph_app_name, 'glance'],
            pools=site_b_pools)
class CephRBDMirrorDisasterFailoverTest(CephRBDMirrorBase):
    """Encapsulate ``ceph-rbd-mirror`` destructive tests."""

    def apply_cinder_ceph_workaround(self):
        """Set minimal timeouts / retries to the Cinder Ceph backend.

        This is needed because the failover via Cinder API will try to do a
        demotion of the site-a. However, when site-a is down, and with the
        default timeouts / retries, the operation takes an unreasonably
        amount of time (or sometimes it never finishes).
        """
        # These new config options need to be set under the Cinder Ceph
        # backend section in the main Cinder config file.
        # At the moment, we don't the possibility of using Juju config to
        # set these options. And also, it's not even a good practice to have
        # them in production.
        # These should be set only to do the Ceph failover via Cinder API,
        # and they need to be removed after.
        configs = {
            'rados_connect_timeout': '1',
            'rados_connection_retries': '1',
            'rados_connection_interval': '0',
            'replication_connect_timeout': '1',
        }
        # Build the sequence of 'config.set' statements injected into the
        # one-liner below (dict insertion order is preserved).
        set_cmd = ''.join(
            "config.set('{0}', '{1}', '{2}'); ".format(
                self.cinder_ceph_app_name, option, value)
            for option, value in configs.items())
        # Small Python script that will be executed via Juju run to update
        # the Cinder config file.
        script = (
            "import configparser; "
            "config = configparser.ConfigParser(); "
            "config.read('/etc/cinder/cinder.conf'); "
            "{}"
            "f = open('/etc/cinder/cinder.conf', 'w'); "
            "config.write(f); "
            "f.close()").format(set_cmd)
        # Run the workaround script via Juju run
        zaza.model.run_on_leader(
            self.cinder_ceph_app_name,
            'python3 -c "{}"; systemctl restart cinder-volume'.format(script))

    def kill_primary_site(self):
        """Simulate an unexpected primary site shutdown."""
        logging.info('Killing the Ceph primary site')
        site_a_apps = ('ceph-rbd-mirror', 'ceph-mon', 'ceph-osd')
        for app in site_a_apps:
            zaza.model.remove_application(
                app,
                model_name=self.site_a_model,
                forcefully_remove_machines=True)

    def test_100_forced_juju_failover(self):
        """Validate Ceph failover via Juju when the primary site is down.

        * Kill the primary site
        * Execute the forced failover via Juju actions
        """
        # Get the site-b Ceph pools that need to be promoted
        _, site_b_pools = self.get_failover_pools()
        site_b_app_name = self.application_name + self.site_b_app_suffix
        # Simulate primary site unexpected shutdown
        self.kill_primary_site()
        # First promotion attempt, without forcing.
        result = zaza.model.run_on_leader(
            site_b_app_name,
            'promote',
            model_name=self.site_b_model,
            action_params={
                'pools': ','.join(site_b_pools),
            })
        self.assertEqual(int(result.get('Code')), 0)
        # The action may not show up as 'failed' if there are no pools that
        # needed to be promoted.
        # Retry to promote site-b using the 'force' Juju action parameter.
        result = zaza.model.run_action_on_leader(
            site_b_app_name,
            'promote',
            model_name=self.site_b_model,
            action_params={
                'force': True,
                'pools': ','.join(site_b_pools),
            })
        # Validate successful Juju action execution
        self.assertEqual(result.status, 'completed')

    def test_200_forced_cinder_failover(self):
        """Validate Ceph failover via Cinder when the primary site is down.

        This test only makes sense if Cinder RBD mirroring mode is 'image'.
        It will return early, if this is not the case.

        This assumes that the primary site is already killed.
        """
        self.skip_test_if_cinder_not_present('test_200_forced_cinder_failover')
        mirroring_mode = get_cinder_rbd_mirroring_mode(
            self.cinder_ceph_app_name)
        if mirroring_mode != 'image':
            logging.warning(
                "Skipping 'test_200_cinder_failover_without_primary_site' "
                "since Cinder RBD mirroring mode is {}.".format(
                    mirroring_mode))
            return
        # Make sure that the Cinder Ceph backend workaround is applied.
        self.apply_cinder_ceph_workaround()
        session = openstack.get_overcloud_keystone_session()
        cinder = openstack.get_cinder_session_client(session, version=3)
        openstack.failover_cinder_volume_host(
            cinder=cinder,
            backend_name=self.cinder_ceph_app_name,
            target_backend_id='ceph',
            target_status='disabled',
            target_replication_status='failed-over')
        # Every Cinder volume should remain available after the forced
        # failover.
        for volume in cinder.volumes.list():
            self.assertEqual(volume.status, 'available')