c8ae6cdc55
The create_segment() function is often failing in the gate due to being unable to establish a connection to the masakari endpoint. This will allow some more time for the endpoint to become available when this error occurs.
261 lines
9.2 KiB
Python
261 lines
9.2 KiB
Python
# Copyright 2019 Canonical Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Configure and manage masakari.

Functions for managing masakari resources and simulating compute node loss
and recovery.
"""
|
|
|
|
import logging
|
|
import openstack.exceptions as ostack_except
|
|
import tenacity
|
|
import urllib3
|
|
|
|
import zaza.model
|
|
import zaza.openstack.utilities.openstack as openstack_utils
|
|
|
|
# Key into HOST_ASSIGNMENT_METHODS selecting round-robin host placement.
ROUND_ROBIN = 'round-robin'
|
|
|
|
|
|
def roundrobin_assign_hosts_to_segments(nova_client, masakari_client):
    """Register every nova hypervisor as a masakari host, rotating segments.

    Each hypervisor listed by nova is created as a masakari host, with the
    existing segments handed out in rotation.

    :param nova_client: Authenticated nova client
    :type nova_client: novaclient.Client
    :param masakari_client: Authenticated masakari client
    :type masakari_client: openstack.instance_ha.v1._proxy.Proxy
    """
    compute_nodes = nova_client.hypervisors.list()
    segment_uuids = [segment.uuid for segment in masakari_client.segments()]
    # Repeat the uuid list once per hypervisor so that, as long as at least
    # one segment exists, there is a uuid available for every hypervisor.
    # Uuids are consumed from the tail of the pool.
    uuid_pool = segment_uuids * len(compute_nodes)
    for node in compute_nodes:
        chosen_segment = uuid_pool.pop()
        node_name = node.hypervisor_hostname
        logging.info(
            'Adding {} to segment {}'.format(node_name, chosen_segment))
        masakari_client.create_host(
            name=node_name,
            segment_id=chosen_segment,
            recovery_method='auto',
            control_attributes='SSH',
            type='COMPUTE')
|
|
|
|
|
|
# Maps a host assignment method name to the function implementing it.
HOST_ASSIGNMENT_METHODS = {ROUND_ROBIN: roundrobin_assign_hosts_to_segments}
|
|
|
|
|
|
def _is_connection_failure(exc):
    """Report whether an exception stems from a failed network connection.

    The exception itself and every exception linked to it through
    ``__cause__`` / ``__context__`` is inspected, so a connection error that
    a client library wrapped in its own exception type is still recognised.

    :param exc: Exception raised by the retried call
    :type exc: BaseException
    :returns: True if a urllib3 or builtin connection error is in the chain
    :rtype: bool
    """
    connection_errors = (urllib3.exceptions.HTTPError, ConnectionError)
    visited = set()
    while exc is not None and id(exc) not in visited:
        if isinstance(exc, connection_errors):
            return True
        # Remember what has been seen in case the chain loops back on itself.
        visited.add(id(exc))
        exc = exc.__cause__ or exc.__context__
    return False


@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=2, max=60),
    reraise=True, stop=tenacity.stop_after_attempt(10),
    # The predicate has to match *exceptions*. The previous value,
    # urllib3.connection.HTTPSConnection, is a connection class that is never
    # raised, so the retry could not trigger.
    retry=tenacity.retry_if_exception(_is_connection_failure))
def create_segments(segment_number=1, host_assignment_method=None):
    """Create masakari segments and populate them with hypervisors.

    Segments are named ``seg0`` .. ``seg<N-1>``. The whole function is
    retried, for up to 10 attempts with exponential back-off, when it fails
    with a connection error; any other exception is raised immediately.

    NOTE(review): a retry starts again from ``seg0``. If an earlier attempt
    already created some segments, masakari presumably rejects the
    duplicates - confirm.

    :param segment_number: Number of segments to create
    :type segment_number: int
    :param host_assignment_method: Name of the method used to assign
                                   hypervisors to segments; a key of
                                   HOST_ASSIGNMENT_METHODS. Defaults to
                                   ROUND_ROBIN.
    :type host_assignment_method: str
    """
    host_assignment_method = host_assignment_method or ROUND_ROBIN
    keystone_session = openstack_utils.get_overcloud_keystone_session()
    nova_client = openstack_utils.get_nova_session_client(keystone_session)
    masakari_client = openstack_utils.get_masakari_session_client(
        keystone_session)
    # Use a dedicated loop variable rather than shadowing the parameter.
    for segment_index in range(segment_number):
        segment_name = 'seg{}'.format(segment_index)
        logging.info('Creating segment {}'.format(segment_name))
        masakari_client.create_segment(
            name=segment_name,
            recovery_method='auto',
            service_type='COMPUTE')
    HOST_ASSIGNMENT_METHODS[host_assignment_method](
        nova_client,
        masakari_client)
|
|
|
|
|
|
@tenacity.retry(
    retry=tenacity.retry_if_exception_type(ostack_except.ConflictException),
    stop=tenacity.stop_after_attempt(10),
    wait=tenacity.wait_exponential(multiplier=2, max=60),
    reraise=True)
def enable_host(masakari_client, host, segment):
    """Take a masakari host out of maintenance mode.

    The update is retried, for up to 10 attempts with exponential back-off,
    while masakari answers with a conflict.

    :param masakari_client: Authenticated masakari client
    :type masakari_client: openstack.instance_ha.v1._proxy.Proxy
    :param host: Uuid of host to enable
    :type host: str
    :param segment: Uuid of segment host is associated with.
    :type segment: str
    """
    message = "Removing maintenance mode from masakari host {}".format(host)
    logging.info(message)
    masakari_client.update_host(
        host,
        segment_id=segment,
        on_maintenance=False)
|
|
|
|
|
|
def enable_hosts(masakari_client=None):
    """Take every masakari host out of maintenance mode.

    Walks all segments and enables each host that is flagged as being in
    maintenance. Only masakari is touched; the state of the hypervisor as
    seen by nova is not changed.

    :param masakari_client: Authenticated masakari client; one is created
                            from the overcloud keystone session if omitted
    :type masakari_client: openstack.instance_ha.v1._proxy.Proxy
    """
    if not masakari_client:
        masakari_client = openstack_utils.get_masakari_session_client(
            openstack_utils.get_overcloud_keystone_session())

    for segment in masakari_client.segments():
        for member in masakari_client.hosts(segment_id=segment.uuid):
            if not member.on_maintenance:
                continue
            enable_host(masakari_client, member.uuid, segment.uuid)
|
|
|
|
|
|
def _svc_control(unit_name, action, services, model_name):
    """Run a systemctl action against several services on one unit.

    Deliberately simplistic, which is why the helper is private: the
    systemctl invocations are joined with ';' and sent to the unit as a
    single command.

    :param unit_name: Juju name of unit (app/n)
    :type unit_name: str
    :param action: systemctl action to perform on unit (start/stop etc)
    :type action: str
    :param services: List of services to perform action against
    :type services: []
    :param model_name: Name of model unit_name resides in.
    :type model_name: str
    """
    logging.info('{} {} on {}'.format(action.title(), services, unit_name))
    command = ';'.join(
        "systemctl {} {}".format(action, service) for service in services)
    zaza.model.run_on_unit(
        unit_name,
        command=command,
        model_name=model_name)
|
|
|
|
|
|
def _svc_set_systemd_restart_mode(unit_name, service_name, mode, model_name):
    """Rewrite the ``Restart=`` setting of a systemd service on a unit.

    The service's unit file under /lib/systemd/system is edited in place
    with sed, then systemd is told to reload its configuration.

    :param unit_name: Juju name of unit (app/n)
    :type unit_name: str
    :param service_name: Name of systemd service to update
    :type service_name: str
    :param mode: Restart mode to switch to eg 'no', 'on-success',
                 'on-failure', 'on-abort' or 'always'
    :type mode: str
    :param model_name: Name of model unit_name resides in.
    :type model_name: str
    """
    logging.info(
        'Setting systemd restart mode for {} to {}'.format(
            service_name, mode))
    sed_command = (
        "sed -i -e 's/^Restart=.*/Restart={}/g' "
        "/lib/systemd/system/{}.service").format(mode, service_name)
    commands = [sed_command, 'systemctl daemon-reload']
    logging.info('Running {} on {}'.format(commands, unit_name))
    zaza.model.run_on_unit(
        unit_name,
        command=';'.join(commands),
        model_name=model_name)
|
|
|
|
|
|
def simulate_compute_host_failure(unit_name, model_name):
    """Simulate compute node failure from a masakari and nova POV.

    Masakari uses corosync/pacemaker to detect failure and nova checks
    nova-compute. Shutting down these services causes masakari and nova to
    mark them as down.

    :param unit_name: Juju name of unit (app/n)
    :type unit_name: str
    :param model_name: Name of model unit_name resides in.
    :type model_name: str
    """
    logging.info('Simulating failure of compute node {}'.format(unit_name))
    # Switch the restart mode to 'no' first; presumably this stops systemd
    # from bringing pacemaker_remote back once it is killed below.
    _svc_set_systemd_restart_mode(
        unit_name,
        'pacemaker_remote',
        'no',
        model_name)
    _svc_control(
        unit_name,
        'stop',
        ['corosync', 'nova-compute'],
        model_name)
    compute_app = unit_name.split('/')[0]
    release_pair = openstack_utils.get_current_os_release_pair(
        application=compute_app)
    # The binary is spelled with a hyphen from focal_ussuri onwards and with
    # an underscore on earlier releases.
    if (openstack_utils.get_os_release(release_pair=release_pair) >=
            openstack_utils.get_os_release('focal_ussuri')):
        pacemaker_proc = '/usr/sbin/pacemaker-remoted'
    else:
        pacemaker_proc = '/usr/sbin/pacemaker_remoted'
    # 'pkill -9' delivers SIGKILL, not SIGTERM; the log now says so.
    logging.info('Sending {} a SIGKILL'.format(pacemaker_proc))
    zaza.model.run_on_unit(
        unit_name,
        'pkill -9 -f {}'.format(pacemaker_proc),
        model_name=model_name)
|
|
|
|
|
|
def simulate_compute_host_recovery(unit_name, model_name):
    """Bring a compute node back from a simulated failure.

    Masakari relies on corosync/pacemaker and nova on nova-compute to judge
    node health, so these services must be running again before the node
    can be marked as recovered.

    :param unit_name: Juju name of unit (app/n)
    :type unit_name: str
    :param model_name: Name of model unit_name resides in.
    :type model_name: str
    """
    logging.info('Simulating recovery of compute node {}'.format(unit_name))
    # Put the pacemaker_remote restart mode back to 'on-failure' before
    # starting the services.
    _svc_set_systemd_restart_mode(
        unit_name=unit_name,
        service_name='pacemaker_remote',
        mode='on-failure',
        model_name=model_name)
    services_to_start = ['corosync', 'pacemaker_remote', 'nova-compute']
    _svc_control(
        unit_name=unit_name,
        action='start',
        services=services_to_start,
        model_name=model_name)
|
|
|
|
|
|
def simulate_guest_crash(guest_pid, compute_unit_name, model_name):
    """Simulate a guest crash by killing its process on the hypervisor.

    Runs ``kill -9`` against the given PID on the compute unit.

    :param guest_pid: PID of running qemu process for guest.
    :type guest_pid: str
    :param compute_unit_name: Juju name of hypervisor hosting guest (app/n)
    :type compute_unit_name: str
    :param model_name: Name of model unit_name resides in.
    :type model_name: str
    """
    kill_command = 'kill -9 {}'.format(guest_pid)
    zaza.model.run_on_unit(
        compute_unit_name,
        kill_command,
        model_name=model_name)
|