From 89ea43d0d51e6a245cbe20b92ee7228ed4a4df1c Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 15 Sep 2020 20:29:48 +0200 Subject: [PATCH 1/5] Log resource cleanup failure and carry on Fixes #417 --- zaza/openstack/charm_tests/test_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/zaza/openstack/charm_tests/test_utils.py b/zaza/openstack/charm_tests/test_utils.py index ee61336..250c183 100644 --- a/zaza/openstack/charm_tests/test_utils.py +++ b/zaza/openstack/charm_tests/test_utils.py @@ -495,6 +495,11 @@ class OpenStackBaseTest(BaseCharmTest): self.nova_client.servers, server.id, msg="server") + except AssertionError as e: + # Resource failed to be removed within the expected time frame, + # log this fact and carry on. + logging.warning('Gave up waiting for resource cleanup: "{}"' + .format(str(e))) except AttributeError: # Test did not define self.RESOURCE_PREFIX, ignore. pass From e9215f1d73186cc7246ad7f5c74941435a72e89f Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Wed, 16 Sep 2020 08:17:25 +0200 Subject: [PATCH 2/5] Retry load balancer resource creation Creating a load balancer consists of many independent API calls. If any one of them fails today the whole job will be killed. On failure to create a lb resource, clean up and retry. 
--- zaza/openstack/charm_tests/octavia/tests.py | 44 ++++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/zaza/openstack/charm_tests/octavia/tests.py b/zaza/openstack/charm_tests/octavia/tests.py index e84ac3e..1bdc5e4 100644 --- a/zaza/openstack/charm_tests/octavia/tests.py +++ b/zaza/openstack/charm_tests/octavia/tests.py @@ -18,6 +18,7 @@ import logging import subprocess import tenacity +from keystoneauth1 import exceptions as keystone_exceptions import osc_lib.exceptions import zaza.openstack.charm_tests.test_utils as test_utils @@ -80,8 +81,15 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): # List of floating IPs created by this test cls.fips = [] - def resource_cleanup(self): - """Remove resources created during test execution.""" + @tenacity.retry(stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential( + multiplier=1, min=2, max=10)) + def resource_cleanup(self, only_local=False): + """Remove resources created during test execution. 
+ + :param only_local: When set to true do not call parent method + :type only_local: bool + """ for lb in self.loadbalancers: self.octavia_client.load_balancer_delete(lb['id'], cascade=True) try: @@ -90,8 +98,16 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): provisioning_status='DELETED') except osc_lib.exceptions.NotFound: pass + # allow resource cleanup to be run multiple times + self.loadbalancers = [] for fip in self.fips: self.neutron_client.delete_floatingip(fip) + # allow resource cleanup to be run multiple times + self.fips = [] + + if only_local: + return + # we run the parent resource_cleanup last as it will remove instances # referenced as members in the above cleaned up load balancers super(LBAASv2Test, self).resource_cleanup() @@ -157,6 +173,7 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): 'provider': provider, }}) lb = result['loadbalancer'] + self.loadbalancers.append(lb) lb_id = lb['id'] logging.info('Awaiting loadbalancer to reach provisioning_status ' @@ -283,10 +300,25 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): for provider in self.get_lb_providers(self.octavia_client).keys(): logging.info('Creating loadbalancer with provider {}' .format(provider)) - lb = self._create_lb_resources(self.octavia_client, provider, - vip_subnet_id, subnet_id, - payload_ips) - self.loadbalancers.append(lb) + final_exc = None + # NOTE: we cannot use tenacity here as the method we call into + # already uses it to wait for operations to complete. 
+ for retry in range(0, 3): + try: + lb = self._create_lb_resources(self.octavia_client, + provider, + vip_subnet_id, + subnet_id, + payload_ips) + break + except (AssertionError, + keystone_exceptions.connection.ConnectFailure) as e: + logging.info('Retrying load balancer creation, last ' + 'failure: "{}"'.format(str(e))) + self.resource_cleanup(only_local=True) + final_exc = e + else: + raise final_exc lb_fp = openstack_utils.create_floating_ip( self.neutron_client, 'ext_net', port={'id': lb['vip_port_id']}) From 430704ef80c40f8a02ab0643e83449e35a6d70f0 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Wed, 16 Sep 2020 11:01:53 +0200 Subject: [PATCH 3/5] Retry retrofit action The retrofit process involves downloading packages from the internet and is as such susceptible to random failures due to internet gremlins. --- .../octavia/diskimage_retrofit/setup.py | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/zaza/openstack/charm_tests/octavia/diskimage_retrofit/setup.py b/zaza/openstack/charm_tests/octavia/diskimage_retrofit/setup.py index 7fbca5e..73d1aad 100644 --- a/zaza/openstack/charm_tests/octavia/diskimage_retrofit/setup.py +++ b/zaza/openstack/charm_tests/octavia/diskimage_retrofit/setup.py @@ -15,6 +15,7 @@ """Code for configuring octavia-diskimage-retrofit.""" import logging +import tenacity import zaza.model @@ -39,12 +40,20 @@ def retrofit_amphora_image(unit='octavia-diskimage-retrofit/0', if image_id: params.update({'source-image': image_id}) - # NOTE(fnordahl) ``zaza.model.run_action_on_leader`` fails here, - # apparently has to do with handling of subordinates in ``libjuju`` or - # ``juju`` itself. - action = zaza.model.run_action( - unit, - 'retrofit-image', - action_params=params, - raise_on_failure=True) + # NOTE(fnordahl) the retrofit process involves downloading packages from + # the internet and is as such susceptible to random failures due to + # internet gremlins. 
+ for attempt in tenacity.Retrying( + stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential( + multiplier=1, min=2, max=10)): + with attempt: + # NOTE(fnordahl) ``zaza.model.run_action_on_leader`` fails here, + # apparently has to do with handling of subordinates in ``libjuju`` + # or ``juju`` itself. + action = zaza.model.run_action( + unit, + 'retrofit-image', + action_params=params, + raise_on_failure=True) return action From ced5b3dd13f84b23371fde76e309e6af4a56f048 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Thu, 17 Sep 2020 14:31:57 +0200 Subject: [PATCH 4/5] Retry scp/ssh operations in `add_interface_to_netplan` helper Fixes #420 --- zaza/openstack/utilities/openstack.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/zaza/openstack/utilities/openstack.py b/zaza/openstack/utilities/openstack.py index 0c31c05..1e8f625 100644 --- a/zaza/openstack/utilities/openstack.py +++ b/zaza/openstack/utilities/openstack.py @@ -690,14 +690,20 @@ def add_interface_to_netplan(server_name, mac_address): "{}\nserver_name: {}".format(body_value, unit_name, interface, mac_address, server_name)) - with tempfile.NamedTemporaryFile(mode="w") as netplan_file: - netplan_file.write(body_value) - netplan_file.flush() - model.scp_to_unit(unit_name, netplan_file.name, - '/home/ubuntu/60-dataport.yaml', user="ubuntu") - run_cmd_mv = "sudo mv /home/ubuntu/60-dataport.yaml /etc/netplan/" - model.run_on_unit(unit_name, run_cmd_mv) - model.run_on_unit(unit_name, "sudo netplan apply") + for attempt in tenacity.Retrying( + stop=tenacity.stop_after_attempt(3), + wait=tenacity.wait_exponential( + multiplier=1, min=2, max=10)): + with attempt: + with tempfile.NamedTemporaryFile(mode="w") as netplan_file: + netplan_file.write(body_value) + netplan_file.flush() + model.scp_to_unit( + unit_name, netplan_file.name, + '/home/ubuntu/60-dataport.yaml', user="ubuntu") + run_cmd_mv = "sudo mv /home/ubuntu/60-dataport.yaml /etc/netplan/" + 
model.run_on_unit(unit_name, run_cmd_mv) + model.run_on_unit(unit_name, "sudo netplan apply") def configure_gateway_ext_port(novaclient, neutronclient, net_id=None, From be59370476eaeb8ca0d6e1adca58c9acfa8faa77 Mon Sep 17 00:00:00 2001 From: Frode Nordahl Date: Tue, 22 Sep 2020 14:50:09 +0200 Subject: [PATCH 5/5] Forcefully remove amphorae when Octavia is unable to remove --- zaza/openstack/charm_tests/octavia/tests.py | 42 ++++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/zaza/openstack/charm_tests/octavia/tests.py b/zaza/openstack/charm_tests/octavia/tests.py index 1bdc5e4..eb0b217 100644 --- a/zaza/openstack/charm_tests/octavia/tests.py +++ b/zaza/openstack/charm_tests/octavia/tests.py @@ -19,6 +19,7 @@ import subprocess import tenacity from keystoneauth1 import exceptions as keystone_exceptions +import octaviaclient.api.v2.octavia import osc_lib.exceptions import zaza.openstack.charm_tests.test_utils as test_utils @@ -81,6 +82,27 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): # List of floating IPs created by this test cls.fips = [] + def _remove_amphorae_instances(self): + """Remove amphorae instances forcefully. + + In some situations Octavia is unable to remove load balancer resources. + This helper can be used to remove the underlying instances. 
+ """ + result = self.octavia_client.amphora_list() + for amphora in result.get('amphorae', []): + for server in self.nova_client.servers.list(): + if 'compute_id' in amphora and server.id == amphora[ + 'compute_id']: + try: + openstack_utils.delete_resource( + self.nova_client.servers, + server.id, + msg="server") + except AssertionError as e: + logging.warning( + 'Gave up waiting for resource cleanup: "{}"' + .format(str(e))) + @tenacity.retry(stop=tenacity.stop_after_attempt(3), wait=tenacity.wait_exponential( multiplier=1, min=2, max=10)) @@ -91,13 +113,21 @@ class LBAASv2Test(test_utils.OpenStackBaseTest): :type only_local: bool """ for lb in self.loadbalancers: - self.octavia_client.load_balancer_delete(lb['id'], cascade=True) try: - self.wait_for_lb_resource( - self.octavia_client.load_balancer_show, lb['id'], - provisioning_status='DELETED') - except osc_lib.exceptions.NotFound: - pass + self.octavia_client.load_balancer_delete( + lb['id'], cascade=True) + except octaviaclient.api.v2.octavia.OctaviaClientException as e: + logging.info('Octavia is unable to delete load balancer: "{}"' + .format(e)) + logging.info('Attempting to forcefully remove amphorae') + self._remove_amphorae_instances() + else: + try: + self.wait_for_lb_resource( + self.octavia_client.load_balancer_show, lb['id'], + provisioning_status='DELETED') + except osc_lib.exceptions.NotFound: + pass # allow resource cleanup to be run multiple times self.loadbalancers = [] for fip in self.fips: