Merge pull request #418 from openstack-charmers/issue/417

Log resource cleanup failures and carry on; retry LB resource creation; various other retry fixes
2020-09-24 10:11:23 +02:00
parent c6a75b6d14 be59370476
commit 9406ea0380
4 changed files with 110 additions and 28 deletions
@@ -15,6 +15,7 @@
 """Code for configuring octavia-diskimage-retrofit."""

 import logging
+import tenacity

 import zaza.model

@@ -39,12 +40,20 @@ def retrofit_amphora_image(unit='octavia-diskimage-retrofit/0',
    if image_id:
        params.update({'source-image': image_id})

-    # NOTE(fnordahl) ``zaza.model.run_action_on_leader`` fails here,
-    # apparently has to do with handling of subordinates in ``libjuju`` or
-    # ``juju`` itself.
-    action = zaza.model.run_action(
-        unit,
-        'retrofit-image',
-        action_params=params,
-        raise_on_failure=True)
+    # NOTE(fnordahl) the retrofit process involves downloading packages from
+    # the internet and is as such susceptible to random failures due to
+    # internet gremlins.
+    for attempt in tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(3),
+            wait=tenacity.wait_exponential(
+            multiplier=1, min=2, max=10)):
+        with attempt:
+            # NOTE(fnordahl) ``zaza.model.run_action_on_leader`` fails here,
+            # apparently has to do with handling of subordinates in ``libjuju``
+            # or ``juju`` itself.
+            action = zaza.model.run_action(
+                unit,
+                'retrofit-image',
+                action_params=params,
+                raise_on_failure=True)
    return action
@@ -18,6 +18,8 @@ import logging
 import subprocess
 import tenacity

+from keystoneauth1 import exceptions as keystone_exceptions
+import octaviaclient.api.v2.octavia
 import osc_lib.exceptions

 import zaza.openstack.charm_tests.test_utils as test_utils
@@ -80,18 +82,62 @@ class LBAASv2Test(test_utils.OpenStackBaseTest):
        # List of floating IPs created by this test
        cls.fips = []

-    def resource_cleanup(self):
-        """Remove resources created during test execution."""
+    def _remove_amphorae_instances(self):
+        """Remove amphorae instances forcefully.
+
+        In some situations Octavia is unable to remove load balancer resources.
+        This helper can be used to remove the underlying instances.
+        """
+        result = self.octavia_client.amphora_list()
+        for amphora in result.get('amphorae', []):
+            for server in self.nova_client.servers.list():
+                if 'compute_id' in amphora and server.id == amphora[
+                        'compute_id']:
+                    try:
+                        openstack_utils.delete_resource(
+                            self.nova_client.servers,
+                            server.id,
+                            msg="server")
+                    except AssertionError as e:
+                        logging.warning(
+                            'Gave up waiting for resource cleanup: "{}"'
+                            .format(str(e)))
+
+    @tenacity.retry(stop=tenacity.stop_after_attempt(3),
+                    wait=tenacity.wait_exponential(
+                        multiplier=1, min=2, max=10))
+    def resource_cleanup(self, only_local=False):
+        """Remove resources created during test execution.
+
+        :param only_local: When set to true do not call parent method
+        :type only_local: bool
+        """
        for lb in self.loadbalancers:
-            self.octavia_client.load_balancer_delete(lb['id'], cascade=True)
            try:
-                self.wait_for_lb_resource(
-                    self.octavia_client.load_balancer_show, lb['id'],
-                    provisioning_status='DELETED')
-            except osc_lib.exceptions.NotFound:
-                pass
+                self.octavia_client.load_balancer_delete(
+                    lb['id'], cascade=True)
+            except octaviaclient.api.v2.octavia.OctaviaClientException as e:
+                logging.info('Octavia is unable to delete load balancer: "{}"'
+                             .format(e))
+                logging.info('Attempting to forcefully remove amphorae')
+                self._remove_amphorae_instances()
+            else:
+                try:
+                    self.wait_for_lb_resource(
+                        self.octavia_client.load_balancer_show, lb['id'],
+                        provisioning_status='DELETED')
+                except osc_lib.exceptions.NotFound:
+                    pass
+        # allow resource cleanup to be run multiple times
+        self.loadbalancers = []
        for fip in self.fips:
            self.neutron_client.delete_floatingip(fip)
+        # allow resource cleanup to be run multiple times
+        self.fips = []
+
+        if only_local:
+            return
+
        # we run the parent resource_cleanup last as it will remove instances
        # referenced as members in the above cleaned up load balancers
        super(LBAASv2Test, self).resource_cleanup()
@@ -157,6 +203,7 @@ class LBAASv2Test(test_utils.OpenStackBaseTest):
                    'provider': provider,
                }})
        lb = result['loadbalancer']
+        self.loadbalancers.append(lb)
        lb_id = lb['id']

        logging.info('Awaiting loadbalancer to reach provisioning_status '
@@ -283,10 +330,25 @@ class LBAASv2Test(test_utils.OpenStackBaseTest):
        for provider in self.get_lb_providers(self.octavia_client).keys():
            logging.info('Creating loadbalancer with provider {}'
                         .format(provider))
-            lb = self._create_lb_resources(self.octavia_client, provider,
-                                           vip_subnet_id, subnet_id,
-                                           payload_ips)
-            self.loadbalancers.append(lb)
+            final_exc = None
+            # NOTE: we cannot use tenacity here as the method we call into
+            # already uses it to wait for operations to complete.
+            for retry in range(0, 3):
+                try:
+                    lb = self._create_lb_resources(self.octavia_client,
+                                                   provider,
+                                                   vip_subnet_id,
+                                                   subnet_id,
+                                                   payload_ips)
+                    break
+                except (AssertionError,
+                        keystone_exceptions.connection.ConnectFailure) as e:
+                    logging.info('Retrying load balancer creation, last '
+                                 'failure: "{}"'.format(str(e)))
+                    self.resource_cleanup(only_local=True)
+                    final_exc = e
+            else:
+                raise final_exc

            lb_fp = openstack_utils.create_floating_ip(
                self.neutron_client, 'ext_net', port={'id': lb['vip_port_id']})
@@ -495,6 +495,11 @@ class OpenStackBaseTest(BaseCharmTest):
                        self.nova_client.servers,
                        server.id,
                        msg="server")
+        except AssertionError as e:
+            # Resource failed to be removed within the expected time frame,
+            # log this fact and carry on.
+            logging.warning('Gave up waiting for resource cleanup: "{}"'
+                            .format(str(e)))
        except AttributeError:
            # Test did not define self.RESOURCE_PREFIX, ignore.
            pass
@@ -690,14 +690,20 @@ def add_interface_to_netplan(server_name, mac_address):
                  "{}\nserver_name: {}".format(body_value, unit_name,
                                               interface, mac_address,
                                               server_name))
-    with tempfile.NamedTemporaryFile(mode="w") as netplan_file:
-        netplan_file.write(body_value)
-        netplan_file.flush()
-        model.scp_to_unit(unit_name, netplan_file.name,
-                          '/home/ubuntu/60-dataport.yaml', user="ubuntu")
-    run_cmd_mv = "sudo mv /home/ubuntu/60-dataport.yaml /etc/netplan/"
-    model.run_on_unit(unit_name, run_cmd_mv)
-    model.run_on_unit(unit_name, "sudo netplan apply")
+    for attempt in tenacity.Retrying(
+            stop=tenacity.stop_after_attempt(3),
+            wait=tenacity.wait_exponential(
+            multiplier=1, min=2, max=10)):
+        with attempt:
+            with tempfile.NamedTemporaryFile(mode="w") as netplan_file:
+                netplan_file.write(body_value)
+                netplan_file.flush()
+                model.scp_to_unit(
+                    unit_name, netplan_file.name,
+                    '/home/ubuntu/60-dataport.yaml', user="ubuntu")
+            run_cmd_mv = "sudo mv /home/ubuntu/60-dataport.yaml /etc/netplan/"
+            model.run_on_unit(unit_name, run_cmd_mv)
+            model.run_on_unit(unit_name, "sudo netplan apply")


 def configure_gateway_ext_port(novaclient, neutronclient, net_id=None,