Merge pull request #719 from AurelienLourot/nvidia-vgpu

Add NovaComputeNvidiaVgpuTest
2022-03-22 10:00:10 +00:00
parent e9f32f0f85 7338af093e
commit 57f420629e
4 changed files with 310 additions and 45 deletions
--- a/zaza/openstack/charm_tests/nova/tests.py
+++ b/zaza/openstack/charm_tests/nova/tests.py
@@ -18,15 +18,22 @@

 import json
 import logging
+import os
+import tempfile
 import unittest
+import urllib
 from configparser import ConfigParser
 from time import sleep

+import novaclient.exceptions
+
 import zaza.model
 import zaza.openstack.charm_tests.glance.setup as glance_setup
 import zaza.openstack.charm_tests.test_utils as test_utils
 import zaza.openstack.configure.guest
+import zaza.openstack.utilities.generic as generic_utils
 import zaza.openstack.utilities.openstack as openstack_utils
+from zaza.utilities import juju as juju_utils


 class BaseGuestCreateTest(unittest.TestCase):
@@ -434,6 +441,263 @@ class NovaComputeActionTest(test_utils.OpenStackBaseTest):
                    "The action failed: {}".format(action.data["message"]))


+class NovaComputeNvidiaVgpuTest(test_utils.OpenStackBaseTest):
+    """Check what nova-compute-nvidia-vgpu writes into nova.conf.
+
+    Only configuration files are inspected, so these tests should also pass
+    on a deployment that has no GPU hardware.
+    """
+
+    def test_vgpu_in_nova_conf(self):
+        """Verify that nova.conf mentions a vGPU type option on every unit.
+
+        Precondition: the vgpu-device-mappings config option of
+        nova-compute-nvidia-vgpu holds a non-empty value such as
+        "{'nvidia-108': ['0000:c1:00.0']}".
+        """
+        conf_path = '/etc/nova/nova.conf'
+        # Either option name is accepted. See
+        # https://docs.openstack.org/nova/queens/admin/virtual-gpu.html
+        # https://docs.openstack.org/nova/ussuri/admin/virtual-gpu.html
+        # https://docs.openstack.org/releasenotes/nova/xena.html#deprecation-notes
+        accepted_options = ('enabled_vgpu_types', 'enabled_mdev_types')
+
+        compute_units = zaza.model.get_units(
+            'nova-compute', model_name=self.model_name)
+        for compute_unit in compute_units:
+            raw_contents = generic_utils.get_file_contents(
+                compute_unit, conf_path)
+            conf_text = str(raw_contents)
+            self.assertTrue(
+                any(option in conf_text for option in accepted_options))
+
+
+class NovaComputeNvidiaVgpuWithHardwareTest(test_utils.OpenStackBaseTest):
+    """Run nova-compute-nvidia-vgpu specific tests.
+
+    These tests require real GPU hardware.
+    """
+
+    def setUp(self):
+        """Declare variables that will be used both in tests and tearDown."""
+        self.RESOURCE_PREFIX = 'zaza-nova'
+        self.keystone_client = openstack_utils.get_keystone_session_client(
+            self.keystone_session)
+        self.trait_name = 'CUSTOM_ZAZA_VGPU'
+        self.flavor_id = 42
+
+    def tearDown(self):
+        """Cleanup all created resources.
+
+        Every cleanup step runs even if an earlier one raised, so a failure
+        while removing the guests does not leave the flavor or the trait
+        behind.
+        """
+        try:
+            self.resource_cleanup()  # cleans up the created guests
+        finally:
+            try:
+                self._cleanup_vgpu_flavor()
+            finally:
+                self._cleanup_vgpu_trait()
+
+    def test_guest_using_vgpu(self):
+        """Test the creation of a guest with a vGPU.
+
+        This test assumes that nova-compute-nvidia-vgpu's config option
+        vgpu-device-mappings has been set to something not empty like
+        "{'nvidia-108': ['0000:c1:00.0']}".
+
+        This test requires OpenStack Stein or newer.
+
+        This test performs the following steps:
+        1.  Download the proprietary NVIDIA software.
+        2.  Attach it to the nova-compute-nvidia-vgpu charm as a resource.
+        3.  Reboot the compute nodes.
+        4.  List the available vGPU types.
+        5.  Select a vGPU type via juju config option on the charm.
+        6.  Check the amount of used vGPUs.
+        7.  Create a vGPU trait.
+        8.  Create a flavor with this trait.
+        9.  Create a guest with this flavor.
+        10. Check the amount of used vGPUs.
+        """
+        package_local_path = self._download_nvidia_package()
+
+        self._attach_nvidia_package_as_resource(package_local_path)
+        self._reboot_vgpu_units()
+
+        wanted_vgpu_type = 'nvidia-108'
+        wanted_gpu_address = '0000:c1:00.0'
+        self._assert_vgpu_type_available(wanted_vgpu_type, wanted_gpu_address)
+
+        logging.info('Selecting vGPU type {} on GPU {} ...'.format(
+            wanted_vgpu_type, wanted_gpu_address))
+        alternate_config = {
+            "vgpu-device-mappings": ("{'" + wanted_vgpu_type + "': ['" +
+                                     wanted_gpu_address + "']}")
+        }
+        with self.config_change({}, alternate_config, self.application_name,
+                                reset_to_charm_default=True):
+            self._install_openstack_cli_on_vgpu_units()
+
+            resource_provider_id = self._get_vgpu_resource_provider_id(
+                wanted_gpu_address)
+            num_vgpu_used_before = self._get_num_vgpu_used(
+                resource_provider_id)
+
+            self._create_vgpu_trait(resource_provider_id)
+            flavor_name = 'm1.small.vgpu'
+            self._create_vgpu_flavor(flavor_name)
+            self._assign_vgpu_trait_to_flavor(flavor_name)
+
+            self.launch_guest(
+                'vgpu', instance_key=glance_setup.LTS_IMAGE_NAME,
+                flavor_name=flavor_name)
+
+            # Launching one guest with the vGPU flavor must consume exactly
+            # one vGPU on the selected resource provider.
+            num_vgpu_used_after = self._get_num_vgpu_used(resource_provider_id)
+            self.assertEqual(num_vgpu_used_after, num_vgpu_used_before + 1)
+
+    def _download_nvidia_package(self):
+        """Download the NVIDIA package unless a local copy already exists.
+
+        The URL is read from the TEST_NVIDIA_VGPU_HOST_SW environment
+        variable and the file is stored in the system temporary directory
+        under the base name of the URL path.
+
+        :returns: local path of the package
+        :rtype: str
+        :raises KeyError: if TEST_NVIDIA_VGPU_HOST_SW is not set
+        """
+        # A bare 'import urllib' does not guarantee that the 'parse'
+        # submodule is loaded, so import the function explicitly.
+        from urllib.parse import urlparse
+
+        package_cache_dir = tempfile.gettempdir()
+        package_url = os.environ['TEST_NVIDIA_VGPU_HOST_SW']
+        package_name = os.path.basename(urlparse(package_url).path)
+        package_local_path = os.path.join(package_cache_dir, package_name)
+        if not os.path.exists(package_local_path):
+            logging.info('Downloading {} to {} ...'.format(
+                package_url, package_local_path))
+            openstack_utils.download_image(package_url, package_local_path)
+        else:
+            logging.info(
+                'Cached package found at {} - Skipping download'.format(
+                    package_local_path))
+        return package_local_path
+
+    def _get_vgpu_unit_names(self):
+        """Return the names of the units of the application under test.
+
+        Fails the test if the application has no unit.
+
+        :returns: unit names
+        :rtype: List[str]
+        """
+        vgpu_unit_names = [unit.name for unit in
+                           zaza.model.get_units(self.application_name)]
+        self.assertGreater(len(vgpu_unit_names), 0, 'No vGPU unit found')
+        return vgpu_unit_names
+
+    def _attach_nvidia_package_as_resource(self, package_local_path):
+        """Attach the package as a charm resource and wait for the install.
+
+        :param package_local_path: local path of the NVIDIA package
+        :type package_local_path: str
+        """
+        logging.info('Attaching {} as a resource...'.format(
+            package_local_path))
+        zaza.model.attach_resource(self.application_name,
+                                   'nvidia-vgpu-software',
+                                   package_local_path)
+        for vgpu_unit_name in self._get_vgpu_unit_names():
+            zaza.model.block_until_unit_wl_message_match(
+                vgpu_unit_name, '.*installed NVIDIA software.*')
+        zaza.model.block_until_all_units_idle()
+
+    def _reboot_vgpu_units(self):
+        """Reboot every unit and wait until all of them are active again."""
+        vgpu_unit_names = self._get_vgpu_unit_names()
+        for vgpu_unit_name in vgpu_unit_names:
+            logging.info('Rebooting {} ...'.format(vgpu_unit_name))
+            generic_utils.reboot(vgpu_unit_name)
+            # Wait for the "unknown" status first so that the wait for
+            # "active" below cannot be satisfied by the pre-reboot state.
+            zaza.model.block_until_unit_wl_status(vgpu_unit_name, "unknown")
+        for vgpu_unit_name in vgpu_unit_names:
+            zaza.model.block_until_unit_wl_status(vgpu_unit_name, "active")
+        zaza.model.block_until_all_units_idle()
+
+    def _assert_vgpu_type_available(self, wanted_vgpu_type,
+                                    wanted_gpu_address):
+        """Assert that list-vgpu-types reports the type on the given GPU.
+
+        :param wanted_vgpu_type: vGPU type, e.g. 'nvidia-108'
+        :type wanted_vgpu_type: str
+        :param wanted_gpu_address: PCI address, e.g. '0000:c1:00.0'
+        :type wanted_gpu_address: str
+        """
+        logging.info(
+            'Checking that the vGPU type {} is available on GPU {} ...'.format(
+                wanted_vgpu_type, wanted_gpu_address))
+        available_vgpu_types = zaza.model.run_action_on_leader(
+            self.application_name, 'list-vgpu-types',
+            raise_on_failure=True).results['output']
+        self.assertIn('{}, {}'.format(wanted_vgpu_type, wanted_gpu_address),
+                      available_vgpu_types)
+
+    def _install_openstack_cli_on_vgpu_units(self):
+        """Install the openstackclients snap on every unit."""
+        command = 'snap install openstackclients'
+        for vgpu_unit_name in self._get_vgpu_unit_names():
+            juju_utils.remote_run(vgpu_unit_name, remote_cmd=command,
+                                  timeout=180, fatal=True)
+
+    def _get_vgpu_resource_provider_id(self, wanted_gpu_address):
+        """Return the UUID of the resource provider of the given GPU.
+
+        The provider is found by looking for 'pci_<address>' in the listing,
+        with ':' and '.' of the address replaced by '_'. If several lines
+        match, the last one wins. Fails the test if none matches.
+
+        :param wanted_gpu_address: PCI address, e.g. '0000:c1:00.0'
+        :type wanted_gpu_address: str
+        :returns: resource provider UUID
+        :rtype: str
+        """
+        logging.info('Querying resource providers...')
+        command = (
+            'openstack {} resource provider list -f value -c uuid -c name')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client))
+        resource_providers = juju_utils.remote_run(
+            self._get_vgpu_unit_names()[0], remote_cmd=command, timeout=180,
+            fatal=True).strip().split('\n')
+
+        # At this point resource_providers should look like
+        # ['0e1379b8-7bd1-40e6-9f41-93cb5b95e38b node-sparky.maas',
+        #  '1bb845a4-cf21-44c2-896e-e877760ad39b \
+        #   node-sparky.maas_pci_0000_c1_00_0']
+        resource_provider_id = None
+        wanted_resource_provider_substring = 'pci_{}'.format(
+            wanted_gpu_address.replace(':', '_').replace('.', '_'))
+        for resource_provider in resource_providers:
+            if wanted_resource_provider_substring in resource_provider:
+                resource_provider_id = resource_provider.split()[0]
+        self.assertIsNotNone(
+            resource_provider_id,
+            'No resource provider matching {} found in {}'.format(
+                wanted_resource_provider_substring, resource_providers))
+        return resource_provider_id
+
+    def _get_num_vgpu_used(self, resource_provider_id):
+        """Return the 'used' value of the resource provider's inventory.
+
+        :param resource_provider_id: resource provider UUID
+        :type resource_provider_id: str
+        :returns: amount of used vGPUs
+        :rtype: int
+        """
+        logging.info('Querying resource provider inventory...')
+        command = (
+            'openstack {} resource provider inventory list {} '
+            '-f value -c used')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client), resource_provider_id)
+        num_vgpu_used = juju_utils.remote_run(
+            self._get_vgpu_unit_names()[0], remote_cmd=command, timeout=180,
+            fatal=True).strip()
+        # NOTE(review): assumes the inventory has a single row, otherwise
+        # int() raises ValueError on the multi-line output - TODO confirm.
+        return int(num_vgpu_used)
+
+    def _create_vgpu_trait(self, resource_provider_id):
+        """Create the test trait and set it on the resource provider.
+
+        :param resource_provider_id: resource provider UUID
+        :type resource_provider_id: str
+        """
+        logging.info('Creating trait {}...'.format(self.trait_name))
+        command = (
+            'openstack {} --os-placement-api-version 1.6 trait create {}')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client), self.trait_name)
+        first_unit_name = self._get_vgpu_unit_names()[0]
+        juju_utils.remote_run(first_unit_name, remote_cmd=command, timeout=180,
+                              fatal=True)
+        command = (
+            'openstack {} --os-placement-api-version 1.6 resource provider '
+            'trait set --trait {} {}')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client), self.trait_name, resource_provider_id)
+        juju_utils.remote_run(first_unit_name, remote_cmd=command, timeout=180,
+                              fatal=True)
+
+    def _cleanup_vgpu_trait(self):
+        """Delete the test trait, tolerating a failing delete command."""
+        logging.info('Cleaning up trait {}...'.format(self.trait_name))
+        command = (
+            'openstack {} --os-placement-api-version 1.6 trait delete {}')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client), self.trait_name)
+        # fatal=False: the trait may not exist if the test failed early.
+        juju_utils.remote_run(
+            self._get_vgpu_unit_names()[0], remote_cmd=command, timeout=180,
+            fatal=False)
+
+    def _create_vgpu_flavor(self, flavor_name):
+        """Create a flavor with the ID declared in setUp.
+
+        :param flavor_name: name of the flavor to create
+        :type flavor_name: str
+        """
+        logging.info('Creating flavor {}...'.format(flavor_name))
+        nova_client = openstack_utils.get_nova_session_client(
+            self.keystone_session)
+        nova_client.flavors.create(name=flavor_name, ram=2048, vcpus=1,
+                                   disk=20, flavorid=self.flavor_id)
+
+    def _cleanup_vgpu_flavor(self):
+        """Delete the flavor with the ID declared in setUp, if it exists."""
+        logging.info('Cleaning up created flavor...')
+        nova_client = openstack_utils.get_nova_session_client(
+            self.keystone_session)
+        try:
+            flavor = nova_client.flavors.get(self.flavor_id)
+        except novaclient.exceptions.NotFound:
+            # Nothing to clean up: the flavor was never created.
+            return
+        nova_client.flavors.delete(flavor)
+
+    def _assign_vgpu_trait_to_flavor(self, flavor_name):
+        """Make the flavor request one VGPU and require the test trait.
+
+        :param flavor_name: name of the flavor to update
+        :type flavor_name: str
+        """
+        logging.info('Assigning trait {} to flavor {} ...'.format(
+            self.trait_name, flavor_name))
+        command = (
+            'openstack {} flavor set {} --property resources:VGPU=1 '
+            '--property trait:{}=required')
+        command = command.format(openstack_utils.get_cli_auth_args(
+            self.keystone_client), flavor_name, self.trait_name)
+        juju_utils.remote_run(
+            self._get_vgpu_unit_names()[0], remote_cmd=command, timeout=180,
+            fatal=True)
+
+
 class NovaCloudControllerActionTest(test_utils.OpenStackBaseTest):
    """Run nova-cloud-controller specific tests.

--- a/zaza/openstack/charm_tests/test_utils.py
+++ b/zaza/openstack/charm_tests/test_utils.py
@@ -634,7 +634,7 @@ class OpenStackBaseTest(BaseCharmTest):
            pass

    def launch_guest(self, guest_name, userdata=None, use_boot_volume=False,
-                     instance_key=None):
+                     instance_key=None, flavor_name=None):
        """Launch one guest to use in tests.

        Note that it is up to the caller to have set the RESOURCE_PREFIX class
@@ -678,7 +678,8 @@ class OpenStackBaseTest(BaseCharmTest):
                    instance_key,
                    vm_name=instance_name,
                    use_boot_volume=use_boot_volume,
-                    userdata=userdata)
+                    userdata=userdata,
+                    flavor_name=flavor_name)

    def launch_guests(self, userdata=None):
        """Launch two guests to use in tests.
--- a/zaza/openstack/charm_tests/trilio/tests.py
+++ b/zaza/openstack/charm_tests/trilio/tests.py
@@ -158,49 +158,7 @@ class WorkloadmgrCLIHelper(object):
        self.trilio_wlm_unit = zaza_model.get_first_unit_name(
            "trilio-wlm"
        )
-        self.auth_args = self._auth_arguments(keystone_client)
-
-    @classmethod
-    def _auth_arguments(cls, keystone_client):
-        """Generate workloadmgrcli arguments for cloud authentication.
-
-        :returns: string of required cli arguments for authentication
-        :rtype: str
-        """
-        overcloud_auth = openstack_utils.get_overcloud_auth()
-        overcloud_auth.update(
-            {
-                "OS_DOMAIN_ID": openstack_utils.get_domain_id(
-                    keystone_client, domain_name="admin_domain"
-                ),
-                "OS_TENANT_ID": openstack_utils.get_project_id(
-                    keystone_client,
-                    project_name="admin",
-                    domain_name="admin_domain",
-                ),
-                "OS_TENANT_NAME": "admin",
-            }
-        )
-
-        _required_keys = [
-            "OS_AUTH_URL",
-            "OS_USERNAME",
-            "OS_PASSWORD",
-            "OS_REGION_NAME",
-            "OS_DOMAIN_ID",
-            "OS_TENANT_ID",
-            "OS_TENANT_NAME",
-        ]
-
-        params = []
-        for os_key in _required_keys:
-            params.append(
-                "--{}={}".format(
-                    os_key.lower().replace("_", "-"),
-                    overcloud_auth[os_key],
-                )
-            )
-        return " ".join(params)
+        self.auth_args = openstack_utils.get_cli_auth_args(keystone_client)

    def create_workload(self, instance_id):
        """Create a new workload.
--- a/zaza/openstack/utilities/openstack.py
+++ b/zaza/openstack/utilities/openstack.py
@@ -3131,3 +3131,45 @@ def get_keystone_session_from_relation(client_app,
        creds['OS_PROJECT_DOMAIN_NAME'] = relation['service_domain']

    return get_keystone_session(creds, scope=scope, verify=verify)
+
+
+def get_cli_auth_args(keystone_client):
+    """Generate openstack CLI arguments for cloud authentication.
+
+    The overcloud credentials are completed with the ID of the
+    "admin_domain" domain and with the ID and name of its "admin" project,
+    then rendered as ``--os-xxx=value`` options. Every value is shell-quoted
+    because the result is meant to be embedded in a command line.
+
+    :param keystone_client: Keystone client used to look up the domain and
+                            project IDs
+    :returns: string of required cli arguments for authentication
+    :rtype: str
+    """
+    # Imported locally so that this helper is self-contained.
+    import shlex
+
+    overcloud_auth = get_overcloud_auth()
+    overcloud_auth.update(
+        {
+            "OS_DOMAIN_ID": get_domain_id(
+                keystone_client, domain_name="admin_domain"
+            ),
+            "OS_TENANT_ID": get_project_id(
+                keystone_client,
+                project_name="admin",
+                domain_name="admin_domain",
+            ),
+            "OS_TENANT_NAME": "admin",
+        }
+    )
+
+    _required_keys = [
+        "OS_AUTH_URL",
+        "OS_USERNAME",
+        "OS_PASSWORD",
+        "OS_REGION_NAME",
+        "OS_DOMAIN_ID",
+        "OS_TENANT_ID",
+        "OS_TENANT_NAME",
+    ]
+
+    # OS_AUTH_URL becomes --os-auth-url=<value>, and so on. shlex.quote()
+    # leaves values made only of shell-safe characters unchanged and
+    # protects the others (e.g. a password containing '$' or a space).
+    return " ".join(
+        "--{}={}".format(
+            os_key.lower().replace("_", "-"),
+            shlex.quote(str(overcloud_auth[os_key])),
+        )
+        for os_key in _required_keys
+    )