From 9265ef98f4b5ae59466de4476422835c9420ae14 Mon Sep 17 00:00:00 2001 From: cxhong Date: Thu, 18 Jun 2020 15:12:01 -0400 Subject: [PATCH] Modified readme and remove cuda_power9_setup --- .../xcat/samples/cuda11/cuda11_setup.readme | 19 ++-- .../xcat/samples/cuda11/cuda_power9_setup | 96 ------------------- 2 files changed, 13 insertions(+), 102 deletions(-) delete mode 100755 xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme index 5e888811b..d6bb77371 100644 --- a/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme +++ b/xCAT-server/share/xcat/samples/cuda11/cuda11_setup.readme @@ -1,12 +1,15 @@ cuda setup scripts ================== -This section documented NVIDIA CUDA Toolkit v11 installation on the power9 rhels8.1 system. +This sample documents installation of the NVIDIA CUDA Toolkit v11 on IBM POWER9 servers as part of xCAT diskful provisioning of Red Hat Enterprise Linux 8.1. +For ``CUDA11``, there is a known issue that prevents successful installation of the nvidia-drivers module as part of the operating system kickstart install process used by diskful provisioning. +Diskless provisioning can still be performed using the traditional osimage method; these instructions apply to diskful provisioning only. + Diskful images -------------- -The following ``cudafull`` osimage definitions will be created from the base ``rhels8.1-ppc64le-install-compute`` osimage. :: +For diskful provisioning, create a new ``cudafull`` osimage definition using the default ``rhels8.1-ppc64le-install-compute`` osimage as a starting point. 
:: # lsdef -t osimage rhels8.1.0-ppc64le-install-cudafull Object name: rhels8.1.0-ppc64le-install-cudafull @@ -23,10 +26,12 @@ The following ``cudafull`` osimage definitions will be created from the base `` provmethod=install template=/opt/xcat/share/xcat/install/rh/compute.rhels8.tmpl + Postscripts ^^^^^^^^^^^ -xCAT provides ``cuda_power9_setup`` postscripts to setup additional configuration to install NVIDIA POWER9 CUDA driver. For ``CUDA11``, it has issue to installing nvidia-drivers modules with kickstart. To workaround this problem, xCAT provides another postscripts ``cuda11_power9_setup``, the CUDA packages will be installed from this postscripts instead from package list and this is only apply to the diskfull installation. +For ``CUDA11``, there is a known issue that prevents successful installation of the nvidia-drivers module as part of the Red Hat kickstart install process used by diskful provisioning. As an example method to work around this problem, refer to the postscript named ``cuda11_power9_setup``. This postscript will install the NVIDIA CUDA packages directly instead of relying on the osimage package list mechanism. ``cuda11_power9_setup`` is only needed for diskful provisioning. + CUDA dependences ^^^^^^^^^^^^^^^^ @@ -39,11 +44,12 @@ CUDA dependences -rw-r--r-- 1 root root 8668 Jun 16 10:29 opencl-filesystem-1.0-6.el8.noarch.rpm drwxr-xr-x 2 root root 4096 Jun 16 15:10 repodata + CUDA Packages ^^^^^^^^^^^^^ -``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for above osimage and it disbuted in the ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le`` dir. -Besides rhels8 base packlist, the following packages needs to be added also. :: +``cuda-repo-rhel8-11-0-local-11.0.1_450.36.06-1.ppc64le.rpm`` is used for the example ``cudafull`` osimage and the contents are copied into a directory named ``/install/REPO/software/nvidia/cuda-core/11.0.1_450.36.06-1/repo/ppc64le``. 
+In addition to the rhels8 base pkglist, the following packages need to also be added. :: # diff /opt/xcat/share/xcat/install/rh/compute.rhels8.cuda.pkglist /opt/xcat/share/xcat/install/rh/compute.rhels8.pkglist 12,27d11 @@ -63,4 +69,5 @@ Besides rhels8 base packlist, the following packages needs to be added also. :: < dkms < opencl-filesystem -NOTE: The two scripts in this directory verified with HPC service stack software. + +NOTE: The samples in this directory were verified as part of the IBM HPC POWER9 Clusters service pack testing diff --git a/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup b/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup deleted file mode 100755 index ac7626290..000000000 --- a/xCAT-server/share/xcat/samples/cuda11/cuda_power9_setup +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# -# Copyright (C) 2018 International Business Machines -# Eclipse Public License, Version 1.0 (EPL-1.0) -# -# -# 2018-03-21 GONG Jie -# 2018-04-24 Matt Ezell -# -# This script is used for doing extra setup steps for NVIDIA POWER9 CUDA driver -# on RHEL 7. Please refer document below for details. -# -# http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#power9-setup -# - -umask 0022 - -[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}" -$CHROOTCMD /bin/bash -c "systemctl enable nvidia-persistenced" - - -[ ! -z "${IMG_ROOTIMGDIR}" ] && CHROOTCMD="chroot ${IMG_ROOTIMGDIR}" -$CHROOTCMD /bin/bash -c "systemctl enable nvidia_gdrcopy_kernel.service" - -# Disable a udev rule installed by default in some Linux distributions that cause hot-pluggable -# memory to be automatically onlined when it is physically probed. -# -# The overrides for /lib/udev rules should be done in /etc/udev -# -UDEV_REDHAT_SOURCE=${IMG_ROOTIMGDIR}/lib/udev/rules.d/40-redhat.rules -UDEV_REDHAT_TARGET=${IMG_ROOTIMGDIR}/etc/udev/rules.d/40-redhat.rules - -# If the file does not exist in /etc/udev, copy it from /lib/udev -if [ ! 
-e ${UDEV_REDHAT_TARGET} ]; then - cp -n ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET} -fi - -# Disable udev memory auto-onlining Rule for cuda10.x -# -# For RHELS 7.5 ALT -# -sed -i "s/^\(SUBSYSTEM==\"memory\".*\)/#\1/" ${UDEV_REDHAT_TARGET} -# -# For RHELS 7.6 ALT -# -if [[ `grep 'Memory hotadd request' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null && grep 'LABEL="memory_hotplug_end' ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` == 0 ]]; then - echo "Detected RHELS 7.6 ALT, modifying ${UDEV_REDHAT_TARGET}..." - # Comment out the memory hotadd request (for reference) - if [[ `grep "## Memory hotadd request" ${UDEV_REDHAT_TARGET} 2>&1 >> /dev/null; echo $?` != 0 ]]; then - # but only run one time, not if it's already commented out. (to handle multiple genimage calls) - #sed -i '/Memory hotadd request/,+8 s/^/#/' ${UDEV_REDHAT_TARGET} - # RH76 CUDA doc recommends the following: - sed -i s/^\SUBSYSTEM!=\"memory\"/SUBSYSTEM==\"\*\"/ ${UDEV_REDHAT_TARGET} - sed -i s/^\ACTION!=\"add\"/ACTION==\"\*\"/ /tmp/40-redhat.rules ${UDEV_REDHAT_TARGET} - fi -fi - -echo "Comparing ${UDEV_REDHAT_SOURCE} and ${UDEV_REDHAT_TARGET}" -diff ${UDEV_REDHAT_SOURCE} ${UDEV_REDHAT_TARGET} - -# Setting NVIDIA parameters in both /etc/modprobe.d and /usr/lib/modprobe.d - -echo "==> Setting NVIDIA options in /usr/lib/modprobe.d/gpusupport and /etc/modprobe.d" -echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/gpusupport.conf -echo 'options nvidia NVreg_EnableStreamMemOPs=1 NVreg_RegistryDwords="PeerMappingOverride=1"' >${IMG_ROOTIMGDIR}/etc/modprobe.d/gpusupport.conf - -grep nouveau ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf -if (( $? )) -then -echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf -fi - -grep nouveau ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf -if (( $? 
)) -then -echo 'blacklist nouveau' >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf -fi - -# This is for nvprof (per George Chochia) -grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf -if (( $? )) -then - echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/usr/lib/modprobe.d/nvidia.conf -fi - -grep NVreg_RestrictProfilingToAdminUsers ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf -if (( $? )) -then - echo "options nvidia NVreg_RestrictProfilingToAdminUsers=0" >> ${IMG_ROOTIMGDIR}/etc/modprobe.d/nvidia.conf -fi - -if [ -z "${IMG_ROOTIMGDIR}" ] -then - kernel_version="$(for d in $(ls /lib/modules | sort -V) ; do : ; done && echo $d)" - mkinitrd -v -f "/boot/initramfs-${kernel_version}.img" "${kernel_version}" -fi