From 210cd3ede65ce5530451c0d900f80fa80c202314 Mon Sep 17 00:00:00 2001
From: Marty Kandes <mkandes@sdsc.edu>
Date: Sat, 20 Mar 2021 22:15:02 +0000
Subject: [PATCH] Test newer versions of OpenMPI 4.X.X series

May have observed the effects of a bug in older versions of OpenMPI
4.0.X series when attempting to run a single-node HPL calculation on
Expanse with the
Singularity.hpl-2.3-ubuntu-18.04-openmpi-4.0.4-openblas-0.3.14
container. Single-node job fails with this set of PMIX errors [1] at
startup. This issue appears to have been observed previously [2] [3]
[4]. Unfortunately, the suggested temporary solutions to set
PMIX_MCA_gds=^ds21 or PMIX_MCA_gds=hash do not work. However, it seems
like the bug causing the problem should be fixed in the latest releases
of the OpenMPI 4.X.X series. Hence, the new Ubuntu 18.04 + OpenMPI
4.0.5 and Ubuntu 18.04 + OpenMPI 4.1.0 definition files.

[1]

[exp-8-32:06710] PMIX ERROR: NOT-FOUND in file dstore_base.c at line 2866
[exp-8-32:06710] PMIX ERROR: NOT-FOUND in file server/pmix_server.c at line 3408
[exp-8-32:06742] PMIX ERROR: OUT-OF-RESOURCE in file client/pmix_client.c at line 231
[exp-8-32:06742] OPAL ERROR: Error in file pmix3x_client.c at line 112
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
*** and potentially your MPI job)
[exp-8-32:06742] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[43048,1],0]
  Exit code: 1
--------------------------------------------------------------------------
[exp-8-32:06710] PMIX ERROR: ERROR in file gds_ds21_lock_pthread.c at line 99
[exp-8-32:06710] PMIX ERROR: ERROR in file gds_ds21_lock_pthread.c at line 99

[2] https://github.com/open-mpi/ompi/issues/6761
[3] https://github.com/open-mpi/ompi/issues/6981
[4] https://github.com/open-mpi/ompi/issues/7516
---
 .../Singularity.ubuntu-18.04-openmpi-4.0.5    | 142 ++++++++++++++++++
 .../Singularity.ubuntu-18.04-openmpi-4.1.0    | 142 ++++++++++++++++++
 2 files changed, 284 insertions(+)
 create mode 100644 definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
 create mode 100644 definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0

diff --git a/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5 b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
new file mode 100644
index 0000000..df94ed9
--- /dev/null
+++ b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
@@ -0,0 +1,142 @@
+Bootstrap: shub
+From: mkandes/naked-singularity:ubuntu-18.04
+
+%labels
+
+    APPLICATION_NAME ubuntu + openmpi
+    APPLICATION_VERSION 18.04 + 4.0.5
+    APPLICATION_URL https://www.open-mpi.org
+
+    AUTHOR_NAME Marty Kandes
+    AUTHOR_EMAIL mkandes@sdsc.edu
+
+    LAST_UPDATED 20210319
+
+%setup
+
+%environment
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='https://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='0'
+    export OMPI_REVISION='5'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Set paths to OpenMPI binaries and libraries. Append the inherited
+    # LD_LIBRARY_PATH only when it is non-empty: a trailing ':' leaves an
+    # empty entry, which the dynamic loader treats as the current directory
+    export PATH="${OMPI_INSTALL_DIR}/bin:${PATH}"
+    export LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+%post -c /bin/bash
+
+    # Abort the build on the first failed command, unset variable, or
+    # failed pipeline stage instead of producing a broken image
+    set -euo pipefail
+
+    # Set operating system mirror URL
+    export MIRRORURL='http://us.archive.ubuntu.com/ubuntu'
+
+    # Set operating system version
+    export OSVERSION='bionic'
+
+    # Set system locale
+    export LC_ALL='C'
+
+    # Set debian frontend interface
+    export DEBIAN_FRONTEND='noninteractive'
+
+    # Upgrade all software packages to their latest versions. Run as
+    # separate commands so that a failed update also aborts the build
+    apt-get -y update
+    apt-get -y upgrade
+
+    # Install all dependencies and/or prerequisites for Mellanox OFED
+    apt-get -y install bison
+    apt-get -y install chrpath
+    apt-get -y install debhelper
+    apt-get -y install dpatch
+    apt-get -y install flex
+    apt-get -y install graphviz
+    apt-get -y install libnl-3-dev
+    apt-get -y install libnl-route-3-200
+    apt-get -y install tcl-dev
+    apt-get -y install tk-dev
+    apt-get -y install swig
+
+    cd /tmp
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='https://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Download and install Mellanox OFED drivers and supporting
+    # libraries for userspace access to Ethernet, RDMA, and Infiniband.
+    # https://docs.mellanox.com/pages/releaseview.action?pageId=15049785
+    wget "${MLNX_ROOT_URL}/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+    tar -xf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+    cd "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    ./mlnxofedinstall --user-space-only --without-fw-update --force
+
+    cd /tmp
+
+    # Remove Mellanox OFED archive directory and tarball
+    rm -rf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    rm "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+
+    # Install additional tools
+    apt-get -y install numactl
+    apt-get -y install libnuma-dev
+
+    # Install OpenMPI dependencies
+    apt-get -y install zlib1g-dev
+
+    cd /tmp
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='0'
+    export OMPI_REVISION='5'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Download, build, and install OpenMPI
+    wget "${OMPI_ROOT_URL}/v${OMPI_MAJOR}.${OMPI_MINOR}/openmpi-${OMPI_VERSION}.tar.gz"
+    tar -xf "openmpi-${OMPI_VERSION}.tar.gz"
+    cd "openmpi-${OMPI_VERSION}"
+    ./configure --prefix="${OMPI_INSTALL_DIR}" --without-verbs
+    make all install
+
+    cd /tmp
+
+    # Remove OpenMPI build directory and source tarball
+    rm -rf "openmpi-${OMPI_VERSION}"
+    rm "openmpi-${OMPI_VERSION}.tar.gz"
+
+    # Cleanup
+    apt-get -y autoremove --purge
+    apt-get -y clean
+
+    # Update database for mlocate
+    updatedb
+
+%files
+
+%runscript
+
+%test
diff --git a/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0 b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0
new file mode 100644
index 0000000..cb0c679
--- /dev/null
+++ b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0
@@ -0,0 +1,142 @@
+Bootstrap: shub
+From: mkandes/naked-singularity:ubuntu-18.04
+
+%labels
+
+    APPLICATION_NAME ubuntu + openmpi
+    APPLICATION_VERSION 18.04 + 4.1.0
+    APPLICATION_URL https://www.open-mpi.org
+
+    AUTHOR_NAME Marty Kandes
+    AUTHOR_EMAIL mkandes@sdsc.edu
+
+    LAST_UPDATED 20210319
+
+%setup
+
+%environment
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='https://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='1'
+    export OMPI_REVISION='0'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Set paths to OpenMPI binaries and libraries. Append the inherited
+    # LD_LIBRARY_PATH only when it is non-empty: a trailing ':' leaves an
+    # empty entry, which the dynamic loader treats as the current directory
+    export PATH="${OMPI_INSTALL_DIR}/bin:${PATH}"
+    export LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+%post -c /bin/bash
+
+    # Abort the build on the first failed command, unset variable, or
+    # failed pipeline stage instead of producing a broken image
+    set -euo pipefail
+
+    # Set operating system mirror URL
+    export MIRRORURL='http://us.archive.ubuntu.com/ubuntu'
+
+    # Set operating system version
+    export OSVERSION='bionic'
+
+    # Set system locale
+    export LC_ALL='C'
+
+    # Set debian frontend interface
+    export DEBIAN_FRONTEND='noninteractive'
+
+    # Upgrade all software packages to their latest versions. Run as
+    # separate commands so that a failed update also aborts the build
+    apt-get -y update
+    apt-get -y upgrade
+
+    # Install all dependencies and/or prerequisites for Mellanox OFED
+    apt-get -y install bison
+    apt-get -y install chrpath
+    apt-get -y install debhelper
+    apt-get -y install dpatch
+    apt-get -y install flex
+    apt-get -y install graphviz
+    apt-get -y install libnl-3-dev
+    apt-get -y install libnl-route-3-200
+    apt-get -y install tcl-dev
+    apt-get -y install tk-dev
+    apt-get -y install swig
+
+    cd /tmp
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='https://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Download and install Mellanox OFED drivers and supporting
+    # libraries for userspace access to Ethernet, RDMA, and Infiniband.
+    # https://docs.mellanox.com/pages/releaseview.action?pageId=15049785
+    wget "${MLNX_ROOT_URL}/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+    tar -xf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+    cd "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    ./mlnxofedinstall --user-space-only --without-fw-update --force
+
+    cd /tmp
+
+    # Remove Mellanox OFED archive directory and tarball
+    rm -rf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    rm "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+
+    # Install additional tools
+    apt-get -y install numactl
+    apt-get -y install libnuma-dev
+
+    # Install OpenMPI dependencies
+    apt-get -y install zlib1g-dev
+
+    cd /tmp
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='1'
+    export OMPI_REVISION='0'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Download, build, and install OpenMPI
+    wget "${OMPI_ROOT_URL}/v${OMPI_MAJOR}.${OMPI_MINOR}/openmpi-${OMPI_VERSION}.tar.gz"
+    tar -xf "openmpi-${OMPI_VERSION}.tar.gz"
+    cd "openmpi-${OMPI_VERSION}"
+    ./configure --prefix="${OMPI_INSTALL_DIR}" --without-verbs
+    make all install
+
+    cd /tmp
+
+    # Remove OpenMPI build directory and source tarball
+    rm -rf "openmpi-${OMPI_VERSION}"
+    rm "openmpi-${OMPI_VERSION}.tar.gz"
+
+    # Cleanup
+    apt-get -y autoremove --purge
+    apt-get -y clean
+
+    # Update database for mlocate
+    updatedb
+
+%files
+
+%runscript
+
+%test
-- 
GitLab