From 210cd3ede65ce5530451c0d900f80fa80c202314 Mon Sep 17 00:00:00 2001
From: Marty Kandes <mkandes@sdsc.edu>
Date: Sat, 20 Mar 2021 22:15:02 +0000
Subject: [PATCH] Test newer versions of OpenMPI 4.X.X series

May have observed the effects of a bug in older versions of the OpenMPI
4.0.X series when attempting to run a single-node HPL calculation on
Expanse with the Singularity.hpl-2.3-ubuntu-18.04-openmpi-4.0.4-openblas-0.3.14
container. The single-node job fails with a set of PMIX errors [1] at
startup. This issue appears to have been observed previously [2] [3]
[4]. Unfortunately, the suggested temporary workarounds of setting
PMIX_MCA_gds=^ds21 or PMIX_MCA_gds=hash do not work. However, it seems
like the bug causing the problem should be fixed in the latest releases
of the OpenMPI 4.X.X series. Hence, the new Ubuntu 18.04 + OpenMPI 4.0.5
and Ubuntu 18.04 + OpenMPI 4.1.0 definition files.

[1]

[exp-8-32:06710] PMIX ERROR: NOT-FOUND in file dstore_base.c at line 2866
[exp-8-32:06710] PMIX ERROR: NOT-FOUND in file server/pmix_server.c at line 3408
[exp-8-32:06742] PMIX ERROR: OUT-OF-RESOURCE in file client/pmix_client.c at line 231
[exp-8-32:06742] OPAL ERROR: Error in file pmix3x_client.c at line 112
*** An error occurred in MPI_Init
*** on a NULL communicator
*** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
***    and potentially your MPI job)
[exp-8-32:06742] Local abort before MPI_INIT completed completed successfully, but am not able to aggregate error messages, and not able to guarantee that all other processes were killed!
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[43048,1],0]
  Exit code:    1
--------------------------------------------------------------------------
[exp-8-32:06710] PMIX ERROR: ERROR in file gds_ds21_lock_pthread.c at line 99
[exp-8-32:06710] PMIX ERROR: ERROR in file gds_ds21_lock_pthread.c at line 99

[2]

https://github.com/open-mpi/ompi/issues/6761

[3]

https://github.com/open-mpi/ompi/issues/6981

[4]

https://github.com/open-mpi/ompi/issues/7516
---
 .../Singularity.ubuntu-18.04-openmpi-4.0.5    | 134 ++++++++++++++++++
 .../Singularity.ubuntu-18.04-openmpi-4.1.0    | 134 ++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
 create mode 100644 definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0

diff --git a/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5 b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
new file mode 100644
index 0000000..df94ed9
--- /dev/null
+++ b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.0.5
@@ -0,0 +1,134 @@
+Bootstrap: shub
+From: mkandes/naked-singularity:ubuntu-18.04
+
+%labels
+
+    APPLICATION_NAME ubuntu + openmpi
+    APPLICATION_VERSION 18.04 + 4.0.5
+    APPLICATION_URL https://www.open-mpi.org
+
+    AUTHOR_NAME Marty Kandes
+    AUTHOR_EMAIL mkandes@sdsc.edu
+
+    LAST_UPDATED 20210319
+
+%setup
+
+%environment
+
+    # Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='http://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # OpenMPI version components, download root, and install prefix
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='0'
+    export OMPI_REVISION='5'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Put OpenMPI first on the search paths. ':+' appends the previous
+    # LD_LIBRARY_PATH only when non-empty: an empty entry means the cwd.
+    export PATH="${OMPI_INSTALL_DIR}/bin:${PATH}"
+    export LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+%post -c /bin/bash
+
+    # Build steps: upgrade OS packages, install Mellanox OFED (user space
+    # only), then build OpenMPI from source into ${OMPI_INSTALL_DIR}.
+
+    # Operating system mirror URL and release codename.
+    # NOTE(review): neither is referenced again in this section.
+    export MIRRORURL='http://us.archive.ubuntu.com/ubuntu'
+    export OSVERSION='bionic'
+
+    # Use the C locale and a non-interactive debian frontend for apt
+    export LC_ALL='C'
+    export DEBIAN_FRONTEND='noninteractive'
+
+    # Upgrade all software packages to their latest versions
+    apt-get -y update && apt-get -y upgrade
+
+    # Install all dependencies and/or prerequisites for Mellanox OFED
+    apt-get -y install bison
+    apt-get -y install chrpath
+    apt-get -y install debhelper
+    apt-get -y install dpatch
+    apt-get -y install flex
+    apt-get -y install graphviz
+    apt-get -y install libnl-3-dev
+    apt-get -y install libnl-route-3-200
+    apt-get -y install tcl-dev
+    apt-get -y install tk-dev
+    apt-get -y install swig
+
+    cd /tmp
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='http://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Download and install the Mellanox OFED user-space drivers/libraries
+    # (Ethernet, RDMA, Infiniband). Abort if fetch, unpack, or cd fails.
+    # https://docs.mellanox.com/pages/releaseview.action?pageId=15049785
+    wget "${MLNX_ROOT_URL}/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz" || exit 1
+    tar -xf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz" || exit 1
+    cd "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}" || exit 1
+    ./mlnxofedinstall --user-space-only --without-fw-update --force
+
+    cd /tmp
+
+    # Remove Mellanox OFED archive directory and tarball
+    rm -rf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    rm "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+
+    # Install additional tools
+    apt-get -y install numactl
+    apt-get -y install libnuma-dev
+
+    # Install OpenMPI dependencies
+    apt-get -y install zlib1g-dev
+
+    cd /tmp
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='0'
+    export OMPI_REVISION='5'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Download, build, and install OpenMPI; stop at the first failed step
+    wget "${OMPI_ROOT_URL}/v${OMPI_MAJOR}.${OMPI_MINOR}/openmpi-${OMPI_VERSION}.tar.gz" || exit 1
+    tar -xf "openmpi-${OMPI_VERSION}.tar.gz" || exit 1
+    cd "openmpi-${OMPI_VERSION}" || exit 1
+    ./configure --prefix="${OMPI_INSTALL_DIR}" --without-verbs || exit 1
+    make all install || exit 1
+
+    cd /tmp
+
+    # Remove OpenMPI build directory and source tarball
+    rm -rf "openmpi-${OMPI_VERSION}"
+    rm "openmpi-${OMPI_VERSION}.tar.gz"
+
+    # Cleanup
+    apt-get -y autoremove --purge
+    apt-get -y clean
+
+    # Update database for mlocate
+    updatedb
+
+%files
+
+%runscript
+
+%test
diff --git a/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0 b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0
new file mode 100644
index 0000000..cb0c679
--- /dev/null
+++ b/definition-files/ubuntu/Singularity.ubuntu-18.04-openmpi-4.1.0
@@ -0,0 +1,134 @@
+Bootstrap: shub
+From: mkandes/naked-singularity:ubuntu-18.04
+
+%labels
+
+    APPLICATION_NAME ubuntu + openmpi
+    APPLICATION_VERSION 18.04 + 4.1.0
+    APPLICATION_URL https://www.open-mpi.org
+
+    AUTHOR_NAME Marty Kandes
+    AUTHOR_EMAIL mkandes@sdsc.edu
+
+    LAST_UPDATED 20210319
+
+%setup
+
+%environment
+
+    # Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='http://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # OpenMPI version components, download root, and install prefix
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='1'
+    export OMPI_REVISION='0'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Put OpenMPI first on the search paths. ':+' appends the previous
+    # LD_LIBRARY_PATH only when non-empty: an empty entry means the cwd.
+    export PATH="${OMPI_INSTALL_DIR}/bin:${PATH}"
+    export LD_LIBRARY_PATH="${OMPI_INSTALL_DIR}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+
+%post -c /bin/bash
+
+    # Build steps: upgrade OS packages, install Mellanox OFED (user space
+    # only), then build OpenMPI from source into ${OMPI_INSTALL_DIR}.
+
+    # Operating system mirror URL and release codename.
+    # NOTE(review): neither is referenced again in this section.
+    export MIRRORURL='http://us.archive.ubuntu.com/ubuntu'
+    export OSVERSION='bionic'
+
+    # Use the C locale and a non-interactive debian frontend for apt
+    export LC_ALL='C'
+    export DEBIAN_FRONTEND='noninteractive'
+
+    # Upgrade all software packages to their latest versions
+    apt-get -y update && apt-get -y upgrade
+
+    # Install all dependencies and/or prerequisites for Mellanox OFED
+    apt-get -y install bison
+    apt-get -y install chrpath
+    apt-get -y install debhelper
+    apt-get -y install dpatch
+    apt-get -y install flex
+    apt-get -y install graphviz
+    apt-get -y install libnl-3-dev
+    apt-get -y install libnl-route-3-200
+    apt-get -y install tcl-dev
+    apt-get -y install tk-dev
+    apt-get -y install swig
+
+    cd /tmp
+
+    # Set Mellanox OFED version, operating system, and hardware platform
+    export MLNX_ROOT_URL='http://www.mellanox.com/downloads/ofed'
+    export MLNX_OFED_VERSION='4.7-3.2.9.0'
+    export MLNX_OS_VERSION='ubuntu18.04'
+    export MLNX_PLATFORM='x86_64'
+
+    # Download and install the Mellanox OFED user-space drivers/libraries
+    # (Ethernet, RDMA, Infiniband). Abort if fetch, unpack, or cd fails.
+    # https://docs.mellanox.com/pages/releaseview.action?pageId=15049785
+    wget "${MLNX_ROOT_URL}/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz" || exit 1
+    tar -xf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz" || exit 1
+    cd "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}" || exit 1
+    ./mlnxofedinstall --user-space-only --without-fw-update --force
+
+    cd /tmp
+
+    # Remove Mellanox OFED archive directory and tarball
+    rm -rf "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}"
+    rm "MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-${MLNX_OS_VERSION}-${MLNX_PLATFORM}.tgz"
+
+    # Install additional tools
+    apt-get -y install numactl
+    apt-get -y install libnuma-dev
+
+    # Install OpenMPI dependencies
+    apt-get -y install zlib1g-dev
+
+    cd /tmp
+
+    # Set OpenMPI major, minor, and revision numbers, root and
+    # installation directories
+    export OMPI_ROOT_URL='https://download.open-mpi.org/release/open-mpi'
+    export OMPI_MAJOR='4'
+    export OMPI_MINOR='1'
+    export OMPI_REVISION='0'
+    export OMPI_VERSION="${OMPI_MAJOR}.${OMPI_MINOR}.${OMPI_REVISION}"
+    export OMPI_ROOT_DIR='/opt/openmpi'
+    export OMPI_INSTALL_DIR="${OMPI_ROOT_DIR}/${OMPI_VERSION}"
+
+    # Download, build, and install OpenMPI; stop at the first failed step
+    wget "${OMPI_ROOT_URL}/v${OMPI_MAJOR}.${OMPI_MINOR}/openmpi-${OMPI_VERSION}.tar.gz" || exit 1
+    tar -xf "openmpi-${OMPI_VERSION}.tar.gz" || exit 1
+    cd "openmpi-${OMPI_VERSION}" || exit 1
+    ./configure --prefix="${OMPI_INSTALL_DIR}" --without-verbs || exit 1
+    make all install || exit 1
+
+    cd /tmp
+
+    # Remove OpenMPI build directory and source tarball
+    rm -rf "openmpi-${OMPI_VERSION}"
+    rm "openmpi-${OMPI_VERSION}.tar.gz"
+
+    # Cleanup
+    apt-get -y autoremove --purge
+    apt-get -y clean
+
+    # Update database for mlocate
+    updatedb
+
+%files
+
+%runscript
+
+%test
-- 
GitLab