# gen3c / Dockerfile
# elungky's picture
# Fix cudnn.h not found error by dynamically locating and symlinking it from pip installed package
# 8da02bd
# Base image: NVIDIA CUDA 12.4.1 development image with cuDNN, on Ubuntu 22.04.
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# Suppress interactive prompts from apt-get during the build. Declared as a
# build ARG (not ENV) so the setting is visible to every RUN step in this
# stage but is not baked into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# Installation prefix for Miniconda; referenced by later build steps.
ENV CONDA_DIR=/opt/conda

# All relative paths in subsequent instructions resolve against /app.
WORKDIR /app
# System packages: compiler toolchain, development headers, media/GL runtime
# libraries, and git with git-lfs. The apt package lists are removed in the
# same layer, and `git lfs install` then registers the Git LFS filters.
RUN apt-get update -y \
    && apt-get install -qq -y \
        build-essential \
        cmake \
        curl \
        ffmpeg \
        git \
        git-lfs \
        libbz2-dev \
        libffi-dev \
        libgl1-mesa-glx \
        libglib2.0-0 \
        liblzma-dev \
        libmagickwand-dev \
        libncursesw5-dev \
        libreadline-dev \
        libsm6 \
        libsqlite3-dev \
        libssl-dev \
        libxext6 \
        libxml2-dev \
        libxmlsec1-dev \
        llvm \
        make \
        rsync \
        tk-dev \
        wget \
        xz-utils \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/* \
    && git lfs install
# Download the Miniconda installer, run it in batch mode into $CONDA_DIR, and
# delete the installer in the same layer. Conda is invoked by absolute path
# here because PATH is only extended by the ENV instruction that follows.
RUN wget --quiet -O miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && /bin/bash miniconda.sh -b -p "$CONDA_DIR" \
    && rm miniconda.sh \
    && "$CONDA_DIR/bin/conda" clean --all --yes \
    && "$CONDA_DIR/bin/conda" config --set auto_activate_base false \
    && "$CONDA_DIR/bin/conda" config --add channels conda-forge

# Put Conda's base bin directory on PATH for all later layers and at runtime.
ENV PATH=$CONDA_DIR/bin:$PATH
# Accept the Conda Terms of Service for the two default Anaconda channels.
# The loop aborts the build on the first channel that cannot be accepted.
RUN . $CONDA_DIR/etc/profile.d/conda.sh \
    && for channel in \
        https://repo.anaconda.com/pkgs/main \
        https://repo.anaconda.com/pkgs/r; \
    do \
        conda tos accept --override-channels --channel "$channel" || exit 1; \
    done
# Copy the entire build context into the working directory (/app).
# NOTE(review): because the whole context is copied before the environment is
# created, any source change invalidates the cached environment layer. If
# cosmos-predict1.yaml does not reference other local files, consider copying
# only that file first — confirm before changing.
COPY . /app

# Create the Conda environment defined by cosmos-predict1.yaml, then purge
# Conda's package cache in the same layer so downloaded archives and index
# caches do not persist in the image.
RUN conda env create -f cosmos-predict1.yaml && \
    conda clean --all --yes
# Mark cosmos-predict1 as the default Conda environment and put its bin
# directory first on PATH, for the remaining build steps and at runtime.
ENV CONDA_DEFAULT_ENV=cosmos-predict1 \
    PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH
# Install pinned torch / torchvision / torchaudio builds into the
# cosmos-predict1 environment, resolved from the PyTorch cu121 wheel index.
RUN . $CONDA_DIR/etc/profile.d/conda.sh \
    && conda activate cosmos-predict1 \
    && pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.3.1 \
        torchvision==0.18.1 \
        torchaudio==2.3.1
# Locate cudnn.h under the environment's pip-installed `nvidia` packages and
# expose it as $CONDA_PREFIX/include/cudnn.h (symlink, falling back to a copy),
# so the Transformer Engine build can find it. The build is aborted if no
# header is found.
RUN . $CONDA_DIR/etc/profile.d/conda.sh \
    && conda activate cosmos-predict1 \
    && cudnn_header=$(find "$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/" -name "cudnn.h" | head -n 1) \
    && { \
        if [ ! -f "$cudnn_header" ]; then \
            echo "Error: cudnn.h not found in any expected location within Conda environment. This will likely cause compilation failures."; \
            exit 1; \
        fi; \
        echo "Found cudnn.h at: $cudnn_header"; \
        mkdir -p "$CONDA_PREFIX/include" && \
        ln -sf "$cudnn_header" "$CONDA_PREFIX/include/cudnn.h" || \
        cp "$cudnn_header" "$CONDA_PREFIX/include/cudnn.h"; \
    }
# Symlink fix for Transformer Engine compilation (from INSTALL.md): expose the
# headers shipped inside the pip-installed `nvidia` packages under the
# environment's include directories.
# Each `ln` is best-effort, but the `|| true` is scoped with braces to that
# `ln` only, so a failure to source conda.sh or to activate the environment
# still fails the build instead of being silently swallowed (which would also
# leave $CONDA_PREFIX empty and create links under the wrong paths).
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || true; } && \
    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || true; }
# Build and install Transformer Engine 1.12.0 (PyTorch extra) inside the
# cosmos-predict1 environment. CUDA_HOME is pointed at the environment prefix
# for the build, and build isolation is disabled so the build uses the
# packages already installed in the environment.
RUN . $CONDA_DIR/etc/profile.d/conda.sh \
    && conda activate cosmos-predict1 \
    && export CUDA_HOME="$CONDA_PREFIX" \
    && pip install --no-build-isolation --no-cache-dir "transformer-engine[pytorch]==1.12.0"
# Install NVIDIA Apex with its C++ and CUDA extensions for inference.
# The repository is cloned shallowly (--depth 1): only the current tip is
# needed for the build, and skipping the history keeps this layer smaller.
# CUDA_HOME points the extension build at the Conda environment prefix.
# NOTE(review): the clone tracks the repository's default branch, so the built
# revision can change between builds — consider pinning a tag or commit.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    git clone --depth 1 https://github.com/NVIDIA/apex /app/apex && \
    CUDA_HOME=$CONDA_PREFIX pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" /app/apex
# Install MoGe for inference, directly from its GitHub repository, into the
# cosmos-predict1 environment.
RUN . $CONDA_DIR/etc/profile.d/conda.sh \
    && conda activate cosmos-predict1 \
    && python -m pip install --no-cache-dir git+https://github.com/microsoft/MoGe.git
# Give the container's start script its executable bit.
RUN ["chmod", "+x", "/app/start.sh"]
# Verification steps, executed as one shell script in a single layer.
# The failure message is attached directly to the PyTorch check: a separate
# `RUN [ $? -eq 0 ] || ...` can never fire, because every RUN starts a fresh
# shell in which `$?` is 0, and a failing RUN aborts the build before the next
# instruction is reached.
# NOTE(review): GPUs are usually not visible during `docker build`, so
# "CUDA Available" may print False here even for a correct image — confirm
# against the build environment.
RUN <<'VERIFY'
set -e
echo "Verifying Python and Conda installations..."
python --version
conda env list
echo "Verifying PyTorch and CUDA availability..."
if ! conda run -n cosmos-predict1 python <<'PY'
import torch
print('PyTorch Version: ' + torch.__version__)
print('CUDA Available: ' + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
else:
    print('CUDA Device Name: N/A')
PY
then
    echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml." >&2
    exit 1
fi
VERIFY
# Default command when the container starts: run /app/start.sh directly
# (exec form, no intermediate shell). The script is made executable earlier
# in the build.
CMD ["/app/start.sh"]