Spaces:

NTUST-DDRC
/

gen3c

Build error

App Files Files Community

gen3c / Dockerfile

elungky

Fix cudnn.h not found error by dynamically locating and symlinking it from pip installed package

8da02bd about 1 month ago

raw

history blame contribute delete

5.23 kB

	# Base image: NVIDIA CUDA 12.4.1 development image with cuDNN, on Ubuntu 22.04.
	FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

	# Suppress interactive apt-get prompts during the build. Declared as a build
	# ARG instead of ENV so the setting applies to RUN steps but is not baked
	# into the environment of the running container.
	ARG DEBIAN_FRONTEND=noninteractive

	# Installation prefix used for Miniconda by the later build steps.
	ENV CONDA_DIR=/opt/conda

	WORKDIR /app

	# System packages needed by the build: compilers and build tools, download
	# utilities, git (with LFS), media/graphics libraries and development headers.
	# The apt package lists are removed in the same layer to keep the image small,
	# and Git LFS hooks are initialised once the packages are in place.
	RUN apt-get -y update \
	    && apt-get install -y -qq \
	        build-essential \
	        cmake \
	        curl \
	        ffmpeg \
	        git \
	        git-lfs \
	        libbz2-dev \
	        libffi-dev \
	        libgl1-mesa-glx \
	        libglib2.0-0 \
	        liblzma-dev \
	        libmagickwand-dev \
	        libncursesw5-dev \
	        libreadline-dev \
	        libsm6 \
	        libsqlite3-dev \
	        libssl-dev \
	        libxext6 \
	        libxml2-dev \
	        libxmlsec1-dev \
	        llvm \
	        make \
	        rsync \
	        tk-dev \
	        wget \
	        xz-utils \
	        zlib1g-dev \
	    && rm -rf /var/lib/apt/lists/* \
	    && git lfs install

	# Fetch the Miniconda installer, run it unattended into $CONDA_DIR and delete
	# the installer within the same layer. Conda is then cleaned and configured:
	# the base environment is not auto-activated and conda-forge is added as a channel.
	RUN wget --quiet -O miniconda.sh \
	        https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
	    && /bin/bash miniconda.sh -b -p "$CONDA_DIR" \
	    && rm miniconda.sh \
	    && export PATH="$CONDA_DIR/bin:$PATH" \
	    && conda clean --all --yes \
	    && conda config --set auto_activate_base false \
	    && conda config --add channels conda-forge

	# Put Conda's bin directory on PATH for all later build steps and at runtime.
	ENV PATH=$CONDA_DIR/bin:$PATH

	# Accept the Conda Terms of Service for the two default Anaconda channels
	# (pkgs/main and pkgs/r). The step aborts on the first channel that fails.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" && \
	    for repo in main r; do \
	        conda tos accept --override-channels \
	            --channel "https://repo.anaconda.com/pkgs/$repo" || exit 1; \
	    done

	# Bring the entire build context into the working directory (/app).
	# NOTE(review): because this copy precedes environment creation, any change
	# in the build context invalidates the cache of the environment layer below.
	COPY . /app

	# Build the 'cosmos-predict1' Conda environment from the YAML spec copied above.
	RUN conda env create --file cosmos-predict1.yaml

	# Make 'cosmos-predict1' the default environment and put its bin directory
	# first on PATH for the remaining build steps and at runtime.
	ENV CONDA_DEFAULT_ENV=cosmos-predict1 \
	    PATH=$CONDA_DIR/envs/cosmos-predict1/bin:$PATH

	# Install pinned PyTorch, TorchVision and TorchAudio wheels into the
	# 'cosmos-predict1' environment from the PyTorch cu121 wheel index.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" \
	    && conda activate cosmos-predict1 \
	    && pip install --no-cache-dir \
	        --index-url https://download.pytorch.org/whl/cu121 \
	        torch==2.3.1 \
	        torchvision==0.18.1 \
	        torchaudio==2.3.1

	# Locate the cudnn.h that pip installed under site-packages/nvidia and expose
	# it in the Conda environment's include directory, so that the Transformer
	# Engine build can find it.
	#   - The first match reported by find is used.
	#   - The build stops with an error if no header is found.
	#   - A symlink is preferred; a plain copy is the fallback if linking fails.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" && \
	    conda activate cosmos-predict1 && \
	    CUDNN_HEADER_PATH=$(find "$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/" -name "cudnn.h" | head -n 1) && \
	    if [ ! -f "$CUDNN_HEADER_PATH" ]; then \
	        echo "Error: cudnn.h not found in any expected location within Conda environment. This will likely cause compilation failures." >&2; \
	        exit 1; \
	    fi && \
	    echo "Found cudnn.h at: $CUDNN_HEADER_PATH" && \
	    mkdir -p "$CONDA_PREFIX/include" && \
	    { ln -sf "$CUDNN_HEADER_PATH" "$CONDA_PREFIX/include/cudnn.h" || \
	      cp "$CUDNN_HEADER_PATH" "$CONDA_PREFIX/include/cudnn.h"; }

	# Symlink workaround for the Transformer Engine build (from INSTALL.md):
	# link every header shipped by the pip-installed NVIDIA packages
	# (site-packages/nvidia/<package>/include/<header>) into the Conda
	# environment's include directories.
	# Linking is best-effort: a failed ln does not abort the build.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" && \
	    conda activate cosmos-predict1 && \
	    mkdir -p "$CONDA_PREFIX/include" "$CONDA_PREFIX/include/python3.10" && \
	    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || true; } && \
	    { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || true; }

	# Build and install Transformer Engine 1.12.0 (PyTorch extra) inside the
	# 'cosmos-predict1' environment. CUDA_HOME is pointed at the environment
	# prefix for this step, and build isolation is disabled for the install.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" \
	    && conda activate cosmos-predict1 \
	    && export CUDA_HOME="$CONDA_PREFIX" \
	    && pip install --no-cache-dir --no-build-isolation \
	        "transformer-engine[pytorch]==1.12.0"

	# Clone NVIDIA Apex into /app/apex and install it into 'cosmos-predict1'
	# with the --cpp_ext and --cuda_ext build options, with CUDA_HOME set to
	# the environment prefix.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" \
	    && conda activate cosmos-predict1 \
	    && git clone https://github.com/NVIDIA/apex /app/apex \
	    && export CUDA_HOME="$CONDA_PREFIX" \
	    && pip install -v \
	        --disable-pip-version-check \
	        --no-cache-dir \
	        --no-build-isolation \
	        --config-settings "--build-option=--cpp_ext" \
	        --config-settings "--build-option=--cuda_ext" \
	        /app/apex

	# Install MoGe from its GitHub repository into the 'cosmos-predict1' environment.
	RUN . "$CONDA_DIR/etc/profile.d/conda.sh" \
	    && conda activate cosmos-predict1 \
	    && pip install --no-cache-dir \
	        git+https://github.com/microsoft/MoGe.git

	# Mark the container start script as executable.
	RUN ["chmod", "+x", "/app/start.sh"]

	# Verification steps.
	# Print the Python version on PATH and the list of Conda environments.
	RUN echo "Verifying Python and Conda installations..." && \
	    python --version && \
	    conda env list

	# Import torch inside 'cosmos-predict1' and report its version and CUDA
	# availability. The Python code is a single line so that it does not depend
	# on indentation or on a heredoc terminator. The failure handler sits in
	# the same RUN as the check because every RUN starts a fresh shell, so a
	# "$?" test in a later RUN could never observe this step's exit status.
	RUN echo "Verifying PyTorch and CUDA availability..." && \
	    conda run -n cosmos-predict1 python -c "import torch; print('PyTorch Version: ' + torch.__version__); cuda_ok = torch.cuda.is_available(); print('CUDA Available: ' + str(cuda_ok)); print('CUDA Device Name: ' + (torch.cuda.get_device_name(0) if cuda_ok else 'N/A'))" \
	    || { echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml." >&2; exit 1; }

	# Default command executed when the container starts: run /app/start.sh
	# directly (exec form, so no intermediate shell wraps the script).
	CMD ["/app/start.sh"]