Setting up Torch
Installing Conda Packages with a Dedicated Pod

The pod below mounts your persistent home volume, requests a single GPU, and builds a fresh Conda environment named torchEnv with Python 3.11 and PyTorch installed against CUDA 12.1. It assumes Miniconda is already installed on your volume at /kube/home/.envs/conda:
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: pytorch-install
spec:
  restartPolicy: Never
  volumes:
    - name: home-volume
      persistentVolumeClaim:
        claimName: dsmillerrunfol-rwm # Ensure this is your correct PVC
  containers:
    - name: pytorch-setup-container
      image: "nvidia/samples:vectoradd-cuda11.2.1"
      resources:
        requests:
          memory: "32Gi"
          nvidia.com/gpu: 1
        limits:
          memory: "32Gi"
          nvidia.com/gpu: 1
      volumeMounts:
        - name: home-volume
          mountPath: /kube/home/
      command:
        - /bin/bash
        - -c
        - |
          # Set the Miniconda path and initialize
          export MINICONDA_PATH="/kube/home/.envs/conda"
          export PATH="$MINICONDA_PATH/bin:$PATH"

          # If the environment already exists, remove it so we get a clean rebuild
          if conda info --envs | grep -q "torchEnv"; then
            echo "Conda environment 'torchEnv' already exists. Deleting and rebuilding."
            conda env remove -n torchEnv --yes
          fi

          # Create a new conda environment with Python 3.11
          conda create -y -n torchEnv python=3.11

          # Activate the environment
          source activate torchEnv

          # Install PyTorch, torchvision, and the CUDA 12.1 runtime packages
          conda install -y -n torchEnv pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia

          # Test that we have a GPU and that it is registering in Torch
          echo "Testing visibility of GPUs on system and in Python."
          python -c "import torch; gpus = torch.cuda.device_count(); print(f'Available GPUs: {gpus}'); [print(f'GPU {gpu}: {torch.cuda.get_device_name(gpu)}') for gpu in range(gpus)]"
          nvidia-smi

          # Check for GPU availability using Python and PyTorch
          GPU_COUNT=$(python -c "import torch; print(torch.cuda.device_count())")

          # Export the Conda environment only if at least one GPU is detected
          if [ "$GPU_COUNT" -gt 0 ]; then
            echo "At least one GPU detected, exporting Conda environment."
            echo "Exporting environment YAML file to: $MINICONDA_PATH"
            conda env export -n torchEnv > "$MINICONDA_PATH/torchEnv.yml"
          else
            echo "No GPUs detected. Something may have gone wrong, or you may not have asked for any in your pod."
          fi
```
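To run the setup, save the manifest to a file and submit it with kubectl. A typical workflow is sketched below; the filename pytorch-install.yaml is an assumption, and any name works:

```bash
# Submit the setup pod to the cluster
kubectl apply -f pytorch-install.yaml

# Stream the logs to watch the conda install and the GPU test output
kubectl logs -f pytorch-install

# Once the script finishes, remove the completed pod (the environment persists on your volume)
kubectl delete pod pytorch-install
```

Because restartPolicy is Never, the pod runs the script once and then stops rather than restarting on completion.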
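Since the environment lives on your persistent volume, later pods can reuse it without reinstalling anything. A minimal sketch, assuming the same PVC is mounted at /kube/home/ and Miniconda sits at the same path as above:

```bash
# Inside a later pod that mounts the same PVC at /kube/home/
export MINICONDA_PATH="/kube/home/.envs/conda"
export PATH="$MINICONDA_PATH/bin:$PATH"

# Activate the environment built by the install pod
source activate torchEnv

# Quick sanity check that PyTorch still sees a GPU (assumes this pod also requested one)
python -c "import torch; print(torch.cuda.is_available())"
```

The exported torchEnv.yml written to $MINICONDA_PATH can also be used to recreate the environment elsewhere via `conda env create -f`.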