#!/usr/bin/env bash
#
# Container entrypoint: exports the runtime environment, prints CUDA/GPU
# diagnostics, then replaces itself with the application process.

# Home directory, unbuffered Python output and the Hugging Face cache location.
export HOME=/home/user
export PYTHONUNBUFFERED=1
export HF_HOME=/home/user/.cache/huggingface

# One thread per CPU reported by nproc for each numeric backend. nproc is
# queried once, and the value is assigned before being exported so that a
# failing nproc is not masked by `export`; 1 is the last-resort value so the
# variables are never exported empty.
# Note: GNU nproc itself honours an inherited OMP_NUM_THREADS/OMP_THREAD_LIMIT,
# so a value injected by the container runtime carries through here.
cpu_count=$(nproc 2>/dev/null) || cpu_count=1
export OMP_NUM_THREADS="$cpu_count"
export MKL_NUM_THREADS="$cpu_count"
export OPENBLAS_NUM_THREADS="$cpu_count"
export NUMEXPR_NUM_THREADS="$cpu_count"

# TF32 switches exported for the application.
# NOTE(review): PyTorch documents TORCH_ALLOW_TF32_CUBLAS_OVERRIDE; confirm
# that something actually reads these two names.
export TORCH_ALLOW_TF32_CUBLAS=1
export TORCH_ALLOW_TF32_CUDNN=1

# Audio: select SDL's "dummy" driver and point PulseAudio's runtime path at /tmp.
export SDL_AUDIODRIVER=dummy
export PULSE_RUNTIME_PATH=/tmp/pulse-runtime

# Prints the diagnostics banner, then what nvidia-smi reports about the GPUs
# and the compute processes using them. Purely informational: a missing or
# failing nvidia-smi is reported on stdout and never aborts the script.
report_gpu_status() {
  echo "🔍 CUDA Environment Debug Information:"
  echo "═══════════════════════════════════════════════════════════════════════"

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo "❌ nvidia-smi not available in container"
    return 0
  fi

  echo "✅ nvidia-smi available"

  echo "📊 GPU Information:"
  # nvidia-smi's own stderr is discarded; a non-zero exit is replaced by a
  # one-line notice instead.
  nvidia-smi --query-gpu=name,driver_version,memory.total,memory.free \
    --format=csv,noheader,nounits 2>/dev/null \
    || echo "❌ nvidia-smi failed to query GPU"

  echo "🏃 Running Processes:"
  nvidia-smi --query-compute-apps=pid,name,used_memory \
    --format=csv,noheader,nounits 2>/dev/null \
    || echo "ℹ️ No running CUDA processes"
}

report_gpu_status

# Reports whether a libcudart shared library exists under /usr/local/cuda*/lib*
# and lists the matches when it does.
report_cuda_runtime() {
  echo ""
  echo "🔧 CUDA Runtime Check:"

  local -a cudart_libs=()
  local candidate
  for candidate in /usr/local/cuda*/lib*/libcudart.so*; do
    # An unmatched glob stays as the literal pattern; -e filters that out.
    [[ -e "$candidate" ]] && cudart_libs+=("$candidate")
  done

  if (( ${#cudart_libs[@]} > 0 )); then
    echo "✅ CUDA runtime libraries found:"
    ls -- "${cudart_libs[@]}" 2>/dev/null
  else
    echo "❌ CUDA runtime libraries not found"
  fi
}

report_cuda_runtime

# Reports whether any /dev/nvidia* entry exists and, if so, prints a long
# listing of the matches (owner, group and mode included).
report_device_files() {
  echo ""
  echo "🖥️ CUDA Device Files:"

  local -a device_nodes=()
  local node
  for node in /dev/nvidia*; do
    # An unmatched glob stays as the literal pattern; -e filters that out.
    [[ -e "$node" ]] && device_nodes+=("$node")
  done

  if (( ${#device_nodes[@]} > 0 )); then
    echo "✅ NVIDIA device files found:"
    ls -la -- "${device_nodes[@]}" 2>/dev/null
  else
    echo "❌ No NVIDIA device files found - Docker may not have GPU access"
  fi
}

report_device_files

# Prints the CUDA-related environment variables, one per line. A variable that
# is unset or empty is shown as "not set".
report_cuda_env() {
  echo ""
  echo "🌍 CUDA Environment Variables:"

  local var_name
  for var_name in CUDA_HOME CUDA_ROOT CUDA_PATH LD_LIBRARY_PATH \
    TORCH_CUDA_ARCH_LIST CUDA_VISIBLE_DEVICES; do
    # ${!var_name} is indirect expansion: the value of the variable whose
    # name is stored in var_name.
    printf ' %s: %s\n' "$var_name" "${!var_name:-not set}"
  done
}

report_cuda_env

# Asks PyTorch what it can see: import status, version, CUDA availability and,
# when CUDA is available, the CUDA/cuDNN versions and one line per device.
# Every failure is caught inside the Python snippet and printed, so this
# section only reports and never stops the script.
report_pytorch_cuda() {
  echo ""
  echo "🐍 PyTorch CUDA Check:"

  # The quoted heredoc delimiter passes the Python source through verbatim,
  # with no shell expansion. stderr is merged into stdout so interpreter
  # errors land in the same log stream as the report.
  python3 - <<'PY' 2>&1
try:
    import torch
    print('✅ PyTorch imported successfully')
    print(f' Version: {torch.__version__}')
    print(f' CUDA available: {torch.cuda.is_available()}')
    if torch.cuda.is_available():
        print(f' CUDA version: {torch.version.cuda}')
        print(f' cuDNN version: {torch.backends.cudnn.version()}')
        print(f' Device count: {torch.cuda.device_count()}')
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            print(f' Device {i}: {props.name} (SM {props.major}.{props.minor}, {props.total_memory//1024//1024}MB)')
    else:
        print('❌ CUDA not available to PyTorch')
        print(' This could mean:')
        print(' - CUDA runtime not properly installed')
        print(' - GPU not accessible to container')
        print(' - Driver/runtime version mismatch')
except ImportError as e:
    print(f'❌ Failed to import PyTorch: {e}')
except Exception as e:
    print(f'❌ PyTorch CUDA check failed: {e}')
PY
}

report_pytorch_cuda

# Prints hints for three common GPU-in-container problems. Each check is
# independent and only emits a line when it finds something suspicious.
diagnose_common_issues() {
  echo ""
  echo "🩺 Common Issue Diagnostics:"

  # Neither the first GPU node nor the control node exists.
  if [[ ! -e /dev/nvidia0 && ! -e /dev/nvidiactl ]]; then
    echo "❌ No NVIDIA device nodes - container likely missing --gpus all or --runtime=nvidia"
  fi

  # Unset, empty, or simply lacking the substring "cuda" all count as a miss.
  if [[ "${LD_LIBRARY_PATH:-}" != *cuda* ]]; then
    echo "⚠️ LD_LIBRARY_PATH may not include CUDA libraries"
  fi

  # Warn when the long listing of /dev/nvidia* contains neither mode string.
  # ls fails (and the check is skipped) when nothing matches the glob.
  local device_listing
  if device_listing=$(ls -la /dev/nvidia* 2>/dev/null) \
    && ! grep -q -e 'rw-rw-rw-' -e 'rw-r--r--' <<<"$device_listing"; then
    echo "⚠️ NVIDIA device files may have restrictive permissions"
  fi
}

diagnose_common_issues

echo "═══════════════════════════════════════════════════════════════════════"
echo "🚀 Starting application..."
echo ""

# su -c takes ONE command string, which the target user's shell parses again.
# Splicing "$*" into that string would lose argument boundaries (an argument
# containing spaces becomes several) and would let shell metacharacters in an
# argument run as code. Each argument is therefore shell-quoted with %q before
# being appended, so the inner shell reconstructs exactly the original list.
# Caveat: %q emits bash-style $'...' quoting for control characters, so
# arguments containing those need the user's shell to be bash.
launch_cmd='python3 wgp.py --listen'
if (( $# > 0 )); then
  # Guarded because printf with no arguments would still emit one empty ''.
  printf -v quoted_args ' %q' "$@"
  launch_cmd+=$quoted_args
fi

# exec replaces this script with su; -p keeps the environment exported above.
exec su -p user -c "$launch_cmd"