Untitled

 avatar
user_3839718
sh
2 months ago
1.2 kB
1
Indexable
Never
#!/bin/bash
# Slurm batch script: runs `accelerate launch` on /app/main.py inside the
# food_recipe_container.sif Singularity image, on a single node.
#SBATCH --job-name=food_recipe_generation
#SBATCH --nodes=1               # Request one node
#SBATCH --ntasks-per-node=1     # Number of tasks (i.e., processes) per node
#SBATCH --cpus-per-task=6       # Number of CPU cores per task
#SBATCH --mem=32G               # Request 32GB of memory
#SBATCH --partition=high        # Specify the partition/queue
#SBATCH --gres=gpu:tesla:3      # Request 3 GPUs of type "tesla"; keep in sync with --num_processes below
#SBATCH -o %x-%j.out           # File to which STDOUT will be written (%x = job name, %j = job ID)
#SBATCH -e %x-%j.err           # File to which STDERR will be written (%x = job name, %j = job ID)

# Load the CUDA environment module and abort the job if that fails.
# The command is tested directly rather than via a later `$?` lookup, so the
# check cannot be detached from it by lines inserted in between.
# The diagnostic goes to stderr so it lands in the job's error file.
if ! module load CUDA; then
    echo "Failed to load CUDA module" >&2
    exit 1
fi


# Singularity refuses to start when the source of a --bind mount is missing,
# so make sure the host-side output directory exists before launching.
# The path is relative, i.e. resolved against the job's working directory.
if ! mkdir -p ./output; then
    echo "Failed to create ./output bind directory" >&2
    exit 1
fi

# Run the training script using accelerate within the Singularity container.
#   --nv                 expose the host NVIDIA driver/GPUs to the container
#   --bind SRC:DEST      mount host ./output at /app/t5-base-recipe-model
#   --num_processes 3    one process per GPU; must match the GPU count
#                        requested in the #SBATCH --gres directive
singularity exec --nv --bind ./output:/app/t5-base-recipe-model food_recipe_container.sif \
    accelerate launch \
    --num_processes 3 \
    --num_machines 1 \
    --mixed_precision no \
    --dynamo_backend no \
    /app/main.py --upper_limit 100000 --batch_per_gpu 8
# Capture the status immediately; any later command would overwrite $?.
run_status=$?

# Check if the Singularity execution was successful. Propagate the real exit
# status (instead of a flat 1) so the scheduler's accounting shows the actual
# failure code, and report the failure on stderr.
if [ "$run_status" -ne 0 ]; then
    echo "Singularity execution failed" >&2
    exit "$run_status"
fi

echo "Job completed"
Leave a Comment