Untitled
user_3839718
sh
2 years ago
1.2 kB
9
Indexable
#!/bin/bash
#
# SLURM batch job "food_recipe_generation".
#
# Loads the CUDA environment module, then runs /app/main.py through
# `accelerate launch` inside the food_recipe_container.sif Singularity image.
# The host directory ./output is bind-mounted into the container at
# /app/t5-base-recipe-model.
#
# Relative paths (./output, food_recipe_container.sif) are resolved against
# the job's working directory.
#
# Exit status: 0 on success, 1 if any step fails (diagnostics go to stderr,
# i.e. the file named by the -e directive below).
#
# NOTE: the #SBATCH directives must stay above the first executable command.
#SBATCH --job-name=food_recipe_generation
#SBATCH --nodes=1 # Request one node
#SBATCH --ntasks-per-node=1 # Number of tasks (i.e., processes) per node
#SBATCH --cpus-per-task=6 # Number of CPU cores per task
#SBATCH --mem=32G # Request 32GB of memory
#SBATCH --partition=high # Specify the partition/queue
#SBATCH --gres=gpu:tesla:3
#SBATCH -o %x-%j.out # File to which STDOUT will be written
#SBATCH -e %x-%j.err # File to which STDERR will be written

# Number of processes accelerate launches. Keep in sync with the GPU count
# in the --gres directive above.
readonly NUM_GPUS=3
# Singularity image that provides accelerate and /app/main.py.
readonly CONTAINER_IMAGE="food_recipe_container.sif"
# Host directory bind-mounted into the container, and its in-container path.
readonly OUTPUT_DIR="./output"
readonly CONTAINER_OUTPUT_DIR="/app/t5-base-recipe-model"

# Print a message to stderr and abort the job with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Test the command directly instead of inspecting $? afterwards, so a later
# edit cannot slip a statement in between and clobber the status.
if ! module load CUDA; then
  die "Failed to load CUDA module"
fi

# Make sure the bind source exists before handing it to Singularity;
# mkdir -p is a no-op when the directory is already there.
mkdir -p -- "$OUTPUT_DIR" || die "Failed to create output directory: $OUTPUT_DIR"

# Run the training script using accelerate within the Singularity container.
# --nv is passed so the container gets NVIDIA GPU support.
if ! singularity exec --nv \
  --bind "${OUTPUT_DIR}:${CONTAINER_OUTPUT_DIR}" \
  "$CONTAINER_IMAGE" \
  accelerate launch \
  --num_processes "$NUM_GPUS" \
  --num_machines 1 \
  --mixed_precision no \
  --dynamo_backend no \
  /app/main.py --upper_limit 100000 --batch_per_gpu 8; then
  die "Singularity execution failed"
fi

echo "Job completed"
Leave a Comment