Untitled
user_3839718
sh
a year ago
1.2 kB
8
Indexable
#!/bin/bash
#
# Slurm batch job: runs /app/main.py through `accelerate launch` inside the
# food_recipe_container.sif Singularity image, using three GPUs on one node.
#
# Relative paths (./output, food_recipe_container.sif) are resolved against the
# directory the job runs in.
#
# NOTE: every #SBATCH directive must stay above the first executable command.
# Explanatory comments are kept on their own lines so that nothing trails an
# option value.

# Job name; also substituted for %x in the log file names below.
#SBATCH --job-name=food_recipe_generation
# Request one node.
#SBATCH --nodes=1
# Number of tasks (i.e., processes) per node.
#SBATCH --ntasks-per-node=1
# Number of CPU cores per task.
#SBATCH --cpus-per-task=6
# Request 32GB of memory.
#SBATCH --mem=32G
# Partition/queue to submit to.
#SBATCH --partition=high
# Three GPUs of type "tesla"; keep in sync with NUM_GPUS below.
#SBATCH --gres=gpu:tesla:3
# File to which STDOUT will be written (%x = job name, %j = job id).
#SBATCH -o %x-%j.out
# File to which STDERR will be written.
#SBATCH -e %x-%j.err

# `set -u` is deliberately not enabled: `module` is provided by the cluster
# environment and is not guaranteed to be safe under nounset.

# Number of training processes to launch; must match the GPU count requested
# in the --gres directive above.
readonly NUM_GPUS=3

# Host directory bound into the container at /app/t5-base-recipe-model.
readonly OUTPUT_DIR=./output

# Make the CUDA toolchain available on the host before starting the container.
if ! module load CUDA; then
  echo "Failed to load CUDA module" >&2
  exit 1
fi

# The bind source must exist before the container starts, so create it up
# front instead of letting `singularity exec` fail on a missing path.
if ! mkdir -p -- "${OUTPUT_DIR}"; then
  echo "Failed to create output directory: ${OUTPUT_DIR}" >&2
  exit 1
fi

# Run the training script using accelerate within the Singularity container.
# --nv enables NVIDIA GPU support inside the container; --bind maps the host
# output directory onto /app/t5-base-recipe-model in the container.
if ! singularity exec --nv \
  --bind "${OUTPUT_DIR}:/app/t5-base-recipe-model" \
  food_recipe_container.sif \
  accelerate launch \
  --num_processes "${NUM_GPUS}" \
  --num_machines 1 \
  --mixed_precision no \
  --dynamo_backend no \
  /app/main.py --upper_limit 100000 --batch_per_gpu 8; then
  echo "Singularity execution failed" >&2
  exit 1
fi

echo "Job completed"
Editor is loading...
Leave a Comment