script examples/05_submitting_batch_jobs/job-quickstart.sh:
#!/bin/bash
#SBATCH -J h5test
#SBATCH -N 1

module purge
module load gcc/5.3 intel-mpi/5 hdf5/1.8.18-MPI

cp $VSC_HDF5_ROOT/share/hdf5_examples/c/ph5example.c .
mpicc -lhdf5 ph5example.c -o ph5example

mpirun -np 8 ./ph5example -c -v
submission:
$ sbatch job.sh
Submitted batch job 5250981
check what is going on:
squeue -u $USER
JOBID     PARTITION  NAME    USER    ST  TIME  NODES  NODELIST(REASON)
5250981   mem_0128   h5test  markus  R   0:00  2      n323-[018-019]
Output files:
ParaEg0.h5 ParaEg1.h5 slurm-5250981.out
inspect the .h5 files with:
h5dump
cancel jobs:
scancel <job_id>
or
scancel -n <job_name>
or
scancel -u $USER
partition | RAM (GB) | CPU | Cores | IB (HCA) | #Nodes |
---|---|---|---|---|---|
mem_0064* | 64 | 2x Intel E5-2650 v2 @ 2.60GHz | 2×8 | 2xQDR | 1849 |
mem_0128 | 128 | 2x Intel E5-2650 v2 @ 2.60GHz | 2×8 | 2xQDR | 140 |
mem_0256 | 256 | 2x Intel E5-2650 v2 @ 2.60GHz | 2×8 | 2xQDR | 50 |
vsc3plus_0064 | 64 | 2x Intel E5-2660 v2 @ 2.20GHz | 2×10 | 1xFDR | 816 |
vsc3plus_0256 | 256 | 2x Intel E5-2660 v2 @ 2.20GHz | 2×10 | 1xFDR | 48 |
binf | 512 - 1536 | 2x Intel E5-2690 v4 @ 2.60GHz | 2×14 | 1xFDR | 17 |
* default partition, QDR: Intel Truescale Infinipath (40Gbit/s), FDR: Mellanox ConnectX-3 (56Gbit/s)
effective: 10/2018
#SBATCH -p <partition>
partition | QOS |
---|---|
mem_0064* | normal_0064 |
mem_0128 | normal_0128 |
mem_0256 | normal_0256 |
vsc3plus_0064 | vsc3plus_0064 |
vsc3plus_0256 | vsc3plus_0256 |
binf | normal_binf |
#SBATCH --qos <QOS>
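For example, to request the 128 GB VSC-3 nodes with the matching QOS from the two tables above:

#SBATCH --partition=mem_0128
#SBATCH --qos=normal_0128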
partition | RAM (GB) | CPU | Cores | IB (HCA) | #Nodes |
---|---|---|---|---|---|
mem_0096* | 96 | 2x Intel Platinum 8174 @ 3.10GHz | 2×24 | 1xEDR | 688 |
mem_0384 | 384 | 2x Intel Platinum 8174 @ 3.10GHz | 2×24 | 1xEDR | 78 |
mem_0768 | 768 | 2x Intel Platinum 8174 @ 3.10GHz | 2×24 | 1xEDR | 12 |
* default partition, EDR: Intel Omni-Path (100Gbit/s)
effective: 10/2020
partition | QOS |
---|---|
mem_0096* | mem_0096 |
mem_0384 | mem_0384 |
mem_0768 | mem_0768 |
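Analogously on VSC-4, e.g. for the 384 GB nodes (QOS name taken from the table above):

#SBATCH --partition=mem_0384
#SBATCH --qos=mem_0384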
sinfo -o %P
scontrol show partition mem_0064
scontrol show node n301-001
1.+2.:
sqos -acc
default_account:      p70824
        account:      p70824

    default_qos:      normal_0064
            qos:      devel_0128
                      goodluck
                      gpu_gtx1080amd
                      gpu_gtx1080multi
                      gpu_gtx1080single
                      gpu_k20m
                      gpu_m60
                      knl
                      normal_0064
                      normal_0128
                      normal_0256
                      normal_binf
                      vsc3plus_0064
                      vsc3plus_0256
3.:
sqos
            qos_name  total  used  free    walltime  priority  partitions
=========================================================================
         normal_0064   1782  1173   609  3-00:00:00      2000  mem_0064
         normal_0256     15    24    -9  3-00:00:00      2000  mem_0256
         normal_0128     93    51    42  3-00:00:00      2000  mem_0128
          devel_0128     10    20   -10    00:10:00     20000  mem_0128
            goodluck      0     0     0  3-00:00:00      1000  vsc3plus_0256,vsc3plus_0064,amd
                 knl      4     1     3  3-00:00:00      1000  knl
         normal_binf     16     5    11  1-00:00:00      1000  binf
    gpu_gtx1080multi      4     2     2  3-00:00:00      2000  gpu_gtx1080multi
   gpu_gtx1080single     50    18    32  3-00:00:00      2000  gpu_gtx1080single
            gpu_k20m      2     0     2  3-00:00:00      2000  gpu_k20m
             gpu_m60      1     1     0  3-00:00:00      2000  gpu_m60
       vsc3plus_0064    800   781    19  3-00:00:00      1000  vsc3plus_0064
       vsc3plus_0256     48    44     4  3-00:00:00      1000  vsc3plus_0256
      gpu_gtx1080amd      1     0     1  3-00:00:00      2000  gpu_gtx1080amd
naming convention:
QOS | Partition |
---|---|
*_0064 | mem_0064 |
#SBATCH --account=xxxxxx
#SBATCH --qos=xxxxx_xxxx
#SBATCH --partition=mem_xxxx
If any of these lines are omitted, the corresponding defaults are used. See the previous slides; the default partition is "mem_0064".
default:
#!/bin/bash
#SBATCH -J jobname
#SBATCH -N number_of_nodes

do_my_work
the job is submitted to the default partition (mem_0064), the default QOS (normal_0064), and the user's default account
explicit:
#!/bin/bash
#SBATCH -J jobname
#SBATCH -N number_of_nodes
#SBATCH --partition=mem_xxxx
#SBATCH --qos=xxxxx_xxxx
#SBATCH --account=xxxxxx

do_my_work
SLURM_JOB_NUM_NODES
sbatch <SLURM_PARAMETERS> job.sh <JOB_PARAMETERS>
sqos
sqos -acc
scontrol show partition ...
scontrol show node ...
hostname
free
for i in {1..1000}
do
  sbatch job.sh $i
done
for i in {1..1000}
do
  mpirun my_program $i
done
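In the first loop above, each value of `$i` is handed to the job script as a positional parameter; a minimal sketch of what `job.sh` might do with it (the program name is an assumption):

#!/bin/bash
#SBATCH -J single_run
#SBATCH -N 1

# $1 holds the first parameter given after job.sh on the sbatch command line
./my_program $1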
#!/bin/sh
#SBATCH -J array
#SBATCH -N 1
#SBATCH --array=1-10

echo "Hi, this is array job number" $SLURM_ARRAY_TASK_ID
sleep $SLURM_ARRAY_TASK_ID
VSC-4 > squeue -u $USER
    JOBID         PARTITION  NAME   USER  ST  TIME     NODES  NODELIST(REASON)
    406846_[7-10] mem_0096   array  sh    PD  0:00     1      (Resources)
    406846_4      mem_0096   array  sh    R   INVALID  1      n403-062
    406846_5      mem_0096   array  sh    R   INVALID  1      n403-072
    406846_6      mem_0096   array  sh    R   INVALID  1      n404-031
VSC-4 > ls slurm-*
slurm-406846_10.out  slurm-406846_3.out  slurm-406846_6.out  slurm-406846_9.out
slurm-406846_1.out   slurm-406846_4.out  slurm-406846_7.out
slurm-406846_2.out   slurm-406846_5.out  slurm-406846_8.out
VSC-4 > cat slurm-406846_8.out
Hi, this is array job number 8
#SBATCH --array=1-20:5     # indices 1, 6, 11, 16 (step size 5)
#SBATCH --array=1-20:5%2   # as above, but at most 2 array tasks run at the same time
for ((i=1; i<=48; i++))
do
  stress --cpu 1 --timeout $i &
done
wait
...
#SBATCH --array=1-144:48

j=$SLURM_ARRAY_TASK_ID
((j+=47))
for ((i=$SLURM_ARRAY_TASK_ID; i<=$j; i++))
do
  stress --cpu 1 --timeout $i &
done
wait
examples/05_submitting_batch_jobs
#SBATCH | job environment |
---|---|
-N | SLURM_JOB_NUM_NODES |
--ntasks-per-core | SLURM_NTASKS_PER_CORE |
--ntasks-per-node | SLURM_NTASKS_PER_NODE |
--ntasks, -n | SLURM_NTASKS |
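A minimal sketch that simply prints these variables from inside a job; options that were not requested explicitly may leave the matching variable empty:

#!/bin/bash
#SBATCH -J envtest
#SBATCH -N 2
#SBATCH --ntasks-per-node=4

echo "nodes:          $SLURM_JOB_NUM_NODES"
echo "tasks per node: $SLURM_NTASKS_PER_NODE"
echo "total tasks:    $SLURM_NTASKS"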
#SBATCH --mail-user=yourmail@example.com
#SBATCH --mail-type=BEGIN,END
#SBATCH -t, --time=<time>
#SBATCH --time-min=<time>
time format: DD-HH:MM:SS
squeue -h -j $SLURM_JOBID -o %L
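Inside a job script, the remaining walltime can be captured and, for example, compared against a threshold before starting the next chunk of work; a minimal sketch:

# query the remaining walltime of the current job (format as printed by squeue, e.g. 1-03:20:10)
remaining=$(squeue -h -j $SLURM_JOBID -o %L)
echo "remaining walltime: $remaining"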
VSC-3 > slic
Within the SLURM submit script, add the flags as shown by 'slic', e.g. when both Matlab and Mathematica are required:
#SBATCH -L matlab@vsc,mathematica@vsc
Intel licenses are needed only when compiling code, not for running the resulting executables.
VSC-3 > scontrol show reservations
#SBATCH --reservation=<reservation_name>
echo "2+2" | matlab
Example: Two nodes with two MPI processes each:
#SBATCH -N 2
#SBATCH --tasks-per-node=2

srun --cpu_bind=map_cpu:0,24 ./my_mpi_program
#SBATCH -N 2
#SBATCH --tasks-per-node=2

export I_MPI_PIN_PROCESSOR_LIST=0,24   # Intel MPI syntax
mpirun ./my_mpi_program
#!/bin/bash
#SBATCH -J jobname
#SBATCH -N 2
#SBATCH -d afterany:<job_id>

srun ./my_program
3. continue at 2. for further dependent jobs (see the command-line sketch below)
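A sketch of such a chain from the command line, using --parsable to capture the job id of the first submission:

# 1. submit the first job and remember its id
jobid=$(sbatch --parsable job.sh)

# 2. submit the follow-up job; it starts only once the first one has finished
sbatch -d afterany:$jobid job.sh

# 3. continue at 2. for further dependent jobs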