Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revisionLast revisionBoth sides next revision | ||
pandoc:introduction-to-vsc:05_submitting_batch_jobs:slurm [2018/01/31 11:10] – Pandoc Auto-commit pandoc | pandoc:introduction-to-vsc:05_submitting_batch_jobs:slurm [2020/10/20 08:09] – Pandoc Auto-commit pandoc | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== SLURM ====== | ||
+ | |||
+ | * Article written by Markus Stöhr (VSC Team) < | ||
+ | |||
+ | |||
+ | |||
+ | ==== Quickstart ==== | ||
+ | |||
+ | script [[examples/ | ||
+ | |||
+ | < | ||
+ | #!/bin/bash | ||
+ | #SBATCH -J h5test | ||
+ | #SBATCH -N 1 | ||
+ | |||
+ | module purge | ||
+ | module load gcc/5.3 intel-mpi/5 hdf5/ | ||
+ | |||
+ | cp $VSC_HDF5_ROOT/ | ||
+ | mpicc -lhdf5 ph5example.c -o ph5example | ||
+ | |||
+ | mpirun -np 8 ./ | ||
+ | |||
+ | </ | ||
+ | submission: | ||
+ | |||
+ | < | ||
+ | $ sbatch job.sh | ||
+ | Submitted batch job 5250981 | ||
+ | </ | ||
+ | |||
+ | check what is going on: | ||
+ | |||
+ | < | ||
+ | squeue -u $USER | ||
+ | </ | ||
+ | < | ||
+ | JOBID PARTITION | ||
+ | 5250981 | ||
+ | </ | ||
+ | Output files: | ||
+ | |||
+ | < | ||
+ | ParaEg0.h5 | ||
+ | ParaEg1.h5 | ||
+ | slurm-5250981.out | ||
+ | </ | ||
+ | try on .h5 files: | ||
+ | |||
+ | < | ||
+ | h5dump | ||
+ | </ | ||
+ | |||
+ | cancel jobs: | ||
+ | |||
+ | < | ||
+ | scancel < | ||
+ | </ | ||
+ | or | ||
+ | |||
+ | < | ||
+ | scancel < | ||
+ | </ | ||
+ | or | ||
+ | |||
+ | < | ||
+ | scancel -u $USER | ||
+ | </ | ||
+ | ===== Basic concepts ===== | ||
+ | |||
+ | ==== Queueing system ==== | ||
+ | |||
+ | * job/batch script: | ||
+ | * shell script, that does everything needed to run your calculation | ||
+ | * independent of queueing system | ||
+ | * **use simple scripts** (max 50 lines, i.e. put complicated logic elsewhere) | ||
+ | * load modules from scratch (purge, then load) | ||
+ | |||
+ | |||
+ | * tell scheduler where/how to run jobs: | ||
+ | * #nodes | ||
+ | * nodetype | ||
+ | * … | ||
+ | |||
+ | |||
+ | * scheduler manages job allocation to compute nodes | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | {{..: | ||
+ | |||
+ | ==== SLURM: Accounts and Users ==== | ||
+ | |||
+ | {{..: | ||
+ | |||
+ | |||
+ | ==== SLURM: Partition and Quality of Service ==== | ||
+ | |||
+ | {{..: | ||
+ | |||
+ | |||
+ | ==== VSC-3 Hardware Types ==== | ||
+ | |||
+ | ^partition | ||
+ | |mem_0064* | ||
+ | |mem_0128 | ||
+ | |mem_0256 | ||
+ | |vsc3plus_0064| | ||
+ | |vsc3plus_0256| | ||
+ | |binf | ||
+ | |||
+ | |||
+ | * default partition, QDR: Intel Truescale Infinipath (40Gbit/s), FDR: Mellanox ConnectX-3 (56Gbit/s) | ||
+ | |||
+ | effective: 10/2018 | ||
+ | |||
+ | * + GPU nodes (see later) | ||
+ | * specify partition in job script: | ||
+ | |||
+ | < | ||
+ | #SBATCH -p < | ||
+ | </ | ||
+ | ==== Standard QOS ==== | ||
+ | |||
+ | ^partition | ||
+ | |mem_0064* | ||
+ | |mem_0128 | ||
+ | |mem_0256 | ||
+ | |vsc3plus_0064|vsc3plus_0064| | ||
+ | |vsc3plus_0256|vsc3plus_0256| | ||
+ | |binf | ||
+ | |||
+ | |||
+ | * specify QOS in job script: | ||
+ | |||
+ | < | ||
+ | #SBATCH --qos <QOS> | ||
+ | </ | ||
+ | |||
+ | ---- | ||
+ | |||
+ | ==== VSC-4 Hardware Types ==== | ||
+ | |||
+ | ^partition^ | ||
+ | |mem_0096*| | ||
+ | |mem_0384 | 384 |2x Intel Platinum 8174 @ 3.10GHz| | ||
+ | |mem_0768 | 768 |2x Intel Platinum 8174 @ 3.10GHz| | ||
+ | |||
+ | |||
+ | * default partition, EDR: Intel Omni-Path (100Gbit/s) | ||
+ | |||
+ | effective: 10/2020 | ||
+ | |||
+ | ==== Standard QOS ==== | ||
+ | |||
+ | ^partition^QOS | ||
+ | |mem_0096*|mem_0096| | ||
+ | |mem_0384 |mem_0384| | ||
+ | |mem_0768 |mem_0768| | ||
+ | |||
+ | |||
+ | |||
+ | ---- | ||
+ | |||
+ | ==== VSC Hardware Types ==== | ||
+ | |||
+ | * Display information about partitions and their nodes: | ||
+ | |||
+ | < | ||
+ | sinfo -o %P | ||
+ | scontrol show partition mem_0064 | ||
+ | scontrol show node n301-001 | ||
+ | </ | ||
+ | |||
+ | ==== QOS-Account/ | ||
+ | |||
+ | |||
+ | {{..: | ||
+ | |||
+ | 1.+2.: | ||
+ | |||
+ | < | ||
+ | sqos -acc | ||
+ | </ | ||
+ | |||
+ | < | ||
+ | default_account: | ||
+ | account: | ||
+ | |||
+ | default_qos: | ||
+ | qos: devel_0128 | ||
+ | goodluck | ||
+ | gpu_gtx1080amd | ||
+ | gpu_gtx1080multi | ||
+ | | ||
+ | gpu_k20m | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== QOS-Partition assignment ==== | ||
+ | |||
+ | |||
+ | 3.: | ||
+ | |||
+ | < | ||
+ | sqos | ||
+ | </ | ||
+ | < | ||
+ | qos_name total used free | ||
+ | ========================================================================= | ||
+ | | ||
+ | | ||
+ | | ||
+ | devel_0128 | ||
+ | goodluck | ||
+ | | ||
+ | | ||
+ | gpu_gtx1080multi | ||
+ | | ||
+ | gpu_k20m | ||
+ | | ||
+ | | ||
+ | | ||
+ | gpu_gtx1080amd | ||
+ | </ | ||
+ | naming convention: | ||
+ | |||
+ | ^QOS | ||
+ | |*_0064|mem_0064 | | ||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | |||
+ | ---- | ||
+ | |||
+ | ==== Specification in job script ==== | ||
+ | |||
+ | |||
+ | < | ||
+ | #SBATCH --account=xxxxxx | ||
+ | #SBATCH --qos=xxxxx_xxxx | ||
+ | #SBATCH --partition=mem_xxxx | ||
+ | </ | ||
+ | For omitted lines corresponding defaults are used. See previous slides, default partition is “mem_0064” | ||
+ | |||
+ | |||
+ | ==== Sample batch job ==== | ||
+ | |||
+ | default: | ||
+ | |||
+ | < | ||
+ | #!/bin/bash | ||
+ | #SBATCH -J jobname | ||
+ | #SBATCH -N number_of_nodes | ||
+ | |||
+ | do_my_work | ||
+ | </ | ||
+ | job is submitted to: | ||
+ | |||
+ | * partition mem_0064 | ||
+ | * qos normal_0064 | ||
+ | * default account | ||
+ | |||
+ | |||
+ | |||
+ | explicit: | ||
+ | |||
+ | < | ||
+ | #!/bin/bash | ||
+ | #SBATCH -J jobname | ||
+ | #SBATCH -N number_of_nodes | ||
+ | #SBATCH | ||
+ | #SBATCH | ||
+ | #SBATCH --partition=mem_xxxx | ||
+ | #SBATCH --qos=xxxxx_xxxx | ||
+ | #SBATCH --account=xxxxxx | ||
+ | |||
+ | do_my_work | ||
+ | </ | ||
+ | |||
+ | |||
+ | |||
+ | * must be a shell script (first line!) | ||
+ | * ‘# | ||
+ | * environment variables are set by SLURM for use within the script (e.g. '' | ||
+ | |||
+ | |||
+ | |||
+ | ==== Job submission ==== | ||
+ | |||
+ | < | ||
+ | sbatch < | ||
+ | </ | ||
+ | * parameters are specified as in job script | ||
+ | * precedence: sbatch parameters override parameters in job script | ||
+ | * be careful to place SLURM parameters **before** job script | ||
+ | |||
+ | ==== Exercises ==== | ||
+ | |||
+ | * try these commands and find out which partition has to be used if you want to run in QOS ‘devel_0128’: | ||
+ | |||
+ | < | ||
+ | sqos | ||
+ | sqos -acc | ||
+ | </ | ||
+ | * find out, which nodes are in the partition that allows running in ‘devel_0128’. Further, check how much memory these nodes have: | ||
+ | |||
+ | < | ||
+ | scontrol show partition ... | ||
+ | scontrol show node ... | ||
+ | </ | ||
+ | * submit a one node job to QOS devel_0128 with the following commands: | ||
+ | |||
+ | < | ||
+ | hostname | ||
+ | free | ||
+ | </ | ||
+ | ==== Bad job practices ==== | ||
+ | |||
+ | * job submissions in a loop (takes a long time): | ||
+ | |||
+ | < | ||
+ | for i in {1..1000} | ||
+ | do | ||
+ | sbatch job.sh $i | ||
+ | done | ||
+ | </ | ||
+ | |||
+ | * loop inside job script (sequential mpirun commands): | ||
+ | |||
+ | < | ||
+ | for i in {1..1000} | ||
+ | do | ||
+ | mpirun my_program $i | ||
+ | done | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== Array jobs ==== | ||
+ | |||
+ | * submit/run a series of **independent** jobs via a single SLURM script | ||
+ | * each job in the array gets a unique identifier (SLURM_ARRAY_TASK_ID) based on which various workloads can be organized | ||
+ | * example ([[examples/ | ||
+ | |||
+ | < | ||
+ | #!/bin/sh | ||
+ | #SBATCH -J array | ||
+ | #SBATCH -N 1 | ||
+ | #SBATCH --array=1-10 | ||
+ | |||
+ | echo "Hi, this is array job number" | ||
+ | sleep $SLURM_ARRAY_TASK_ID | ||
+ | </ | ||
+ | * independent jobs: 1, 2, 3 … 10 | ||
+ | |||
+ | < | ||
+ | VSC-4 > squeue | ||
+ | JOBID PARTITION | ||
+ | | ||
+ | 406846_4 | ||
+ | 406846_5 | ||
+ | 406846_6 | ||
+ | </ | ||
+ | |||
+ | < | ||
+ | VSC-4 > ls slurm-* | ||
+ | slurm-406846_10.out | ||
+ | slurm-406846_1.out | ||
+ | slurm-406846_2.out | ||
+ | </ | ||
+ | |||
+ | < | ||
+ | VSC-4 > cat slurm-406846_8.out | ||
+ | Hi, this is array job number | ||
+ | </ | ||
+ | |||
+ | |||
+ | |||
+ | * fine-tuning via builtin variables (SLURM_ARRAY_TASK_MIN, | ||
+ | |||
+ | * example of going in chunks of a certain size, e.g. 5, SLURM_ARRAY_TASK_ID=1, | ||
+ | |||
+ | < | ||
+ | #SBATCH --array=1-20: | ||
+ | </ | ||
+ | |||
+ | * example of limiting number of simultaneously running jobs to 2 (perhaps for licences) | ||
+ | |||
+ | < | ||
+ | #SBATCH --array=1-20: | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== Single core jobs ==== | ||
+ | |||
+ | * use an entire compute node for several independent jobs | ||
+ | * example: [[examples/ | ||
+ | |||
+ | < | ||
+ | for ((i=1; i<=48; i++)) | ||
+ | do | ||
+ | | ||
+ | done | ||
+ | wait | ||
+ | </ | ||
+ | * ‘& | ||
+ | * ‘wait’: waits for all processes in the background, otherwise script would terminate | ||
+ | |||
+ | |||
+ | ==== Combination of array & single core job ==== | ||
+ | |||
+ | * example: [[examples/ | ||
+ | |||
+ | < | ||
+ | ... | ||
+ | #SBATCH --array=1-144: | ||
+ | |||
+ | j=$SLURM_ARRAY_TASK_ID | ||
+ | ((j+=47)) | ||
+ | |||
+ | for ((i=$SLURM_ARRAY_TASK_ID; | ||
+ | do | ||
+ | | ||
+ | done | ||
+ | wait | ||
+ | |||
+ | </ | ||
+ | ==== Exercises ==== | ||
+ | |||
+ | * files are located in folder '' | ||
+ | * look into [[examples/ | ||
+ | * look into [[examples/ | ||
+ | * run [[examples/ | ||
+ | |||
+ | ==== Job/process setup ==== | ||
+ | |||
+ | * normal jobs: | ||
+ | |||
+ | ^# | ||
+ | |-N | ||
+ | |--ntasks-per-core|SLURM_NTASKS_PER_CORE| | ||
+ | |--ntasks-per-node|SLURM_NTASKS_PER_NODE| | ||
+ | |--ntasks, -n | ||
+ | |||
+ | * emails: | ||
+ | |||
+ | < | ||
+ | #SBATCH --mail-user=yourmail@example.com | ||
+ | #SBATCH --mail-type=BEGIN, | ||
+ | </ | ||
+ | |||
+ | * constraints: | ||
+ | |||
+ | < | ||
+ | #SBATCH -t, --time=< | ||
+ | #SBATCH --time-min=< | ||
+ | </ | ||
+ | |||
+ | time format: | ||
+ | |||
+ | * DD-HH[: | ||
+ | |||
+ | |||
+ | |||
+ | * backfilling: | ||
+ | * get the remaining running time for your job: | ||
+ | |||
+ | < | ||
+ | squeue -h -j $SLURM_JOBID -o %L | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== Licenses ==== | ||
+ | |||
+ | {{..: | ||
+ | |||
+ | |||
+ | < | ||
+ | VSC-3 > slic | ||
+ | </ | ||
+ | Within the SLURM submit script add the flags as shown with ‘slic’, e.g. when both Matlab and Mathematica are required | ||
+ | |||
+ | < | ||
+ | #SBATCH -L matlab@vsc, | ||
+ | </ | ||
+ | Intel licenses are needed only when compiling code, not for running resulting executables | ||
+ | |||
+ | ==== Reservation of compute nodes ==== | ||
+ | |||
+ | * core-h accounting is done for the entire period of reservation | ||
+ | * contact service@vsc.ac.at | ||
+ | * reservations are named after the project id | ||
+ | |||
+ | * check for reservations: | ||
+ | |||
+ | < | ||
+ | VSC-3 > scontrol show reservations | ||
+ | </ | ||
+ | * usage: | ||
+ | |||
+ | < | ||
+ | #SBATCH --reservation= | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== Exercises ==== | ||
+ | |||
+ | * check for available reservations. If there is one available, use it | ||
+ | * specify an email address that notifies you when the job has finished | ||
+ | * run the following matlab code in your job: | ||
+ | |||
+ | < | ||
+ | echo " | ||
+ | </ | ||
+ | ==== MPI + pinning ==== | ||
+ | |||
+ | * understand what your code is doing and place the processes correctly | ||
+ | * use only a few processes per node if memory demand is high | ||
+ | * details for pinning: https:// | ||
+ | |||
+ | Example: Two nodes with two MPI processes each: | ||
+ | |||
+ | === srun === | ||
+ | |||
+ | < | ||
+ | #SBATCH -N 2 | ||
+ | #SBATCH --tasks-per-node=2 | ||
+ | |||
+ | srun --cpu_bind=map_cpu: | ||
+ | |||
+ | </ | ||
+ | |||
+ | === mpirun === | ||
+ | |||
+ | < | ||
+ | #SBATCH -N 2 | ||
+ | #SBATCH --tasks-per-node=2 | ||
+ | |||
+ | export I_MPI_PIN_PROCESSOR_LIST=0, | ||
+ | mpirun ./ | ||
+ | </ | ||
+ | |||
+ | |||
+ | ==== Job dependencies ==== | ||
+ | |||
+ | - Submit first job and get its <job id> | ||
+ | - Submit dependent job (and get < | ||
+ | |||
+ | < | ||
+ | #!/bin/bash | ||
+ | #SBATCH -J jobname | ||
+ | #SBATCH -N 2 | ||
+ | #SBATCH -d afterany:< | ||
+ | srun ./ | ||
+ | </ | ||
+ | < | ||
+ | < | ||
+ | |||
+ | |||
+ | ---- | ||
+ | |||