sinfo #查看服务器节点和分区
squeue -u username #查看你当前运行的任务
scontrol show job JOBID #查看指定的jobID的状态
scancel JOBID #取消指定JOBID的任务
sbatch test.sh #以批处理方式提交test.sh这个作业脚本。
scontrol show node #显示所有node节点的硬件信息
scontrol show node node02 #查看名字为node02的节点的硬件信息
smap #以图形的方式显示运行的任务
##slurm资源管理系统命令
#运行命令方式:sbatch run.sh
srun sh test.sh #分别运行对应的sh脚本
srun python test.py #运行python程序
srun Rscript test.R #运行R程序
$ module load slurm
$ sbatch slurm-job.sh
Submitted batch job 106
查看当前任务(ST列为任务状态):PD排队;R运行;S挂起;CG正在退出
$ squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
106 defq slurm-jo rstober R 0:04 1 atom01
$ scontrol show job 106
JobId=106 Name=slurm-job.sh
UserId=rstober(1001) GroupId=rstober(1001)
Priority=4294901717 Account=(null) QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 ExitCode=0:0
RunTime=00:00:07 TimeLimit=UNLIMITED TimeMin=N/A
SubmitTime=2013-01-26T12:55:02 EligibleTime=2013-01-26T12:55:02
StartTime=2013-01-26T12:55:02 EndTime=Unknown
PreemptTime=None SuspendTime=None SecsPreSuspend=0
Partition=defq AllocNode:Sid=atom-head1:3526
ReqNodeList=(null) ExcNodeList=(null)
NodeList=atom01
BatchHost=atom01
NumNodes=1 NumCPUs=2 CPUs/Task=1 ReqS:C:T=*:*:*
MinCPUsNode=1 MinMemoryNode=0 MinTmpDiskNode=0
Features=(null) Gres=(null) Reservation=(null)
Shared=0 Contiguous=0 Licenses=(null) Network=(null)
Command=/home/rstober/slurm/local/slurm-job.sh
WorkDir=/home/rstober/slurm/local
# scontrol suspend 135
# squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
135 defq simple.s rstober S 0:10 1 atom01
# scontrol resume 135
# squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
135 defq simple.s rstober R 0:13 1 atom01
$ scancel 135
$ squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
$ squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
139 defq simple rstober PD 0:00 1 (Dependency)
138 defq simple rstober R 0:16 1 atom01
$ scontrol hold 139
$ squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
139 defq simple rstober PD 0:00 1 (JobHeldUser)
138 defq simple rstober R 0:32 1 atom01
$ scontrol release 139
$ squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
139 defq simple rstober PD 0:00 1 (Dependency)
138 defq simple rstober R 0:46 1 atom01
显示节点信息;idle表示节点空闲。
$ sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
defq* up infinite 1 down* atom04
defq* up infinite 3 idle atom[01-03]
cloud up infinite 2 down* cnode1,cnodegpu1
cloudtran up infinite 1 idle atom-head1
参考: