当前位置: 首页 > 工具软件 > SLURM > 使用案例 >

Centos7搭建slurm-21.08.8作业管理系统集群

裴昕
2023-12-01

本地环境

1.系统:centos 7.9

2.VMware Workstation Pro 16

3.三台虚拟机

服务器IP主机名
控制节点192.168.11.11master
计算节点1192.168.11.22node01
计算节点2192.168.11.33node02

一 、基础配置(所有机器均执行)

关闭防火墙

# Stop and disable firewalld so Munge/Slurm ports (6817-6819) are reachable.
systemctl stop firewalld
systemctl disable firewalld
# Permanently disable SELinux (effective after reboot)...
sed -i -e  's/^SELINUX=.*/SELINUX=disabled/g' /etc/selinux/config
# ...and switch it off for the current session.
setenforce 0

设置主机名

# Use hostnamectl so the name persists across the reboot performed below;
# a plain 'hostname' call only sets the transient hostname and is lost on reboot.
hostnamectl set-hostname master   # on the control node
hostnamectl set-hostname node01   # on compute node 1
hostnamectl set-hostname node02   # on compute node 2

设置hosts,重启生效

# Map every cluster IP to its hostname on all three machines; reboot to apply.
vim /etc/hosts
192.168.11.11 master
192.168.11.22 node01
192.168.11.33 node02
reboot

配置SSH免登陆(控制节点上边执行)

# Generate a passphrase-less RSA key ('echo y' overwrites any existing key)
echo y| ssh-keygen -t rsa -P '' -f  ~/.ssh/id_rsa
# Push the public key to both compute nodes for password-less root SSH
ssh-copy-id -i ~/.ssh/id_rsa.pub  -o  StrictHostKeyChecking=no root@node01
ssh-copy-id -i ~/.ssh/id_rsa.pub  -o  StrictHostKeyChecking=no root@node02

安装依赖软件

yum -y install net-tools wget vim ntpdate chrony htop glances nfs-utils rpcbind python32

二 、配置Munge(所有机器均执行)

安装epel-release

yum install -y epel-release 

安装munge

yum install munge munge-libs munge-devel -y

创建全局密钥(控制节点上面执行)

create-munge-key

密钥同步到所有计算节点(控制节点上面执行)

# Copy the shared munge key from the control node to every compute node.
# (Fixed typo: the original targeted 'ndoe01' instead of 'node01'.)
scp -p /etc/munge/munge.key root@node01:/etc/munge
scp -p /etc/munge/munge.key root@node02:/etc/munge

# On each compute node: munge must own its key, readable only by owner
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key

启动所有节点

# Run on every node: start munge, enable it at boot, and verify it is active
systemctl restart munge
systemctl enable munge
systemctl status munge

测试Munge服务,每个计算节点与控制节点进行连接验证

# Generate a credential locally
munge -n
# Decode the credential locally
munge -n | unmunge
# Decode remotely on a compute node to verify cross-node authentication
munge -n | ssh node01 unmunge
# Benchmark munge credential throughput
remunge

三 、配置Slurm(所有机器均执行)

创建Slurm用户

# Fixed UID/GID (1109) so the slurm account is identical on every node
groupadd -g 1109 slurm
useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm

 安装Slurm依赖

yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel http-parser-devel json-c-devel libjwt  libjwt-devel -y

编译Slurm和安装Slurm

# Download URL

wget https://download.schedmd.com/slurm/slurm-21.08.8.tar.bz2

# Build RPMs with MySQL accounting, slurmrestd and JWT support enabled
rpmbuild -ta --with mysql --with slurmrestd --with jwt slurm-21.08.8.tar.bz2
cd /root/rpmbuild/RPMS/x86_64/
yum localinstall -y slurm-*

配置控制节点Slurm (控制节点上执行)

# On the control node: start from the shipped example configs
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf

参考官网:    https://slurm.schedmd.com/quickstart_admin.html

ControlMachine        改为控制节点的名称

ControlAddr              改为控制节点的 IP

NodeName               改为控制节点的名称,计算节点名称

Nodes                       改为计算节点的名称

vim /etc/slurm/slurm.conf
ClusterName=cluster
# NOTE(review): ControlMachine/ControlAddr are the legacy parameter names;
# current releases prefer SlurmctldHost=master(192.168.11.11) — confirm
# against the 21.08 slurm.conf man page.
ControlMachine=master
ControlAddr=192.168.11.11
SlurmctldDebug=info
SlurmdDebug=debug3
# GPU GRES type is declared, but no Gres= entries appear on the NodeName line
GresTypes=gpu
MpiDefault=none
ProctrackType=proctrack/cgroup
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/ctld
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=verbose
# Completed jobs are kept in slurmctld memory for 2 days (172800 s)
MinJobAge=172800
AccountingStorageEnforce=associations
AccountingStorageHost=master
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log
# JWT auth (used by slurmrestd); the key file is created in the JWT section below
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=/var/spool/slurm/ctld/jwt_hs256.key
NodeName=master,node[01-02] CPUs=1 RealMemory=1024 State=UNKNOWN
PartitionName=compute Nodes=node[01-02] Default=YES MaxTime=INFINITE State=UP

复制控制节点配置文件到计算节点(控制节点上面执行)

# Slurm requires an identical config on every node; push it to the compute nodes
scp /etc/slurm/*.conf node01:/etc/slurm/
scp /etc/slurm/*.conf node02:/etc/slurm/

设置控制、计算节点文件权限 

# Spool and log directories must exist and be owned by the slurm user on all nodes
mkdir -p /var/spool/slurm
chown slurm: /var/spool/slurm
mkdir -p /var/log/slurm
chown slurm: /var/log/slurm

四 、搭建slurmdbd环境

mysql服务端安装(这里在计算节点2,即node02(192.168.11.33)中安装)

下载(mysql 8.0)

wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-8.0.16-2.el7.x86_64.rpm-bundle.tar

解压缩

tar -xvf mysql-8.0.16-2.el7.x86_64.rpm-bundle.tar

安装 common、libs、client、server 四个包

# NOTE(review): --nodeps --force disables RPM dependency checking; installing
# in order common -> libs -> client -> server would avoid needing it — confirm.
rpm -ivh mysql-community-client-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-libs-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-common-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-server-8.0.16-2.el7.x86_64.rpm --nodeps --force

创建文件夹

mkdir -p /data/mysql

增加配置文件

vim /etc/my.cnf

# Server options must appear under a [mysqld] group header; otherwise mysqld
# rejects the file ("found option without preceding group").
[mysqld]
character-set-server=utf8
collation-server=utf8_general_ci
lower_case_table_names=1
max_connections=10240
open_files_limit=10240
max_connect_errors=10240
max_allowed_packet=10M
port=3306
sql_mode=NO_ENGINE_SUBSTITUTION,STRICT_TRANS_TABLES
[client]
socket=/data/mysql/mysql.sock

修改如下路径

# Point the default my.cnf's datadir and socket at /data/mysql
sed -i 's#^datadir.*#datadir=/data/mysql#g'  /etc/my.cnf
sed -i 's#^socket.*#socket=/data/mysql/mysql.sock#g'  /etc/my.cnf

启动服务

# Start MySQL, enable at boot, verify it is running
systemctl restart mysqld 
systemctl enable mysqld 
systemctl status mysqld

修改密码

# Grab the auto-generated temporary root password from the server log
grep 'temporary password' /var/log/mysqld.log

# Log in to MySQL using the temporary password from the previous step
mysql -uroot -p

# Change the root password
ALTER user root@localhost identified by '123456';

use mysql;
update user set host = '%' where user = 'root';
select host, user, authentication_string, plugin from user;

# Make the privilege changes take effect immediately
flush privileges;

创建数据库的Slurm用户

# Log in to MySQL as root (fixed typo: the option is -uroot, not 'uroot')
mysql -uroot -p

# Create the slurm DB user; mysql_native_password is required because the
# slurmdbd MySQL client library cannot use caching_sha2_password.
CREATE USER 'slurm'@'%' identified with mysql_native_password  by 'Slurm*1234';
GRANT ALL ON slurm_acct_db.* TO 'slurm'@'%';
flush privileges;

配置slurmdbd.conf文件 

DbdAddr                控制节点 IP

DbdHost                控制节点 主机名

StorageHost          数据库所在节点的 IP(本文为 192.168.11.33,即 node02)

StorageUser          创建的数据库用户                         #slurm

StoragePass          mysql 登录密码

StorageLoc            slurm_acct_db                   #db名,slurmdbd会自动创建db

StoragePort            mysql 端口号

# On the control node: start from the shipped example config
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf

vim /etc/slurm/slurmdbd.conf

AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
DbdAddr=192.168.11.11
DbdHost=master
SlurmUser=slurm
DebugLevel=verbose
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=192.168.11.33
StorageUser=slurm
# Must match the password of the 'slurm' MySQL user created above
# (the original had 123456, which belongs to root, not slurm).
StoragePass=Slurm*1234
StorageLoc=slurm_acct_db
StoragePort=3306

AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=/var/spool/slurm/ctld/jwt_hs256.key

设置权限(控制节点执行)

# slurmdbd.conf contains the DB password; slurmdbd requires it owned by SlurmUser
chown slurm: /etc/slurm/slurmdbd.conf
chown slurm: /etc/slurm/slurm.conf

添加JWT键到控制器

mkdir -p /var/spool/slurm/ctld

# 32 random bytes as the HS256 JWT signing key, readable only by slurm
dd if=/dev/random of=/var/spool/slurm/ctld/jwt_hs256.key bs=32 count=1
chown slurm:slurm /var/spool/slurm/ctld/jwt_hs256.key
chmod 0600 /var/spool/slurm/ctld/jwt_hs256.key
# chown root:root /etc/slurm
chmod 0755 /var/spool/slurm/ctld
chown slurm:slurm /var/spool/slurm/ctld

 启动服务

# Start slurmdbd on the control node (must be up before slurmctld)
systemctl restart slurmdbd
systemctl enable slurmdbd
systemctl status slurmdbd

# Start slurmctld on the control node
systemctl restart slurmctld
systemctl enable slurmctld
systemctl status slurmctld

# Start slurmd on every compute node
systemctl restart slurmd
systemctl enable slurmd
systemctl status slurmd

五、检查Slurm集群

检查集群(控制节点和计算节点上面都可以执行)

# Show cluster, partition and node state
sinfo
scontrol show partition
scontrol show node

# Submit a trivial two-node job
srun -N2 hostname
scontrol show jobs

# List every job in the queue
squeue -a

节点为down时

NodeName=down节点的名称

scontrol update NodeName=c11 State=RESUME

 类似资料: