1.系统:CentOS 7.9
2.VMware Workstation Pro 16
3.三台虚拟机
服务器 | IP | 主机名 |
控制节点 | 192.168.11.11 | master |
计算节点1 | 192.168.11.22 | node01 |
计算节点2 | 192.168.11.33 | node02 |
systemctl stop firewalld
systemctl disable firewalld
sed -i -e 's/^SELINUX=.*/SELINUX=disabled/g' /etc/selinux/config
setenforce 0
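# 分别在对应的机器上执行(hostnamectl 会写入 /etc/hostname,重启后依然生效;hostname 命令只是临时修改)
hostnamectl set-hostname master   # 控制节点
hostnamectl set-hostname node01   # 计算节点1
hostnamectl set-hostname node02   # 计算节点2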
vim /etc/hosts
192.168.11.11 master
192.168.11.22 node01
192.168.11.33 node02
reboot
echo y| ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
ssh-copy-id -i ~/.ssh/id_rsa.pub -o StrictHostKeyChecking=no root@node01
ssh-copy-id -i ~/.ssh/id_rsa.pub -o StrictHostKeyChecking=no root@node02
yum -y install net-tools wget vim ntpdate chrony htop glances nfs-utils rpcbind python3
yum install -y epel-release
yum install munge munge-libs munge-devel -y
create-munge-key
scp -p /etc/munge/munge.key root@node01:/etc/munge
scp -p /etc/munge/munge.key root@node02:/etc/munge
# 计算节点上面执行
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
systemctl restart munge
systemctl enable munge
systemctl status munge
# 本地查看凭据
munge -n
# 本地解码
munge -n | unmunge
# 验证compute node,远程解码
munge -n | ssh node01 unmunge
# Munge凭证基准测试
remunge
groupadd -g 1109 slurm
useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm
安装Slurm依赖
yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel http-parser-devel json-c-devel libjwt libjwt-devel -y
编译Slurm和安装Slurm
# 下载地址
wget https://download.schedmd.com/slurm/slurm-21.08.8.tar.bz2
rpmbuild -ta --with mysql --with slurmrestd --with jwt slurm-21.08.8.tar.bz2
cd /root/rpmbuild/RPMS/x86_64/
yum localinstall -y slurm-*
配置控制节点Slurm (控制节点上执行)
# 控制节点上面执行
cp /etc/slurm/cgroup.conf.example /etc/slurm/cgroup.conf
cp /etc/slurm/slurm.conf.example /etc/slurm/slurm.conf
参考官网: https://slurm.schedmd.com/quickstart_admin.html
ControlMachine 改为控制节点的名称
ControlAddr 改为控制节点的 IP
NodeName 改为控制节点的名称,计算节点名称
Nodes 改为计算节点的名称
vim /etc/slurm/slurm.conf
ClusterName=cluster
ControlMachine=master
ControlAddr=192.168.11.11
SlurmctldDebug=info
SlurmdDebug=debug3
GresTypes=gpu
MpiDefault=none
ProctrackType=proctrack/cgroup
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurm
SlurmUser=slurm
StateSaveLocation=/var/spool/slurm/ctld
SwitchType=switch/none
TaskPlugin=task/affinity,task/cgroup
TaskPluginParam=verbose
MinJobAge=172800
AccountingStorageEnforce=associations
AccountingStorageHost=master
AccountingStoragePort=6819
AccountingStorageType=accounting_storage/slurmdbd
AccountingStoreFlags=job_comment
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=/var/spool/slurm/ctld/jwt_hs256.key
NodeName=master,node[01-02] CPUs=1 RealMemory=1024 State=UNKNOWN
PartitionName=compute Nodes=node[01-02] Default=YES MaxTime=INFINITE State=UP
scp /etc/slurm/*.conf node01:/etc/slurm/
scp /etc/slurm/*.conf node02:/etc/slurm/
mkdir -p /var/spool/slurm
chown slurm: /var/spool/slurm
mkdir -p /var/log/slurm
chown slurm: /var/log/slurm
wget https://dev.mysql.com/get/Downloads/MySQL-8.0/mysql-8.0.16-2.el7.x86_64.rpm-bundle.tar
tar -xvf mysql-8.0.16-2.el7.x86_64.rpm-bundle.tar
# 需要安装的 rpm 包: common、libs、client、server
rpm -ivh mysql-community-client-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-libs-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-common-8.0.16-2.el7.x86_64.rpm --nodeps --force
rpm -ivh mysql-community-server-8.0.16-2.el7.x86_64.rpm --nodeps --force
mkdir -p /data/mysql
vim /etc/my.cnf
character-set-server=utf8
collation-server=utf8_general_ci
lower_case_table_names=1
max_connections=10240
open_files_limit=10240
max_connect_errors=10240
max_allowed_packet=10M
port=3306
sql_mode=NO_ENGINE_SUBSTITUTION,STRICT_TRANS_TABLES
[client]
socket=/data/mysql/mysql.sock
sed -i 's#^datadir.*#datadir=/data/mysql#g' /etc/my.cnf
sed -i 's#^socket.*#socket=/data/mysql/mysql.sock#g' /etc/my.cnf
systemctl restart mysqld
systemctl enable mysqld
systemctl status mysqld
# 在日志中获取密码
grep 'temporary password' /var/log/mysqld.log
# 登录mysql,密码是上一步日志获取的
mysql -uroot -p
# 修改root密码
ALTER user root@localhost identified by '123456';
use mysql;
update user set host = '%' where user = 'root';
select host, user, authentication_string, plugin from user;
# 刷新立即生效
flush privileges;
#登录mysql
mysql -uroot -p
CREATE USER 'slurm'@'%' identified with mysql_native_password by 'Slurm*1234';
GRANT ALL ON slurm_acct_db.* TO 'slurm'@'%';
flush privileges;
DbdAddr 控制节点 IP
DbdHost 控制节点 主机名
StorageHost MySQL 所在节点(本文为计算节点)的 IP 或主机名
StorageUser 创建的用户 #slurm
StoragePass mysql 登录密码
StorageLoc slurm_acct_db #db名,slurmdbd会自动创建db
StoragePort mysql 端口号
# 控制节点上面执行
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
vim /etc/slurm/slurmdbd.conf
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
DbdAddr=192.168.11.11
DbdHost=master
SlurmUser=slurm
DebugLevel=verbose
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=192.168.11.33
StorageUser=slurm
StoragePass=Slurm*1234
StorageLoc=slurm_acct_db
StoragePort=3306
AuthAltTypes=auth/jwt
AuthAltParameters=jwt_key=/var/spool/slurm/ctld/jwt_hs256.key
chown slurm: /etc/slurm/slurmdbd.conf
chown slurm: /etc/slurm/slurm.conf
mkdir -p /var/spool/slurm/ctld
dd if=/dev/random of=/var/spool/slurm/ctld/jwt_hs256.key bs=32 count=1
chown slurm:slurm /var/spool/slurm/ctld/jwt_hs256.key
chmod 0600 /var/spool/slurm/ctld/jwt_hs256.key
# chown root:root /etc/slurm
chmod 0755 /var/spool/slurm/ctld
chown slurm:slurm /var/spool/slurm/ctld
# 启动控制节点Slurmdbd服务
systemctl restart slurmdbd
systemctl enable slurmdbd
systemctl status slurmdbd
# 启动控制节点slurmctld服务
systemctl restart slurmctld
systemctl enable slurmctld
systemctl status slurmctld
# 启动计算节点的服务
systemctl restart slurmd
systemctl enable slurmd
systemctl status slurmd
检查集群(控制节点和计算节点上面都可以执行)
# 查看集群
sinfo
scontrol show partition
scontrol show node
# 提交作业
srun -N2 hostname
scontrol show jobs
# 查看作业
squeue -a
# 恢复处于 down 状态的节点: NodeName 填写 down 节点的名称(此处以 node01 为例)
scontrol update NodeName=node01 State=RESUME