导语:经过确认,在 Ubuntu 20.04 上如果先安装 nvidia-docker、之后再安装 NVIDIA 驱动,就会出现问题,重启也无法解决。
整理后的正确顺序是:先安装 NVIDIA 驱动(CUDA),然后再安装 nvidia-docker2,这样才没有问题。
#!/bin/bash
# Absolute path of the directory that holds this script. The cd runs inside
# the command substitution, so the caller's working directory is unchanged.
localpath="$(cd "$(dirname "$0")"; pwd)"
# Colored, timestamped status messages.

# Print $1 in blue, prefixed with the current time as YYYYmmdd_HHMMSS.
# Backslash escapes inside the message are expanded (%b), as with echo -e.
blue() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '\033[34m %s>>>>>> %b \033[0m\n' "$stamp" "$1"
}
# Print $1 in green, prefixed with the current time as YYYYmmdd_HHMMSS.
# Backslash escapes inside the message are expanded (%b), as with echo -e.
green() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '\033[32m %s>>>>>> %b \033[0m\n' "$stamp" "$1"
}
# Print $1 in red, prefixed with the current time as YYYYmmdd_HHMMSS.
# Backslash escapes inside the message are expanded (%b), as with echo -e.
red() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '\033[31m %s>>>>>> %b \033[0m\n' "$stamp" "$1"
}
# Print $1 in yellow, prefixed with the current time as YYYYmmdd_HHMMSS.
# Backslash escapes inside the message are expanded (%b), as with echo -e.
yellow() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '\033[33m %s>>>>>> %b \033[0m\n' "$stamp" "$1"
}
#######################################
# Install the NVIDIA 470.63.01 driver unless nvidia-smi already reports that
# exact version, then enable persistence mode now and from /etc/rc.local.
# Globals:   localpath (read) - directory containing the NVIDIA .run installer
# Arguments: none
# Outputs:   progress messages via yellow/red/green/blue
# Returns:   exits with status 1 if the script directory cannot be entered or
#            the driver installer fails
#######################################
cuda_cudnn_driver_install() {
  local -r wanted_version="470.63.01"
  local nvidia_version=""
  local module

  if command -v nvidia-smi >/dev/null 2>&1; then
    #cuda_version=`cat /usr/local/cuda/version.json |grep "version"|head -n 1|awk -F ":" '{print $2}'|sed 's/\"//g'|sed 's/ //g'`
    nvidia_version=$(nvidia-smi -q | grep 'Driver Version' | awk '{print $4}')
  fi

  if [[ "$nvidia_version" != "$wanted_version" ]]; then
    # Best effort: the modules may not be loaded and no old driver may exist.
    modprobe -r nvidia-uvm 2>/dev/null
    modprobe -r nvidia-drm 2>/dev/null
    yellow "卸载旧版本驱动,按照屏幕提示直接回车即可"
    /usr/bin/nvidia-uninstall 2>/dev/null
    yellow "nvidia驱动安装"
    # The patterns are quoted so that apt receives them unchanged; unquoted,
    # the shell would expand them against files in the current directory.
    apt-get -y -qqq purge 'nvidia*' 2>/dev/null
    apt-get -y -qqq purge '*nvidia*' '*cuda*' 2>/dev/null
    apt-get -y --purge remove 'nvidia-*' 2>/dev/null

    # The installer is referenced relative to the script directory.
    cd "$localpath" || {
      red "无法进入脚本目录: $localpath"
      exit 1
    }

    # Append each blacklist entry only once so re-runs do not pile up
    # duplicate lines in blacklist.conf.
    for module in rivafb vga16fb nouveau nvidiafb rivatv; do
      grep -qxF "blacklist $module" /etc/modprobe.d/blacklist.conf 2>/dev/null ||
        echo "blacklist $module" >>/etc/modprobe.d/blacklist.conf
    done
    printf '%s\n' 'blacklist nouveau' 'options nouveau modeset=0' \
      >/etc/modprobe.d/blacklist-nouveau.conf
    update-initramfs -u

    apt install -y dkms build-essential linux-headers-generic 2>/dev/null
    if ! "./NVIDIA-Linux-x86_64-${wanted_version}.run" -silent --no-x-check --no-nouveau-check --install-libglvnd; then
      red "nvidia驱动安装异常,请检查nvidia-smi命令是否可用!!!"
      exit 1
    fi
    green "nvidia驱动安装成功"
    modprobe nvidia-uvm 1>/dev/null
    modprobe nvidia-drm 1>/dev/null
  fi

  # Enable driver persistence mode for the running system.
  nvidia-smi -pm 1

  # Re-enable persistence mode at boot through /etc/rc.local.
  # -f keeps a re-run from failing when the link already exists.
  ln -sf /lib/systemd/system/rc.local.service /etc/systemd/system/
  if [[ ! -s /etc/rc.local ]]; then
    # A missing or empty rc.local has no "exit" line for sed to anchor on,
    # so start from a minimal script.
    printf '%s\n' '#!/bin/sh' 'exit 0' >/etc/rc.local
  fi
  chmod +x /etc/rc.local
  if ! grep -q nvidia-smi /etc/rc.local; then
    if grep -q exit /etc/rc.local; then
      sed -i '/exit/i\nvidia-smi -pm 1' /etc/rc.local
    else
      echo 'nvidia-smi -pm 1' >>/etc/rc.local
    fi
  fi
  blue "13.Nvidia Driver 470 安装成功"
}
# Reload systemd units, (re)start docker and containerd, enable docker at
# boot and print the docker service status.
_docker_restart_services() {
  systemctl daemon-reload
  systemctl start docker
  # systemctl start containerd.service
  systemctl restart docker
  systemctl restart containerd.service
  systemctl enable docker.service
  systemctl status docker.service --no-pager
}

#######################################
# Install the static Docker 20.10.8 binaries, the NVIDIA container packages
# from /var/debs, and a daemon.json that makes "nvidia" the default runtime.
# Globals:   none
# Arguments: none
# Outputs:   tar/dpkg/systemctl output; an error message via red on failure
# Returns:   1 if the Docker archive cannot be extracted; otherwise the status
#            of the final `systemctl status`
# Expects ./docker-20.10.8.tgz in the current working directory.
#######################################
docker_install() {
  if ! tar -xvf ./docker-20.10.8.tgz; then
    red "docker-20.10.8.tgz 解压失败,请确认文件存在且完整"
    return 1
  fi
  cp -f docker/* /usr/bin/

  # Quoted delimiter: $MAINPID is written literally for systemd to expand.
  cat >/etc/systemd/system/docker.service <<'EOF'
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
Type=notify
ExecStart=/usr/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TimeoutStartSec=0
Delegate=yes
KillMode=process
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target
EOF

  dpkg -i /var/debs/libnvidia-container1_1.9.0-1_amd64.deb
  dpkg -i /var/debs/libnvidia-container-tools_1.9.0-1_amd64.deb
  dpkg -i /var/debs/nvidia-container-runtime_3.9.0-1_all.deb
  dpkg -i /var/debs/nvidia-container-toolkit_1.9.0-1_amd64.deb
  # "Y" answers any interactive prompt dpkg raises for this package.
  echo Y | dpkg -i /var/debs/nvidia-docker2_2.10.0-1_all.deb
  apt install -y -qqq gcc cmake 2>/dev/null
  echo Y | apt --fix-broken install -y 1>/dev/null

  # The directory is not guaranteed to exist (e.g. if the dpkg steps failed).
  mkdir -p /etc/docker
  # NOTE(review): "graph" is the legacy name of the "data-root" option;
  # confirm the bundled Docker version still accepts it.
  cat >/etc/docker/daemon.json <<'EOF'
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"default-shm-size": "2G",
"insecure-registries": ["harbor.deepwise.com","10.10.3.5","172.28.3.5"] ,
"graph":"/data1/docker/lib/docker"
}
EOF

  _docker_restart_services
  # Copy the bundled binaries again and restart, so the bundled version is
  # the one left in /usr/bin after the package steps above.
  cp -f docker/* /usr/bin/
  _docker_restart_services
}
#######################################
# Copy the bundled ./debs directory to /var/debs, point apt exclusively at
# that local repository and install the base build tools from it.
# Globals:   none
# Arguments: none
# Outputs:   progress messages via yellow/green/red
# Returns:   1 if ./debs is missing or cannot be copied; otherwise the status
#            of the final apt command
# Expects the debs/ directory in the current working directory.
#######################################
debs_libs() {
  yellow "正在同步deb依赖包,大约需要一分钟....."
  # Verify the source first: /var/debs is wiped below and must not be left
  # empty when there is nothing to copy.
  if [[ ! -d debs ]]; then
    red "未找到 debs 目录,无法同步deb依赖包"
    return 1
  fi
  if ! { mkdir -p /var/debs && rm -rf /var/debs/* && cp -ar debs/* /var/debs/; }; then
    red "deb依赖包同步失败"
    return 1
  fi
  green "deb依赖包同步完成"

  # Keep the first backup: on a re-run sources.list is already the local-only
  # file, and copying it again would destroy the original.
  if [[ ! -e /etc/apt/sources.list_bak_origin ]]; then
    cp /etc/apt/sources.list /etc/apt/sources.list_bak_origin
  fi
  rm -rf /var/lib/apt/lists/*
  printf '%s\n' 'deb [trusted=yes] file:///var/ debs/' >/etc/apt/sources.list
  rm -rf /etc/apt/sources.list.d/google-chrome.list

  # Guard against a broken apt/dpkg state: run update and fix-install against
  # a temporarily empty dpkg info directory, then merge whatever was created
  # there into the saved directory and move it back into place.
  # NOTE(review): if /var/lib/dpkg/info_old is left over from an interrupted
  # run, the first mv nests info inside it - check before re-running.
  mv /var/lib/dpkg/info /var/lib/dpkg/info_old >/dev/null 2>&1
  mkdir /var/lib/dpkg/info
  apt-get -y -qqq update
  apt-get -f install -qqq
  mv /var/lib/dpkg/info/* /var/lib/dpkg/info_old >/dev/null 2>&1
  rm -rf /var/lib/dpkg/info
  mv /var/lib/dpkg/info_old /var/lib/dpkg/info >/dev/null 2>&1
  apt-get -y -qqq update
  apt -y -qqq update

  green "Debs-pkg 依赖安装"
  apt install -y -qqq gcc cmake 2>/dev/null
}
#######################################
# Full installation: local deb repository, NVIDIA driver, then Docker with
# the NVIDIA runtime. Each step is timed.
# Globals:   localpath (read), TIMEFORMAT (written)
# Arguments: none
# Returns:   1 if the script directory cannot be entered; otherwise the
#            status of the last step
#######################################
main() {
  # The steps use paths relative to the script directory (debs/,
  # ./docker-20.10.8.tgz), so enter it regardless of where the script was
  # invoked from.
  if ! cd "$localpath"; then
    red "无法进入脚本目录: $localpath"
    return 1
  fi
  TIMEFORMAT="%2lR"
  time debs_libs
  time cuda_cudnn_driver_install
  time docker_install
}
# Fallback entry point, run when the first script argument is not "ubuntu".
# It only sets the `time` output format; every install step is commented
# out, so nothing is installed.
mainx() {
  TIMEFORMAT='%2lR'
  # time debs_libs
  # time cuda_cudnn_driver_install
  # time docker_install
}
# Entry point: `./install.sh ubuntu` runs the full installation; any other
# invocation runs mainx. Output is shown and appended to install.log.
# The status of main/mainx is taken from PIPESTATUS, because the pipeline
# itself would otherwise report only the exit status of tee.
if [[ "${1:-}" == "ubuntu" ]]; then
  main | tee -a install.log
  install_status=${PIPESTATUS[0]}
else
  mainx | tee -a install.log
  install_status=${PIPESTATUS[0]}
fi
exit "$install_status"
使用方法:直接执行脚本安装即可(必须带上参数 ubuntu,否则脚本不会执行任何安装步骤):
./install.sh ubuntu
debs 依赖包下载地址:
链接: https://pan.baidu.com/s/1VA0DI8h0bHOAaV-WLdPS7w?pwd=2k5r 提取码: 2k5r
–来自百度网盘超级会员v5的分享
如果驱动安装仍然报错,请确认是否已完全禁用 nouveau。
执行 lsmod | grep nouveau,必须没有任何输出才算完全禁用。注意:执行 update-initramfs -u 之后需要重启系统才能生效。