当前位置: 首页 > 工具软件 > ko-build > 使用案例 >

[问题已处理]-安装完nvidia-docker之后报错Unable to load the kernel module 'nvidia.ko'

白坚壁
2023-12-01

导语:经过确认,我用ubuntu20.04安装完nvidia-docker之后再安装nvidia驱动就会出现问题,重启也没用。

整理了一下,得先装cuda,然后再装nvidia-docker2才没问题。

#!/bin/bash

# Directory that contains this script; resolved inside a command substitution
# so the caller's working directory is not changed.
localpath=$(cd "$(dirname "$0")"; pwd)
# Colored status messages.
# blue: print $1 in blue (ANSI 34), prefixed with a YYYYmmdd_HHMMSS timestamp.
# Backslash escapes inside $1 are interpreted.
blue() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '%b\n' "\033[34m ${stamp}>>>>>> $1 \033[0m"
}
# green: print $1 in green (ANSI 32), prefixed with a YYYYmmdd_HHMMSS
# timestamp. Backslash escapes inside $1 are interpreted.
green() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '%b\n' "\033[32m ${stamp}>>>>>> $1 \033[0m"
}
# red: print $1 in red (ANSI 31), prefixed with a YYYYmmdd_HHMMSS timestamp.
# Backslash escapes inside $1 are interpreted.
red() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '%b\n' "\033[31m ${stamp}>>>>>> $1 \033[0m"
}
# yellow: print $1 in yellow (ANSI 33), prefixed with a YYYYmmdd_HHMMSS
# timestamp. Backslash escapes inside $1 are interpreted.
yellow() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  printf '%b\n' "\033[33m ${stamp}>>>>>> $1 \033[0m"
}

#######################################
# Install the NVIDIA 470.63.01 driver when it is not the version reported by
# nvidia-smi, then enable persistence mode now and on every boot through
# /etc/rc.local.
# Globals:   localpath (read) - directory that holds the .run installer
# Arguments: none
# Outputs:   progress via yellow/red/green/blue; tool output on stdout/stderr
# Returns:   exits with status 1 when the driver installer fails
#######################################
cuda_cudnn_driver_install() {
  local nvidia_version=""
  local module
  local blacklist_conf=/etc/modprobe.d/blacklist.conf
  local rc_local=/etc/rc.local
  local rc_unit_link=/etc/systemd/system/rc.local.service

  if type nvidia-smi 2>/dev/null; then
    # Only the first match is used so the comparison below sees one word.
    nvidia_version=$(nvidia-smi -q | awk '/Driver Version/ {print $4; exit}')
  fi

  if [ "x$nvidia_version" != "x470.63.01" ]; then
    modprobe -r nvidia-uvm 2>/dev/null
    modprobe -r nvidia-drm 2>/dev/null
    yellow "卸载旧版本驱动,按照屏幕提示直接回车即可"
    /usr/bin/nvidia-uninstall 2>/dev/null
    yellow "nvidia驱动安装"
    # Patterns are quoted so apt-get receives them verbatim; unquoted, the
    # shell would expand them against files in the current directory first.
    # Purging is best-effort, hence the discarded stderr.
    apt-get -y -qqq purge 'nvidia*' 2>/dev/null
    apt-get -y -qqq purge '*nvidia*' '*cuda*' 2>/dev/null
    apt-get -y --purge remove 'nvidia-*' 2>/dev/null
    # The installer is invoked by relative path, so run it from the script dir.
    cd "$localpath" || exit 1

    # Blacklist conflicting framebuffer/nouveau modules; skip entries that are
    # already present so repeated runs do not keep growing the file.
    for module in rivafb vga16fb nouveau nvidiafb rivatv; do
      grep -qxF "blacklist $module" "$blacklist_conf" 2>/dev/null ||
        echo "blacklist $module" >>"$blacklist_conf"
    done
    printf '%s\n' 'blacklist nouveau' 'options nouveau modeset=0' \
      >/etc/modprobe.d/blacklist-nouveau.conf
    # Rebuild the initramfs so the blacklist is applied at boot.
    update-initramfs -u
    apt install -y dkms build-essential linux-headers-generic 2>/dev/null

    if ! ./NVIDIA-Linux-x86_64-470.63.01.run -silent --no-x-check --no-nouveau-check --install-libglvnd; then
      red "nvidia驱动安装异常,请检查nvidia-smi命令是否可用!!!"
      exit 1
    fi
    green "nvidia驱动安装成功"
    modprobe nvidia-uvm 1>/dev/null
    modprobe nvidia-drm 1>/dev/null
  fi

  # Enable driver persistence mode for the running system.
  nvidia-smi -pm 1

  # Make persistence mode survive reboots via rc.local.
  if [ ! -e "$rc_unit_link" ] && [ ! -L "$rc_unit_link" ]; then
    ln -s /lib/systemd/system/rc.local.service "$rc_unit_link"
  fi
  # A missing or empty rc.local has no "exit" line for sed to anchor on, so
  # seed it with a minimal script first.
  if [ ! -s "$rc_local" ]; then
    printf '%s\n' '#!/bin/sh' 'exit 0' >"$rc_local"
  fi
  chmod +x "$rc_local"
  if ! grep -q nvidia-smi "$rc_local"; then
    if grep -q exit "$rc_local"; then
      sed -i '/exit/i\nvidia-smi -pm 1' "$rc_local"
    else
      echo 'nvidia-smi -pm 1' >>"$rc_local"
    fi
  fi
  blue "13.Nvidia Driver 470  安装成功"

}
# Reload systemd unit files, (re)start docker and containerd, enable docker at
# boot and print the docker service status.
_docker_service_refresh() {
  systemctl daemon-reload
  # restart also starts a stopped unit, so no separate "start" is needed.
  systemctl restart docker
  systemctl restart containerd.service
  systemctl enable docker.service
  systemctl status docker.service --no-pager
}

#######################################
# Install the static docker 20.10.8 binaries, the nvidia container packages
# from /var/debs, and configure dockerd to use the nvidia runtime by default.
# Globals:   none
# Arguments: none
# Outputs:   tool output on stdout/stderr; red message if extraction fails
# Returns:   1 when docker-20.10.8.tgz cannot be extracted
#######################################
docker_install() {
  # The archive and the extracted docker/ directory are relative to the
  # current working directory.
  if ! tar -xvf ./docker-20.10.8.tgz; then
    red "docker-20.10.8.tgz 解压失败"
    return 1
  fi
  cp -f docker/* /usr/bin/

  # Quoted delimiter: the unit file is written verbatim, without leading
  # whitespace and with a literal $MAINPID.
  cat >/etc/systemd/system/docker.service <<'EOF'
[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
[Service]
Type=notify
ExecStart=/usr/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
TimeoutStartSec=0
Delegate=yes
KillMode=process
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target
EOF

  # nvidia container stack, installed from the local deb directory.
  dpkg -i /var/debs/libnvidia-container1_1.9.0-1_amd64.deb
  dpkg -i /var/debs/libnvidia-container-tools_1.9.0-1_amd64.deb
  dpkg -i /var/debs/nvidia-container-runtime_3.9.0-1_all.deb
  dpkg -i /var/debs/nvidia-container-toolkit_1.9.0-1_amd64.deb
  echo Y | dpkg -i /var/debs/nvidia-docker2_2.10.0-1_all.deb

  apt install -y -qqq gcc cmake 2>/dev/null
  echo Y | apt --fix-broken install -y 1>/dev/null

  # The static docker tarball does not create /etc/docker by itself.
  mkdir -p /etc/docker
  # "data-root" is the current name of the deprecated "graph" option.
  cat >/etc/docker/daemon.json <<'EOF'
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  },
  "default-shm-size": "2G",
  "insecure-registries": ["harbor.deepwise.com", "10.10.3.5", "172.28.3.5"],
  "data-root": "/data1/docker/lib/docker"
}
EOF
  _docker_service_refresh

  # Copy the bundled binaries again in case the package steps above replaced
  # them with a different docker version, then restart once more.
  cp -f docker/* /usr/bin/
  _docker_service_refresh
}





#######################################
# Copy the bundled deb packages to /var/debs, point apt at that directory as
# its only package source, and install the basic build tools from it.
# Globals:   none
# Arguments: none
# Outputs:   progress via yellow/green/red; apt output on stdout/stderr
# Returns:   1 when the deb packages cannot be copied to /var/debs
#######################################
debs_libs() {
  local sources=/etc/apt/sources.list
  local sources_backup=/etc/apt/sources.list_bak_origin

  yellow "正在同步deb依赖包,大约需要一分钟....."
  # debs/ is relative to the current working directory.
  if mkdir -p /var/debs && rm -rf /var/debs/* && cp -ar debs/* /var/debs/; then
    green "deb依赖包同步完成"
  else
    # Stop before apt is switched to a repository that was not populated.
    red "deb依赖包同步失败"
    return 1
  fi

  # Keep the first backup only: on a re-run sources.list already holds the
  # local-only entry written below, and copying it again would overwrite the
  # original list.
  [ -e "$sources_backup" ] || cp "$sources" "$sources_backup"
  rm -rf /var/lib/apt/lists/*
  printf '%s\n' 'deb [trusted=yes] file:///var/ debs/' >"$sources"
  rm -rf /etc/apt/sources.list.d/google-chrome.list

  # Guard against a broken apt/dpkg state: run "apt-get -f install" against an
  # empty dpkg info directory, then merge the files it created back into the
  # original directory and restore it.
  # NOTE(review): if /var/lib/dpkg/info_old is left over from an interrupted
  # run, the first mv nests the directories - confirm how that should recover.
  mv /var/lib/dpkg/info /var/lib/dpkg/info_old >/dev/null 2>&1
  mkdir /var/lib/dpkg/info
  apt-get -y -qqq update
  apt-get -f install -qqq
  mv /var/lib/dpkg/info/* /var/lib/dpkg/info_old >/dev/null 2>&1
  rm -rf /var/lib/dpkg/info
  mv /var/lib/dpkg/info_old /var/lib/dpkg/info >/dev/null 2>&1
  apt-get -y -qqq update
  green "Debs-pkg 依赖安装"

  apt install -y -qqq gcc cmake 2>/dev/null

}




# Full installation: run each stage in order and print the elapsed real time
# of every stage using the TIMEFORMAT set here.
main() {
  TIMEFORMAT="%2lR"

  local stage
  for stage in debs_libs cuda_cudnn_driver_install docker_install; do
    time "$stage"
  done

}
# Fallback entry point: only sets the output format for `time`.
# The installation stages are intentionally left disabled here:
#   debs_libs, cuda_cudnn_driver_install, docker_install
mainx() {
  TIMEFORMAT='%2lR'
}

# Run the full installation only for `./install.sh ubuntu`; any other
# invocation runs the no-op mainx. Output is appended to install.log, and the
# entry point's exit status (not tee's) becomes the script's exit status so a
# failed installation is visible to the caller.
entry_point=mainx
if [ "${1:-}" = "ubuntu" ]; then
  entry_point=main
fi
"$entry_point" | tee -a install.log
exit "${PIPESTATUS[0]}"

直接执行脚本安装即可

./install.sh ubuntu

debs依赖包下载地址

链接: https://pan.baidu.com/s/1VA0DI8h0bHOAaV-WLdPS7w?pwd=2k5r 提取码: 2k5r
–来自百度网盘超级会员v5的分享

如果还是报这个错,请确认是否已完全禁用nouveau。
执行 lsmod | grep nouveau,没有任何输出才算完全禁用。注意:执行 update-initramfs -u 之后需要重启才能生效。

 类似资料: