kill -9 `pidof gdb` && kill -9 `pidof slurmd`
systemctl restart slurmd && sinfo && gdb slurmd_
set follow-fork-mode child
b slurmd_req
r
c
c
>sbatch
c
c
c(debug2("Processing RPC: REQUEST_BATCH_JOB_LAUNCH");)
b _rpc_batch_job
c
b _forkexec_slurmstepd
c
set follow-fork-mode child
b job_manager
c
set follow-fork-mode child
b _spawn_job_container(src\slurmd\slurmstepd\mgr.c)
c
b spank_init
set follow-fork-mode child
c
b checkpoint_stepd_prefork(此后会多次fork,未能继续跟踪下去,后续改为通过添加打印信息继续追踪)
job_manager(src\slurmd\slurmstepd\mgr.c)以root身份运行,执行共享内存和互联初始化
初始化各种插件
mpi_hook_slurmstepd_init//根据SLURM_MPI_TYPE或者配置文件获取并注册MPI插件
_mpi_init
继续初始化插件
mpi_hook_slurmstepd_prefork
mpi_hook_slurmstepd_init
(*(ops.slurmstepd_prefork))(job, env)
p_mpi_hook_slurmstepd_prefork(src\plugins\mpi\pmix\mpi_pmix.c)
pmixp_stepd_init
pmixp_dconn_init(src\plugins\mpi\pmix\pmixp_dconn.c)
(#ifdef HAVE_UCX)
pmixp_info_srv_direct_conn_ucx
pmixp_dconn_ucx_prepare(mpi\pmix\pmixp_dconn_ucx.c)
_load_ucx_lib(PMIXP_UCX_LIBPATH)
ucp_config_read
ucp_init
ucp_worker_create
ucp_worker_get_address
ucp_worker_get_efd
pmixp_agent_start
_fork_all_tasks
exec_task
_setup_mpi
mpi_hook_slurmstepd_task
ucp_worker_create(ucx-1.6.x\src\ucp\core\ucp_worker.c)
uct_worker_create
ucp_worker_add_resource_ifaces
uct_iface_open(src\uct\base\uct_md.c)
tlc->iface_open(md, worker, params, config, iface_p)
UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params,const uct_iface_config_t *tl_config)
uct_dc_mlx5_iface_create_dcis
uct_dc_mlx5_iface_dci_connect
ibv_exp_modify_qp(依次将QP切换到下面三个状态)
INIT
RTR(Ready To Receive)
RTS(Ready To Send)