DPDK lcore学习笔记

严亦

2023-12-01

1. DPDK核绑定的全局概览

DPDK核绑定的相关函数，都集中在rte_eal_init()函数中调用：
其中主要分为六部分：

检测所有的cpu。
解释核绑定相关参数。
主线程的核绑定。
中断处理线程的创建
副线程的创建。
线程启动和等待。

注意：

本文中，cpu所指的是机器上的逻辑核(也被称为logical processor，简称为processor)。

以下是rte_eal_init()函数的调用图：

    main
    +-> rte_eal_init
        +-> rte_eal_log_early_init
        +-> eal_log_level_parse
        +-> rte_set_log_level
        |
        |   /* <-- 检测所有的`cpu` -->*/
        +=> rte_eal_cpu_init
        |   +=> rte_eal_get_configuration                               /* 取得 全局变量`rte_config`的指针。 */
        |   +=> lcore_config[].detected = eal_cpu_detected(lcore_id);   /* 检测`cpu`是否存在 */
        |   +=> config->lcore_role[lcore_id] = ROLE_RTE;                /* `ROLE_RTE`表示`cpu`存在 */
        |   +=> lcore_config[].core_id = eal_cpu_core_id(lcore_id);     /* 取得`cpu`的`core_id`。 */
        |   +=> lcore_config[].socket_id = eal_cpu_socket_id(lcore_id); /* 取得`NUMA node id`。 */
        |
        |   /* <-- 解释核绑定相关参数 --> */
        +=> eal_parse_args
        |   +-> eal_parse_common_option
        |       |
        |       |   /* option: -c */
        |       +=> eal_parse_coremask
        |       |
        |       |   /* option: -l */
        |       +=> eal_parse_corelist
        |       |
        |       |   /* option: --proc-type */
        |       +=> eal_parse_proc_type
        |       |
        |       |   /* option: --master-lcore */
        |       +=> eal_parse_master_lcore
        |       |
        |       |   /* option: --lcores */
        |       +=> eal_parse_lcores
        |
        +-> rte_srand(rte_rdtsc());
        +-> rte_config_init
        +-> rte_eal_pci_init
        +-> rte_eal_vfio_setup
        +-> rte_eal_ivshmem_init
        +-> rte_eal_memory_init
        +-> eal_hugedirs_unlock
        +-> rte_eal_memzone_init
        +-> rte_eal_tailqs_init
        +-> rte_eal_ivshmem_obj_init
        +-> rte_eal_log_init
        +-> rte_eal_alarm_init
        +-> rte_eal_timer_init
        +-> eal_check_mem_on_local_socket
        +-> eal_plugins_init
        |
        |   /* <--- `主线程`的核绑定 ---> */
        +=> eal_thread_init_master(rte_config.master_lcore) /* 主线程绑核 */
        |   +=> RTE_PER_LCORE(_lcore_id) = lcore_id;        /* 使用`线程变量`记录`lcore`下标 */
        |   +=> eal_thread_set_affinity()                   /* 线程绑定`cpu` */
        |       +=> rte_gettid                              /* 使用`线程变量`记录`线程号` */
        |       |   +=> static RTE_DEFINE_PER_LCORE(int, _thread_id); /* 声明静态的`线程变量`记录`线程号` */
        |       |   +=> RTE_PER_LCORE(_thread_id) = rte_sys_gettid(); /* 取得`线程号` */
        |       |       +=> syscall(SYS_gettid);                        /* 系统函数取得`线程号` */
        |       +=> rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); /* 线程核绑定 */
        |           +=> pthread_setaffinity_np                          /* pthread库的线程核绑定 */
        |           |   /* 使用实际核绑定后的`cpusetp`，更新到相关线程变量`RTE_PER_LCORE`和全局变量`lcore_config` */
        |           +=> RTE_PER_LCORE(_socket_id) = eal_cpuset_socket_id(cpusetp);
        |           +=> memmove(&RTE_PER_LCORE(_cpuset), cpusetp,...);
        |           +=> lcore_config[lcore_id].socket_id = RTE_PER_LCORE(_socket_id);
        |           +=> memmove(&lcore_config[lcore_id].cpuset, cpusetp, sizeof(rte_cpuset_t));
        |
        +-> eal_thread_dump_affinity    /* 打印核绑定设置 */
        |
        +-> rte_eal_dev_init        /* init pmd devices */
        |
        |   /* <--- `中断处理线程`的创建 ---> */
        +-> rte_eal_intr_init /* init interrupt-handling */
        |   +-> pthread_create(&intr_thread, NULL, eal_intr_thread_main, NULL);
        |   |   +~> eal_intr_thread_main    /* 这个处理中断的线程是没有绑核的 */
        |   +-> rte_thread_setname ("eal-intr-thread")
        |
        |   /* <--- `副线程`的创建 ---> */
        |   /* 遍历所有的`副线程`*/
        +=> RTE_LCORE_FOREACH_SLAVE(i)
        |   /* 创建`主线程`和`副线程`间的沟通管道 */
        +=> pipe(lcore_config[i].pipe_master2slave);
        +=> pipe(lcore_config[i].pipe_slave2master);
        |   /* 创建`副线程` */
        +=> pthread_create(&lcore_config[i].thread_id, NULL, eal_thread_loop, NULL);
        |   +~> eal_thread_loop
        |       +=> eal_thread_set_affinity /*`副线程`核绑定*/
        |       +=> read(m2s, &c, 1);       /* 等待`主线程`发送到`副线程`的消息 */
        |       +=> write(s2m, &c, 1);      /* `副线程`确认收到`主线程`的消息 */
        |       +=> lcore_config[lcore_id].f(fct_arg); /* 执行`业务处理回调函数` */
        |
        +-> rte_thread_setname              /*`副线程`重命名*/
        |
        |   /* <--- `线程`启动和等待 ---> */
        +=> rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); /* 设置每一个`副线程`的回调函数为`sync_func()` */
        |   +=> RTE_LCORE_FOREACH_SLAVE(lcore_id)           /* 遍历所有的`副线程`来执行回调函数 */
        |   +=> rte_eal_remote_launch(f, arg, lcore_id);    /* `副线程`来执行回调函数 */
        |       +=> lcore_config[slave_id].f = f;           /* 设置`副线程`的`业务函数f` */
        |       +=> lcore_config[slave_id].arg = arg;       /* 设置`副线程`的`参数arg` */
        |       +=> write(m2s, &c, 1);                      /* 发送消息到`副线程`，通知`副线程`执行`业务函数`。 */
        |       +=> read(s2m, &c, 1);                       /* 等待`副线程`的确认。并判断`副线程`启动是否有异常。 */
        +=> rte_eal_mp_wait_lcore();                        /* 等待所有线程返回 */
        |   +=> RTE_LCORE_FOREACH_SLAVE(lcore_id);          /* 遍历所有的`副线程` */
        |   +=> rte_eal_wait_lcore(lcore_id);               /* 等待某一`副线程`返回 */
        |
        +-> rte_eal_pci_probe
        +-> rte_eal_mcfg_complete

说明：

以下的例子采用的是同一物理构造的机器。
机器有 2 路物理cpu插槽。单颗cpu插槽有 12 个核。
没有开启超线程。所以单颗cpu插槽的cpu数也是 12 个。
2 路合计cpu数为 24 个。

2. 数据结构和全局变量

2.1. rte_cpuset_t类型

rte_cpuset_t类型在 linux 中其实就是系统的cpu_set_t类型。
其中 __bits[] 上的每一位代表了一个cpu。

    typedef unsigned long __cpu_mask;

    typedef struct {
        __cpu_mask __bits[16];
    } cpu_set_t;

    typedef cpu_set_t rte_cpuset_t;

注意：

使用CPU_ZERO()，CPU_SET()等宏函数来操作rte_cpuset_t类型。

2.2. struct lcore_config结构体

struct lcore_config结构体，用于表示一个lcore的用户设置。其中可以将成员变量分为三类：

核绑定相关成员变量。
主副线程通信相关成员变量。
业务处理相关成员变量。

注意：

lcore实际上是使用线程来实现。两者在后续的描述中可能会混用。请根据上下文理解。

详细的描述如下：

核绑定相关成员变量：

detected：cpu是否存在。

0：不存在。
非0：存在。

socket_id：cpu所在的NUMA node id。(这里没有任何错误，请看下面详细的说明！)
这里的socket_id字段的名字容易误导。通过分析代码发现：
由于在 OS 层中可以自由开启或关闭NUMA功能。从而在不同的情况下，NUMA node_id会有所不同。
最简单的例子就是，关闭NUMA功能后，所有的cpu都会属于node0。但是cpu的总数是没有改变的。

而物理cpu插槽的标识号，是由主板电路决定的。

物理cpu插槽的标识号，是不会随NUMA功能的开关而影响的。
物理cpu插槽的标识号，可以通过cat /sys/devices/system/cpu/cpu${cpu_index}/topology/physical_package_id指令得出。

其中${cpu_index}是cpu序号。

DPDK中的socket_id字段在分析代码后，其实所指的是cpu所在的NUMA node id。
NUMA node id的数值为/sys/devices/system/cpu/cpu${cpu_index}/node${node_id}的${node_id}。
其中 ${cpu_index} 是cpu序号。
在后续的描述中。代码上会保留使用socket_id；但是在解释中会使用NUMA node_id来表达。
core_id：cpu的标识号。数值与硬件相关。
cpu的标识号不一定连续。
cpu的标识号数值为 /sys/devices/system/cpu/cpu${cpu_index}/topology/core_id。其中${cpu_index}是cpu序号。
core_index：lcore的序号。
有效的序号从零开始，且是连续的。
-1：无效数据。

只有core_index可以唯一的区分lcore。

cpuset

lcore所绑定的cpuset。
当使用-l或者-c命令行参数时，lcore只可以绑定到一个cpu上。
当使用--lcores命令行参数时，lcore可以绑定到多个cpu上。
主副线程通信相关成员变量：

pipe_master2slave[2]：主线程到副线程的通信管道。
pipe_master2slave[0]，表示的是主线程到副线程管道的读端。
pipe_master2slave[1]，表示的是主线程到副线程管道的写端。
pipe_slave2master[2]：副线程到主线程的通信管道。
pipe_slave2master[0]，表示的是副线程到主线程管道的读端。
pipe_slave2master[1]，表示的是副线程到主线程管道的写端。

业务处理相关成员变量：

thread_id：线程号。
f：用户业务函数。
arg：输入到用户业务函数的参数。
ret：用户业务函数的返回值。
state：线程的状态。
WAIT：等待命令。
RUNNING：线程正在运行业务函数。
FINISHED：线程运行完业务函数。

注意：

struct lcore_config结构体中，成员变量core_id，socket_id，无法可靠区分lcore。
因为当NUMA关闭的时候，socket_id的数值会全部变为0，而core_id却有可能会重复。
详细的例子请看后续的rte_eal_cpu_init()函数。
socket_id的主要作用是识别NUMA node，用于内存的分配。
struct lcore_config结构体中，成员变量core_index和thread_id，无论NUMA是否开启，都可以正确区分lcore。

结构体源码如下：

    /**
     * Structure storing internal configuration (per-lcore)
     */
    struct lcore_config {
        unsigned detected;         /**< true if lcore was detected */
        pthread_t thread_id;       /**< pthread identifier */
        int pipe_master2slave[2];  /**< communication pipe with master */
        int pipe_slave2master[2];  /**< communication pipe with master */
        lcore_function_t * volatile f;         /**< function to call */
        void * volatile arg;       /**< argument of function */
        volatile int ret;          /**< return value of function */
        volatile enum rte_lcore_state_t state; /**< lcore state */
        unsigned socket_id;        /**< physical socket id for this lcore */
        unsigned core_id;          /**< core number on socket for this lcore */
        int core_index;            /**< relative index, starting from 0 */
        rte_cpuset_t cpuset;       /**< cpu set which the lcore affinity to */
    };

2.2.1. 全局变量 `lcore_config`

全局变量lcore_config[]数组，表示lcore的用户设置。

全局变量lcore_config[]的定义如下：

    /* internal configuration (per-core) */
    struct lcore_config lcore_config[RTE_MAX_LCORE];

注意：
全局变量lcore_config[n]的下标比struct lcore_config结构体中的core_id，core_index的作用还要大。具体请看RTE_LCORE_FOREACH_SLAVE()宏函数的实现。

2.3. struct rte_config结构体

struct rte_config结构体，用于记录lcore和内存在DPDK应用程序中的设置。

成员变量描述如下：

master_lcore：主线程所在的lcore的序号（序号从零开始，并且是连续的）。
lcore_count：可用（已启用）的lcore的个数。初始值为rte_eal_cpu_init()检测到的cpu个数，在解释核绑定相关参数后可能会被修改。
lcore_role：每一个lcore的状态。
ROLE_OFF：lcore没有在DPDK中使用。
ROLE_RTE：lcore在DPDK中使用。
process_type：进程是主进程还是副进程。
RTE_PROC_AUTO：自动检测。
RTE_PROC_PRIMARY：默认值。主进程。
RTE_PROC_SECONDARY：副进程。
RTE_PROC_INVALID：无效进程。
mem_config：内存设置。
源代码如下：

    /**
     * The global RTE configuration structure.
     */
    struct rte_config {
        /* master lcore 的 id */
        uint32_t master_lcore;       /**< Id of the master lcore */
        uint32_t lcore_count;        /**< Number of available logical cores. */
        enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */
        /** Primary or secondary configuration */
        enum rte_proc_type_t process_type;
        /**
         * Pointer to memory configuration, which may be shared across multiple
         * DPDK instances
         */
        struct rte_mem_config *mem_config;
    } __attribute__((__packed__));

2.3.1. 全局变量`rte_config`

全局变量rte_config，表示DPDK的用户配置。

全局变量rte_config的定义如下：

    /* Address of global and public configuration */
    static struct rte_config rte_config = {
            .mem_config = &early_mem_config,
    };

3. 检测所有的cpu

rte_eal_cpu_init()函数用于检测所有的cpu。并用来初始化全局变量rte_config和lcore_config[]。

函数流程如下：

使用eal_cpu_detected函数，遍历所有的路径 /sys/devices/system/cpu/cpu${cpu_index}。其中 ${cpu_index} 从0到RTE_MAX_LCORE - 1。

1.1. 如果路径 /sys/devices/system/cpu/cpu${cpu_index}不存在：
1.1.1. lcore_config[${cpu_index}].cpuset设置为0。
1.1.2. rte_config.lcore_role[${cpu_index}]设置为ROLE_OFF。
1.1.3. lcore_config[${cpu_index}].core_index设置为-1。
1.1.4. lcore_config[${cpu_index}].core_id设置为0。
1.1.5. lcore_config[${cpu_index}].socket_id设置为0。
1.2. 如果路径 /sys/devices/system/cpu/cpu${cpu_index}存在：
1.2.1. lcore_config[${cpu_index}].cpuset中仅置位第${cpu_index}位，即CPU_SET(${cpu_index}, &cpuset)（相当于掩码 1 << ${cpu_index}）。
1.2.2. rte_config.lcore_role[${cpu_index}]设置为ROLE_RTE。
1.2.3. lcore_config[${cpu_index}].core_index设置为core index。
1.2.4. lcore_config[${cpu_index}].core_id设置为eal_cpu_core_id(${cpu_index})。
1.2.5. lcore_config[${cpu_index}].socket_id设置为eal_cpu_socket_id(${cpu_index})。

rte_config.lcore_count设置为机器上所有cpu的个数。

函数调用图如下：

    rte_eal_init
    +-> rte_eal_cpu_init
        +-> rte_eal_get_configuration                               /* 取得 全局变量`rte_config`的指针。 */
        +-> lcore_config[].detected = eal_cpu_detected(lcore_id);   /* 检测`cpu`是否存在 */
        +-> config->lcore_role[lcore_id] = ROLE_RTE;                /* `ROLE_RTE`表示`cpu`存在 */
        +-> lcore_config[].core_id = eal_cpu_core_id(lcore_id);     /* 取得`cpu`的`core_id`。 */
        +-> lcore_config[].socket_id = eal_cpu_socket_id(lcore_id); /* 取得`NUMA node id`。 */

rte_eal_cpu_init简化后的代码：

    int
    rte_eal_cpu_init(void)
    {
        /* pointer to global configuration */
        struct rte_config *config = rte_eal_get_configuration();
        unsigned lcore_id;
        unsigned count = 0;

        /*
         * Parse the maximum set of logical cores, detect the subset of running
         * ones and enable them by default.
         */
        for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
            lcore_config[lcore_id].core_index = count;

            /* init cpuset for per lcore config */
            CPU_ZERO(&lcore_config[lcore_id].cpuset);

            /* in 1:1 mapping, record related cpu detected state */
            lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
            if (lcore_config[lcore_id].detected == 0) {
                config->lcore_role[lcore_id] = ROLE_OFF;
                lcore_config[lcore_id].core_index = -1;
                continue;
            }

            /* By default, lcore 1:1 map to cpu id */
            CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset);

            /* By default, each detected core is enabled */
            config->lcore_role[lcore_id] = ROLE_RTE;
            lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id);
            lcore_config[lcore_id].socket_id = eal_cpu_socket_id(lcore_id);

            count++;
        }
        /* Set the count of enabled logical cores of the EAL configuration */
        config->lcore_count = count;
        return 0;
    }

例子：

不论是否开启NUMA功能。rte_eal_cpu_init函数运行完后rte_config.lcore_count都为 24。
但是rte_config.lcore_count在后面，经过解释核绑定相关参数后，会有可能修改。

NUMA关闭时：
使用lscpu查看到的系统配置为：

    lscpu
    >    Architecture:          x86_64
    >    CPU(s):                24          # 总`cpu`数
    >    On-line CPU(s) list:   0-23        # `cpu序号`
    >    Thread(s) per core:    1           # 每个`核`的`线程`个数（没有开启`超线程`）
    >    Core(s) per socket:    12          # 每个`cpu插槽`的`核`数
    >    Socket(s):             2           # `cpu插槽`个数
    >    NUMA node(s):          1           # `NUMA node`个数
    >    NUMA node0 CPU(s):     0-23        # `node 0`上`cpu`序号

当rte_eal_cpu_init运行完后，lcore_config[n]和rte_config.lcore_role[n] 的数值如下表所示：

DPDK lcore学习笔记

1. DPDK核绑定的全局概览

2. 数据结构和全局变量

2.1. rte_cpuset_t类型

2.2. struct lcore_config结构体

2.2.1. 全局变量 `lcore_config`

2.3. struct rte_config结构体

2.3.1. 全局变量`rte_config`

3. 检测所有的cpu

相关阅读

相关文章

相关问答

相关文档

DPDK lcore学习笔记

1. DPDK核绑定的全局概览

2. 数据结构和全局变量

2.1. rte_cpuset_t类型

2.2. struct lcore_config结构体

2.2.1. 全局变量 lcore_config

2.3. struct rte_config结构体

2.3.1. 全局变量rte_config

3. 检测所有的cpu

相关阅读

相关文章

相关问答

相关文档

2.2.1. 全局变量 `lcore_config`

2.3.1. 全局变量`rte_config`