如下VPP命令,默认创建名称为vpp pool 0的地址池,其中包括256032个vlib_buffer_t内存结构。
vpp# show dpdk buffer
name="vpp pool 0" available = 219146 allocated = 36886 total = 256032
以上内存池由函数dpdk_buffer_pools_create创建,其遍历VM的地址池buffer_pools向量,创建地址池buffer_pool。在创建DPDK内存池之前,提前向DPDK创建两个内存池操作集,名称分别为vpp和vpp-no-cache,在创建DPDK内存池时会使用到。
clib_error_t *
dpdk_buffer_pools_create (vlib_main_t * vm)
{
clib_error_t *err;
vlib_buffer_pool_t *bp;
struct rte_mempool_ops ops = { };
strncpy (ops.name, "vpp", 4);
ops.alloc = dpdk_ops_vpp_alloc;
ops.free = dpdk_ops_vpp_free;
ops.get_count = dpdk_ops_vpp_get_count;
ops.enqueue = CLIB_MARCH_FN_POINTER (dpdk_ops_vpp_enqueue);
ops.dequeue = CLIB_MARCH_FN_POINTER (dpdk_ops_vpp_dequeue);
rte_mempool_register_ops (&ops);
strncpy (ops.name, "vpp-no-cache", 13);
ops.get_count = dpdk_ops_vpp_get_count_no_cache;
ops.enqueue = CLIB_MARCH_FN_POINTER (dpdk_ops_vpp_enqueue_no_cache);
ops.dequeue = dpdk_ops_vpp_dequeue_no_cache;
rte_mempool_register_ops (&ops);
/* *INDENT-OFF* */
vec_foreach (bp, vm->buffer_main->buffer_pools)
if (bp->start && (err = dpdk_buffer_pool_init (vm, bp)))
return err;
首先计算pool中单个元素的大小elt_size,其等于DPDK的rte_mbuf结构的大小,加上VPP的vlib_buffer_t结构的大小,最后再加上实际存放报文数据的内存大小data_size。
数组dpdk_mempool_by_buffer_pool_index和dpdk_no_cache_mempool_by_buffer_pool_index用于存放之后创建的DPDK mempool地址。
clib_error_t *
dpdk_buffer_pool_init (vlib_main_t * vm, vlib_buffer_pool_t * bp)
{
uword buffer_mem_start = vm->buffer_main->buffer_mem_start;
struct rte_mempool *mp, *nmp;
struct rte_pktmbuf_pool_private priv;
enum rte_iova_mode iova_mode;
u8 *name = 0;
u32 elt_size =
sizeof (struct rte_mbuf) + sizeof (vlib_buffer_t) + bp->data_size;
/* create empty mempools */
vec_validate_aligned (dpdk_mempool_by_buffer_pool_index, bp->index,
CLIB_CACHE_LINE_BYTES);
vec_validate_aligned (dpdk_no_cache_mempool_by_buffer_pool_index, bp->index,
CLIB_CACHE_LINE_BYTES);
创建DPDK内存池结构,elt_size为元素大小,bp->n_buffers为元素数量。对于首个内存池,命名为:vpp pool 0,最后的数字为VPP内存池的索引值。
/* normal mempool */
name = format (name, "vpp pool %u%c", bp->index, 0);
mp = rte_mempool_create_empty ((char *) name, bp->n_buffers,
elt_size, 512, sizeof (priv),
bp->numa_node, 0);
if (!mp)
{
vec_free (name);
return clib_error_return (0,
"failed to create normal mempool for numa node %u",
bp->index);
}
vec_reset_length (name);
接下来创建一个相同规格的非缓存DPDK内存池,命名为 vpp pool 0 (no cache),其中的数字对应于VPP内存池索引。随后将以上创建的两个DPDK内存池,分别赋值给以VPP内存池索引为下标的数组dpdk_mempool_by_buffer_pool_index和dpdk_no_cache_mempool_by_buffer_pool_index。
/* non-cached mempool */
name = format (name, "vpp pool %u (no cache)%c", bp->index, 0);
nmp = rte_mempool_create_empty ((char *) name, bp->n_buffers,
elt_size, 0, sizeof (priv),
bp->numa_node, 0);
if (!nmp)
{
rte_mempool_free (mp);
vec_free (name);
return clib_error_return (0,
"failed to create non-cache mempool for numa nude %u",
bp->index);
}
vec_free (name);
dpdk_mempool_by_buffer_pool_index[bp->index] = mp;
dpdk_no_cache_mempool_by_buffer_pool_index[bp->index] = nmp;
为新创建的两个DPDK内存池关联提前注册的操作集。初始化DPDK内存池的私有结构,其中mbuf_data_room_size表示VPP结构vlib_buffer_t中预留的报文数据长度(VLIB_BUFFER_PRE_DATA_SIZE),与真正存放报文数据的内存大小之和;而mbuf_priv_size表示VPP结构vlib_buffer_t大小除去预留报文数据长度之后的长度。
mp->pool_id = nmp->pool_id = bp->index;
rte_mempool_set_ops_byname (mp, "vpp", NULL);
rte_mempool_set_ops_byname (nmp, "vpp-no-cache", NULL);
/* Call the mempool priv initializer */
memset (&priv, 0, sizeof (priv));
priv.mbuf_data_room_size = VLIB_BUFFER_PRE_DATA_SIZE +
vlib_buffer_get_default_data_size (vm);
priv.mbuf_priv_size = VLIB_BUFFER_HDR_SIZE;
rte_pktmbuf_pool_init (mp, &priv);
rte_pktmbuf_pool_init (nmp, &priv);
接下来初始化DPDK内存池的元素链表,链表元素为rte_mempool_objhdr结构,其中子成员iova保存DPDK结构rte_mbuf的地址(根据IOVA模式,决定保存物理还是虚拟地址)。
|---rte_mempool_objhdr---|----------rte_mbuf----------|------vlib_buffer_t------|------packet_data------|
| | |
| |---priv.mbuf_priv_size---|-------data_size-------|
|
|------------------------------------elt_size----------------------------------|
涉及到的三个数据结构内存分布如上所示。
iova_mode = rte_eal_iova_mode ();
/* populate mempool object buffer header */
for (i = 0; i < bp->n_buffers; i++)
{
struct rte_mempool_objhdr *hdr;
vlib_buffer_t *b = vlib_get_buffer (vm, bp->buffers[i]);
struct rte_mbuf *mb = rte_mbuf_from_vlib_buffer (b);
hdr = (struct rte_mempool_objhdr *) RTE_PTR_SUB (mb, sizeof (*hdr));
hdr->mp = mp;
hdr->iova = (iova_mode == RTE_IOVA_VA) ?
pointer_to_uword (mb) : vlib_physmem_get_pa (vm, mb);
STAILQ_INSERT_TAIL (&mp->elt_list, hdr, next);
STAILQ_INSERT_TAIL (&nmp->elt_list, hdr, next);
mp->populated_size++;
nmp->populated_size++;
}
#if RTE_VERSION >= RTE_VERSION_NUM(22, 3, 0, 0)
mp->flags &= ~RTE_MEMPOOL_F_NON_IO;
#endif
遍历DPDK内存池元素,调用rte_pktmbuf_init初始化每个元素(rte_mbuf结构)。接下来,将池中第一个buffer对应的rte_mbuf内容,拷贝到以VPP内存池索引为下标的mbuf模板数组dpdk_mbuf_template_by_pool_index中,作为该池的mbuf头部模板。最后遍历VPP结构vlib_buffer_t,将其初始化为vlib_buffer模板中的内容(参见vlib_buffer_t结构,模板长度为64字节,即cache line大小)。
/* call the object initializers */
rte_mempool_obj_iter (mp, rte_pktmbuf_init, 0);
/* create mbuf header tempate from the first buffer in the pool */
vec_validate_aligned (dpdk_mbuf_template_by_pool_index, bp->index,
CLIB_CACHE_LINE_BYTES);
clib_memcpy (vec_elt_at_index (dpdk_mbuf_template_by_pool_index, bp->index),
rte_mbuf_from_vlib_buffer (vlib_buffer_ptr_from_index
(buffer_mem_start, *bp->buffers,
0)), sizeof (struct rte_mbuf));
for (i = 0; i < bp->n_buffers; i++)
{
vlib_buffer_t *b;
b = vlib_buffer_ptr_from_index (buffer_mem_start, bp->buffers[i], 0);
vlib_buffer_copy_template (b, &bp->buffer_template);
}
最后,初始化DPDK内存池的mem_list链表,其元素结构为rte_mempool_memhdr,根据VPP内存池的物理内存结构,为每个页面分配rte_mempool_memhdr结构,记录页面的地址等信息。
/* map DMA pages if at least one physical device exists */
if (rte_eth_dev_count_avail () || rte_cryptodev_count ())
{
uword i;
size_t page_sz;
vlib_physmem_map_t *pm;
int do_vfio_map = 1;
pm = vlib_physmem_get_map (vm, bp->physmem_map_index);
page_sz = 1ULL << pm->log2_page_size;
for (i = 0; i < pm->n_pages; i++)
{
char *va = ((char *) pm->base) + i * page_sz;
uword pa = (iova_mode == RTE_IOVA_VA) ?
pointer_to_uword (va) : pm->page_table[i];
if (do_vfio_map &&
#if RTE_VERSION < RTE_VERSION_NUM(19, 11, 0, 0)
rte_vfio_dma_map (pointer_to_uword (va), pa, page_sz))
#else
rte_vfio_container_dma_map (RTE_VFIO_DEFAULT_CONTAINER_FD,
pointer_to_uword (va), pa, page_sz))
#endif
do_vfio_map = 0;
struct rte_mempool_memhdr *memhdr;
memhdr = clib_mem_alloc (sizeof (*memhdr));
memhdr->mp = mp;
memhdr->addr = va;
memhdr->iova = pa;
memhdr->len = page_sz;
memhdr->free_cb = 0;
memhdr->opaque = 0;
STAILQ_INSERT_TAIL (&mp->mem_list, memhdr, next);
mp->nb_mem_chunks++;
}
}