Linux perf_event_open 简介

林运浩

2023-12-01

前言

perf 工具通过系统调用 perf_event_open 与内核交互，接下来我们主要来了解 perf_event_open 系统调用：

NAME
       perf_event_open - set up performance monitoring

从 perf_event_open 这个系统调用的名字就可以看出，perf 是与 event 联系在一起的。

perf_event_open 对系统中打开的event分配一个对应的perf_event结构，所有对event的操作都是围绕perf_event来展开的。

一、简介

#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

int perf_event_open(struct perf_event_attr *attr,
                    pid_t pid, int cpu, int group_fd,
                    unsigned long flags);

Glibc没有为这个系统调用提供包装器；使用syscall调用它：

/*
 * Thin wrapper around the raw perf_event_open system call, needed because
 * glibc does not provide one.
 *
 * hw_event: attributes describing the event to create
 * pid, cpu: which process/thread and which CPU to monitor
 * group_fd: fd of the group leader, or -1 to create a new group
 * flags:    bitwise OR of zero or more PERF_FLAG_* values
 *
 * Returns the new event file descriptor, or -1 on failure.
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                       int cpu, int group_fd, unsigned long flags)
{
    /* syscall() already yields a long; hand it back without narrowing to int. */
    return syscall(__NR_perf_event_open, hw_event, pid, cpu,
                   group_fd, flags);
}

给定一个参数列表，perf_event_open（）返回一个文件描述符fd，用于随后的系统调用（read、mmap、ioctl、prctl、fcntl等），通常用的是对 fd 进行 mmap、ioctl、read操作。
经过perf_event_open()调用以后返回perf_event对应的fd，后续的文件操作对应perf_fops：

/*
 * File operations behind the fd returned by perf_event_open(): read(),
 * poll(), ioctl() and mmap() on that fd are dispatched to these handlers.
 */
static const struct file_operations perf_fops = {
	.llseek			= no_llseek,
	.release		= perf_release,
	.read			= perf_read,	/* read() path, used to collect counting results */
	.poll			= perf_poll,
	.unlocked_ioctl		= perf_ioctl,	/* enable/disable and other event controls */
	.compat_ioctl		= perf_ioctl,	/* compat ioctls share the same handler */
	.mmap			= perf_mmap,	/* maps the buffer that sampling events write to */
	.fasync			= perf_fasync,
};

对perf_event_open（）的调用将创建一个文件描述符，用于测量性能信息。每个文件描述符对应于一个被测量的事件；这些可以组合在一起以同时测量多个事件。

perf_ioctl：事件可以通过两种方式启用和禁用：通过ioctl和prctl。禁用事件时，它不会计数或生成溢出，但会继续存在并保持其计数值。

/*
 * ioctl handler for perf event file descriptors.  This is an excerpt: the
 * rest of the switch and the code that uses func/flags are elided below.
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct perf_event *event = file->private_data;	/* perf_event stored on this file */
	void (*func)(struct perf_event *);		/* operation selected by cmd */
	u32 flags = arg;				/* ioctl argument narrowed to 32 bits */

	switch (cmd) {
	case PERF_EVENT_IOC_ENABLE:
		func = perf_event_enable;
		break;
	case PERF_EVENT_IOC_DISABLE:
		func = perf_event_disable;
		break;
	......
}

事件有两种类型：计数和采样。
perf_read：
计数事件是用于对发生的事件总数进行计数的事件。通常，计数事件结果是通过 read（）系统调用读取收集的。
perf stat 命令工作在计数模式。

perf_mmap：采样事件定期将测量值写入缓冲区，然后可以通过mmap访问该缓冲区。
perf record 命令工作在采样模式。

计数只是记录了event的发生次数，采样记录了大量信息(比如：IP、ADDR、TID、TIME、CPU、BT)。

Arguments：
struct perf_event_attr *attr是该系统调用最重要的参数，先介绍其他的参数。

pid和cpu参数允许指定要监视的进程和cpu：

pid==0，cpu==-1
它测量任何CPU上的调用进程/线程。

pid==0，cpu>=0
这仅在指定CPU上运行时测量调用进程/线程。

pid>0，cpu==-1
它测量任何CPU上指定的进程/线程。

pid>0和cpu>=0
仅当在指定的CPU上运行时，才会测量指定的进程/线程。

pid==-1，cpu>=0
这将测量指定CPU上的所有进程/线程。这需要具备CAP_SYS_ADMIN能力（capability），或者/proc/sys/kernel/perf_event_paranoid的值小于1。

pid==-1和cpu==-1
此设置无效，将返回错误。

group_fd参数允许创建事件组。每个事件组有一个组长（group leader）事件。首先以group_fd=-1创建组长，其余组成员通过随后的perf_event_open（）调用创建，并将group_fd设置为组长的文件描述符（以group_fd=-1单独创建的单个事件，被视为只有1个成员的组）。事件组作为一个整体调度到CPU上：只有当组中的所有事件都能放到CPU上时，整个组才会被放上去。这意味着成员事件的值之间可以进行有意义的比较——相加、相除（以获得比率）等等，因为它们是针对同一段已执行的指令进行计数的。

flags参数是通过对以下零个或多个值进行“或”运算形成的：

/* Bit values for the flags argument of perf_event_open(); OR together, or pass 0 for none. */
#define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#define PERF_FLAG_FD_OUTPUT			(1UL << 1)
#define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */

该参数通常不设置任何标志，即flags=0。

二、struct perf_event_attr

perf_event_attr结构为正在创建的事件提供详细的配置信息，如下所示：

/*
 * Configuration handed to perf_event_open() that describes the event to
 * create.  type and config select the event, the sample_* fields control
 * sampling, and the single-bit flags tune what is counted and which extra
 * records are generated.
 */
struct perf_event_attr {
               __u32 type;                 /* Type of event */
               __u32 size;                 /* Size of attribute structure */
               __u64 config;               /* Type-specific configuration */

               /* Which member applies is chosen by the freq bit below. */
               union {
                   __u64 sample_period;    /* Period of sampling */
                   __u64 sample_freq;      /* Frequency of sampling */
               };

               __u64 sample_type;  /* Specifies values included in sample */
               __u64 read_format;  /* Specifies values returned in read */

               __u64 disabled       : 1,   /* off by default */
                     inherit        : 1,   /* children inherit it */
                     pinned         : 1,   /* must always be on PMU */
                     exclusive      : 1,   /* only group on PMU */
                     exclude_user   : 1,   /* don't count user */
                     exclude_kernel : 1,   /* don't count kernel */
                     exclude_hv     : 1,   /* don't count hypervisor */
                     exclude_idle   : 1,   /* don't count when idle */
                     mmap           : 1,   /* include mmap data */
                     comm           : 1,   /* include comm data */
                     freq           : 1,   /* use freq, not period */
                     inherit_stat   : 1,   /* per task counts */
                     enable_on_exec : 1,   /* next exec enables */
                     task           : 1,   /* trace fork/exit */
                     watermark      : 1,   /* wakeup_watermark */
                     precise_ip     : 2,   /* skid constraint */
                     mmap_data      : 1,   /* non-exec mmap data */
                     sample_id_all  : 1,   /* sample_type all events */
                     exclude_host   : 1,   /* don't count in host */
                     exclude_guest  : 1,   /* don't count in guest */
                     exclude_callchain_kernel : 1,
                                           /* exclude kernel callchains */
                     exclude_callchain_user   : 1,
                                           /* exclude user callchains */
                     mmap2          :  1,  /* include mmap with inode data */
                     comm_exec      :  1,  /* flag comm events that are
                                              due to exec */
                     use_clockid    :  1,  /* use clockid for time fields */
                     context_switch :  1,  /* context switch data */
                     write_backward :  1,  /* Write ring buffer from end
                                              to beginning */
                     namespaces     :  1,  /* include namespaces data */
                     ksymbol        :  1,  /* include ksymbol events */
                     bpf_event      :  1,  /* include bpf events */
                     aux_output     :  1,  /* generate AUX records
                                              instead of events */
                     cgroup         :  1,  /* include cgroup events */
                     text_poke      :  1,  /* include text poke events */

                     __reserved_1   : 30;  /* 34 flag bits above + 30 = one 64-bit word */

               /* Which member applies is chosen by the watermark bit above. */
               union {
                   __u32 wakeup_events;    /* wakeup every n events */
                   __u32 wakeup_watermark; /* bytes before wakeup */
               };

               __u32     bp_type;          /* breakpoint type */

               union {
                   __u64 bp_addr;          /* breakpoint address */
                   __u64 kprobe_func;      /* for perf_kprobe */
                   __u64 uprobe_path;      /* for perf_uprobe */
                   __u64 config1;          /* extension of config */
               };

               union {
                   __u64 bp_len;           /* breakpoint length */
                   __u64 kprobe_addr;      /* with kprobe_func == NULL */
                   __u64 probe_offset;     /* for perf_[k,u]probe */
                   __u64 config2;          /* extension of config1 */
               };
               __u64 branch_sample_type;   /* enum perf_branch_sample_type */
               __u64 sample_regs_user;     /* user regs to dump on samples */
               __u32 sample_stack_user;    /* size of stack to dump on
                                              samples */
               __s32 clockid;              /* clock to use for time fields */
               __u64 sample_regs_intr;     /* regs to dump on samples */
               __u32 aux_watermark;        /* aux bytes before wakeup */
               __u16 sample_max_stack;     /* max frames in callchain */
               __u16 __reserved_2;         /* align to u64 */

           };

perf_event_attr结构的字段如下所述：

2.1 type

此字段指定整个事件类型。它具有以下值之一：

PERF_TYPE_HARDWARE
        这表示内核提供的“通用”硬件事件之一。有关详细信息，请参阅 config 字段定义。

PERF_TYPE_SOFTWARE
       这表示内核提供的软件定义事件之一（即使没有可用的硬件支持）。

PERF_TYPE_TRACEPOINT
       This indicates a tracepoint provided by the kernel tracepoint infrastructure.

PERF_TYPE_HW_CACHE
       这表示硬件缓存事件。它使用特殊的编码方式，详见 config 字段定义（2.3.4 小节）。

PERF_TYPE_RAW
       这表示 config 字段中的“raw”实现特定事件。

PERF_TYPE_BREAKPOINT (since Linux 2.6.33)
       这表示CPU提供的硬件断点。断点可以是对地址的读/写访问以及指令地址的执行。

dynamic PMU
       从Linux 2.6.38开始，perf_event_open（）可以支持多个PMU。要启用此功能，可以在 type 字段中使用内核导出的值来指示要使用的PMU。要使用的值可以在sysfs文件系统中找到：/sys/bus/event_source/devices下的每个PMU实例都有一个子目录，每个子目录中都有一个名为 type 的文件，其内容是可以在 type 字段中使用的整数。例如，/sys/bus/event_source/devices/cpu/type包含核心cpu PMU的值，通常为4。

kprobe and uprobe (since Linux 4.17)
       这两个动态PMU创建一个kprobe/uprobe，并将其附加到perf_event_open生成的文件描述符。kprobe/uprobe将在销毁文件描述符时销毁。有关详细信息，请参见字段kprobe_func、uprobe_path、kprobe_addr和probe_offset。

/*
 * attr.type
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE			= 0,	/* generalized hardware event */
	PERF_TYPE_SOFTWARE			= 1,	/* software-defined event */
	PERF_TYPE_TRACEPOINT			= 2,	/* kernel tracepoint */
	PERF_TYPE_HW_CACHE			= 3,	/* hardware CPU cache event */
	PERF_TYPE_RAW				= 4,	/* raw, implementation-specific event */
	PERF_TYPE_BREAKPOINT			= 5,	/* hardware breakpoint */

	PERF_TYPE_MAX,				/* non-ABI */
};

2.2 size

perf_event_attr结构的大小，用于向前/向后兼容。使用sizeof（struct perf_event_attr）设置此值，以便内核知道程序编译时该结构的大小。
即perf_event_attr.size = sizeof（struct perf_event_attr）。

2.3 config

这将与 type 字段一起指定所需的事件。设置 config 字段的方法多种多样，取决于前面描述的 type 字段的值。以下是按 type 区分的 config 的各种可能设置。

2.3.1 PERF_TYPE_HARDWARE

如果类型为PERF_TYPE_HARDWARE，将测量一个通用硬件CPU事件。并非所有平台都有这些功能。将config设置为以下值之一：

PERF_COUNT_HW_CPU_CYCLES
     Total cycles.  Be wary of what happens during
     CPU frequency scaling.

PERF_COUNT_HW_INSTRUCTIONS
       Retired instructions.  Be careful, these can
       be affected by various issues, most notably
       hardware interrupt counts.

PERF_COUNT_HW_CACHE_REFERENCES
       Cache accesses.  Usually this indicates Last
       Level Cache accesses but this may vary
       depending on your CPU.  This may include
       prefetches and coherency messages; again this
       depends on the design of your CPU.

PERF_COUNT_HW_CACHE_MISSES
       Cache misses.  Usually this indicates Last
       Level Cache misses; this is intended to be
       used in conjunction with the
       PERF_COUNT_HW_CACHE_REFERENCES event to
       calculate cache miss rates.

PERF_COUNT_HW_BRANCH_INSTRUCTIONS
       Retired branch instructions.  Prior to Linux
       2.6.35, this used the wrong event on AMD
       processors.

PERF_COUNT_HW_BRANCH_MISSES
       Mispredicted branch instructions.

PERF_COUNT_HW_BUS_CYCLES
       Bus cycles, which can be different from total
       cycles.

PERF_COUNT_HW_STALLED_CYCLES_FRONTEND (since Linux
3.0)
       Stalled cycles during issue.

PERF_COUNT_HW_STALLED_CYCLES_BACKEND (since Linux
3.0)
       Stalled cycles during retirement.

PERF_COUNT_HW_REF_CPU_CYCLES (since Linux 3.3)
       Total cycles; not affected by CPU frequency
       scaling.

/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open()
 * syscall:
 */
enum perf_hw_id {
	/*
	 * Common hardware events, generalized by the kernel:
	 */
	PERF_COUNT_HW_CPU_CYCLES		= 0,	/* total cycles */
	PERF_COUNT_HW_INSTRUCTIONS		= 1,	/* retired instructions */
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,	/* cache accesses, usually last-level */
	PERF_COUNT_HW_CACHE_MISSES		= 3,	/* cache misses, usually last-level */
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,	/* retired branch instructions */
	PERF_COUNT_HW_BRANCH_MISSES		= 5,	/* mispredicted branch instructions */
	PERF_COUNT_HW_BUS_CYCLES		= 6,	/* bus cycles */
	PERF_COUNT_HW_STALLED_CYCLES_FRONTEND	= 7,	/* stalled cycles during issue */
	PERF_COUNT_HW_STALLED_CYCLES_BACKEND	= 8,	/* stalled cycles during retirement */
	PERF_COUNT_HW_REF_CPU_CYCLES		= 9,	/* total cycles, unaffected by frequency scaling */

	PERF_COUNT_HW_MAX,			/* non-ABI */
};

2.3.2 PERF_TYPE_SOFTWARE

如果类型是PERF_TYPE_SOFTWARE，将测量内核提供的软件事件。将config设置为以下值之一：

PERF_COUNT_SW_CPU_CLOCK
       This reports the CPU clock, a high-resolution per-CPU timer.

PERF_COUNT_SW_TASK_CLOCK
       This reports a clock count specific to the task that is running.

PERF_COUNT_SW_PAGE_FAULTS
       This reports the number of page faults.

PERF_COUNT_SW_CONTEXT_SWITCHES
       This counts context switches.  Until Linux 2.6.34, these were all reported as user-space events, after that they are reported as happening in the kernel.

PERF_COUNT_SW_CPU_MIGRATIONS
       This reports the number of times the process has migrated to a new CPU.

PERF_COUNT_SW_PAGE_FAULTS_MIN
       This counts the number of minor page faults.  These did not require disk I/O to handle.

PERF_COUNT_SW_PAGE_FAULTS_MAJ
       This counts the number of major page faults.  These required disk I/O to handle.

PERF_COUNT_SW_ALIGNMENT_FAULTS (since Linux 2.6.33)
       This  counts  the number of alignment faults.  These happen when unaligned memory accesses happen; the kernel can handle these but it reduces performance.  This
       happens only on some architectures (never on x86).

PERF_COUNT_SW_EMULATION_FAULTS (since Linux 2.6.33)
       This counts the number of emulation faults.  The kernel sometimes traps on unimplemented instructions and emulates them for user  space.   This  can  negatively
       impact performance.

PERF_COUNT_SW_DUMMY (since Linux 3.12)
       This  is  a  placeholder event that counts nothing.  Informational sample record types such as mmap or comm must be associated with an active event.  This dummy
       event allows gathering such records without requiring a counting event.

/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and sw events of the kernel (and allow the profiling of them as
 * well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK			= 0,	/* high-resolution per-CPU timer */
	PERF_COUNT_SW_TASK_CLOCK		= 1,	/* clock count of the running task */
	PERF_COUNT_SW_PAGE_FAULTS		= 2,	/* page faults */
	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,	/* context switches */
	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,	/* migrations to a new CPU */
	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,	/* minor page faults (no disk I/O) */
	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,	/* major page faults (disk I/O needed) */
	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,	/* unaligned-access faults */
	PERF_COUNT_SW_EMULATION_FAULTS		= 8,	/* emulated unimplemented instructions */
	PERF_COUNT_SW_DUMMY			= 9,	/* placeholder that counts nothing */
	PERF_COUNT_SW_BPF_OUTPUT		= 10,

	PERF_COUNT_SW_MAX,			/* non-ABI */
};

2.3.3 PERF_TYPE_TRACEPOINT

如果类型是PERF_TYPE_TRACEPOINT，那么测量的是内核跟踪点。如果内核中启用了ftrace，则可以从debugfs文件系统的以下路径中获取 config 字段要使用的值：

tracing/events/*/*/id

2.3.4 PERF_TYPE_HW_CACHE

如果类型为PERF_TYPE_HW_CACHE，则正在测量硬件CPU缓存事件。要计算适当的配置值，请使用以下公式：

(perf_hw_cache_id) | (perf_hw_cache_op_id << 8) |
                      (perf_hw_cache_op_result_id << 16)

where perf_hw_cache_id is one of:

    PERF_COUNT_HW_CACHE_L1D
           for measuring Level 1 Data Cache

    PERF_COUNT_HW_CACHE_L1I
           for measuring Level 1 Instruction Cache

    PERF_COUNT_HW_CACHE_LL
           for measuring Last-Level Cache

    PERF_COUNT_HW_CACHE_DTLB
           for measuring the Data TLB

    PERF_COUNT_HW_CACHE_ITLB
           for measuring the Instruction TLB

    PERF_COUNT_HW_CACHE_BPU
           for measuring the branch prediction unit

    PERF_COUNT_HW_CACHE_NODE (since Linux 3.1)
           for measuring local memory accesses

and perf_hw_cache_op_id is one of:

    PERF_COUNT_HW_CACHE_OP_READ
           for read accesses

    PERF_COUNT_HW_CACHE_OP_WRITE
           for write accesses

    PERF_COUNT_HW_CACHE_OP_PREFETCH
           for prefetch accesses

and perf_hw_cache_op_result_id is one of:

    PERF_COUNT_HW_CACHE_RESULT_ACCESS
           to measure accesses

    PERF_COUNT_HW_CACHE_RESULT_MISS
           to measure misses

/*
 * Generalized hardware cache events:
 *
 *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x
 *       { read, write, prefetch } x
 *       { accesses, misses }
 */
/* Which cache is measured; goes into the low bits of config, unshifted. */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D			= 0,	/* level 1 data cache */
	PERF_COUNT_HW_CACHE_L1I			= 1,	/* level 1 instruction cache */
	PERF_COUNT_HW_CACHE_LL			= 2,	/* last-level cache */
	PERF_COUNT_HW_CACHE_DTLB		= 3,	/* data TLB */
	PERF_COUNT_HW_CACHE_ITLB		= 4,	/* instruction TLB */
	PERF_COUNT_HW_CACHE_BPU			= 5,	/* branch prediction unit */
	PERF_COUNT_HW_CACHE_NODE		= 6,	/* local memory accesses */

	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
};

/* Kind of access measured; shifted left by 8 when building config. */
enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ		= 0,	/* read accesses */
	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,	/* write accesses */
	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,	/* prefetch accesses */

	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
};

/* Outcome measured; shifted left by 16 when building config. */
enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,	/* count accesses */
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,	/* count misses */

	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};

2.3.5 其他类型

PERF_TYPE_RAW
PERF_TYPE_BREAKPOINT
kprobe or uprobe

三、sample相关参数

3.1 sample_period

“采样”事件是每N个事件生成一个溢出通知的事件，其中N由sample_period给出。采样事件的sample_period>0。发生溢出时，请求的数据将记录在mmap缓冲区中。sample_type字段控制每次溢出时记录的数据。

3.2 sample_freq

如果希望使用频率而不是周期，可以使用sample_freq。在这种情况下，需要设置freq标志。内核会调整采样周期以尝试达到所需的速率，调整的频度为每个计时器节拍（timer tick）一次。

3.3 sample_type

此字段中的各个位指定要包含在样本中的值。它们将被记录在一个环形缓冲区中，用户空间可以通过mmap访问该缓冲区。各个值在样本中的保存顺序记录在下面的 MMAP Layout 小节中；它不是enum perf_event_sample_format中的顺序。

/*
 * Bits that can be set in attr.sample_type to request information
 * in the overflow packets.
 */
enum perf_event_sample_format {
	/* One bit per optional sample field; OR the wanted bits into attr.sample_type. */
	PERF_SAMPLE_IP				= 1U << 0,
	PERF_SAMPLE_TID				= 1U << 1,
	PERF_SAMPLE_TIME			= 1U << 2,
	PERF_SAMPLE_ADDR			= 1U << 3,
	PERF_SAMPLE_READ			= 1U << 4,
	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
	PERF_SAMPLE_ID				= 1U << 6,
	PERF_SAMPLE_CPU				= 1U << 7,
	PERF_SAMPLE_PERIOD			= 1U << 8,
	PERF_SAMPLE_STREAM_ID			= 1U << 9,
	PERF_SAMPLE_RAW				= 1U << 10,
	PERF_SAMPLE_BRANCH_STACK		= 1U << 11,
	PERF_SAMPLE_REGS_USER			= 1U << 12,
	PERF_SAMPLE_STACK_USER			= 1U << 13,
	PERF_SAMPLE_WEIGHT			= 1U << 14,
	PERF_SAMPLE_DATA_SRC			= 1U << 15,
	PERF_SAMPLE_IDENTIFIER			= 1U << 16,
	PERF_SAMPLE_TRANSACTION			= 1U << 17,
	PERF_SAMPLE_REGS_INTR			= 1U << 18,
	PERF_SAMPLE_PHYS_ADDR			= 1U << 19,

	/* First bit past the last flag defined in this listing. */
	PERF_SAMPLE_MAX = 1U << 20,		/* non-ABI */
};

四、其他重要参数

4.1 read_format

此字段指定通过 read 系统调用读取perf_event_open（）文件描述符时返回数据的格式。

4.2 disabled

该位指定计数器开始时是禁用还是启用。如果disabled = 1，则可以稍后通过ioctl、prctl或enable_on_exec启用该事件。
创建事件组时，通常将组长的disabled初始化为1，将所有子事件的disabled初始化为0。尽管子事件的disabled为0，但在组长被启用（enable）之前，子事件不会开始计数。

4.3 exclude类型

exclude_user
       If this bit is set, the count excludes events that happen in user space.

exclude_kernel
       If this bit is set, the count excludes events that happen in kernel space.

exclude_hv
       If this bit is set, the count excludes events that happen in the hypervisor.  This is mainly for PMUs that have built-in support for handling this (such as  POWER).   Extra
       support is needed for handling hypervisor measurements on most machines.

exclude_host (since Linux 3.2)
       When  conducting  measurements  that include processes running VM instances (i.e., have executed a KVM_RUN ioctl(2)), only measure events happening inside a guest instance.
       This is only meaningful outside the guests; this setting does not change counts gathered inside of a guest.  Currently, this functionality is x86 only.

exclude_guest (since Linux 3.2)
       When conducting measurements that include processes running VM instances (i.e., have executed a KVM_RUN ioctl(2)), do not measure events happening inside  guest  instances.
       This is only meaningful outside the guests; this setting does not change counts gathered inside of a guest.  Currently, this functionality is x86 only.

4.4 freq

如果设置了该位，则在设置采样间隔时使用sample_freq而不是sample_period。

4.5 enable_on_exec

如果设置了此位，则在调用exec系统调用后会自动启用计数器。

五、MMAP layout

在采样模式下使用perf_event_open（）时，异步事件（如计数器溢出或PROT_EXEC mmap跟踪）会记录到环形缓冲区中。这个环形缓冲区是通过mmap创建和访问的。

mmap大小应为1+2^n页，其中第一页是元数据页（struct perf_event_mmap_page），其中包含各种信息位，例如环形缓冲区头的位置。

第一个元数据mmap页面的结构如下：

/*
 * Metadata page at the start of a perf event mmap area.  The fixed header
 * is padded to 1 KiB, so the ring-buffer control fields (data_head and the
 * fields after it) start at byte offset 1024.
 *
 * NOTE(review): this mirrors the layout printed in perf_event_open(2); newer
 * kernel headers may carve further fields out of __reserved -- check the
 * <linux/perf_event.h> of the target kernel before relying on offsets.
 */
struct perf_event_mmap_page {
	__u32 version;        /* version number of this structure */
	__u32 compat_version; /* lowest version this is compat with */
	__u32 lock;           /* seqlock for synchronization */
	__u32 index;          /* hardware counter identifier */
	__s64 offset;         /* add to hardware counter value */
	__u64 time_enabled;   /* time event active */
	__u64 time_running;   /* time event on CPU */
	union {
	    __u64   capabilities;     /* all capability bits as one word */
	    struct {
	        /* Bit 0 was listed as cap_usr_time / cap_usr_rdpmc / cap_bit0. */
	        __u64 cap_bit0               : 1,
	              cap_bit0_is_deprecated : 1,
	              cap_user_rdpmc         : 1,
	              cap_user_time          : 1,
	              cap_user_time_zero     : 1,
	              cap_____res            : 59; /* fill up the 64-bit word */
	    };
	};
	__u16 pmc_width;
	__u16 time_shift;
	__u32 time_mult;
	__u64 time_offset;
	__u64 __reserved[120];   /* Pad to 1 k */
	__u64 data_head;         /* head in the data section */
	__u64 data_tail;         /* user-space written tail */
	__u64 data_offset;       /* where the buffer starts */
	__u64 data_size;         /* data buffer size */
	__u64 aux_head;          /* AUX area counterparts of the four fields above */
	__u64 aux_tail;
	__u64 aux_offset;
	__u64 aux_size;
};

以下描述了perf_event_mmap_page结构中的几个字段：
data_head：指向数据区的头部。该值持续递增，不会自动回绕；在访问样本之前，需要手动按mmap缓冲区的大小对其取模。
data_tail：当映射带有PROT_WRITE时，data_tail的值应由用户空间写入，以反映最后读取到的数据位置。在这种情况下，内核不会覆盖未读数据。
data_offset (since Linux 4.1)：包含mmap缓冲区中性能样本数据开始位置的偏移量。
data_size (since Linux 4.1)：包含mmap缓冲区中perf样本区域的大小。

aux_head, aux_tail, aux_offset, aux_size (since Linux 4.1)：
AUX区域允许为 high-bandwidth 数据流合成单独的采样缓冲区（与主性能采样缓冲区分开）。high-bandwidth 流的一个例子是指令跟踪支持，这在较新的英特尔处理器中可以找到。

要设置AUX区域，首先需要将aux_offset设置为大于data_offset+data_size的偏移，并将aux_size设置为所需的缓冲区大小。所需的偏移量和大小必须按页对齐，并且大小必须是2的幂。然后将这些值传递给mmap，以映射AUX缓冲区。AUX缓冲区中的页面计入RLIMIT_MEMLOCK资源限制（请参见setrlimit），同时也计入perf_event_mlock_kb的配额。

默认情况下，如果AUX数据无法放入环形缓冲区的可用空间，它将被截断。如果AUX缓冲区被映射为只读缓冲区，那么它将以环形缓冲区（覆盖）模式运行，旧数据会被新数据覆盖。在覆盖模式下，可能无法推断新数据的起始位置，因此需要由使用者在读取时禁用测量，以避免可能的数据竞争。

aux_head和aux_tail环形缓冲区指针具有与前面描述的data_head和data_tail相同的行为和排序规则。

PERF_RECORD_AUX (since Linux 4.1)

This record reports that new data is available in the separate AUX buffer region.

    struct {
        struct perf_event_header header;
        u64    aux_offset;
        u64    aux_size;
        u64    flags;
        struct sample_id sample_id;
    };

aux_offset
       offset in the AUX mmap region where the new data begins.

aux_size
       size of the data made available.

flags  describes the AUX update.

       PERF_AUX_FLAG_TRUNCATED
              if set, then the data returned was truncated to fit the available buffer size.

       PERF_AUX_FLAG_OVERWRITE
              if set, then the data returned has overwritten previous data.

六、Overflow handling

事件可以设置为在超过阈值时发出通知，表示发生了溢出。可以通过使用poll、select或epoll监视事件文件描述符来捕获溢出条件。或者，通过在文件描述符上启用I/O信号，可以用信号处理程序捕获溢出事件；请参见fcntl中F_SETOWN和F_SETSIG操作的讨论。

溢出仅由采样事件生成（sample_period必须具有非零值）。

有两种方法可以生成溢出通知。

第一种方法是设置一个wakeup_events或wakeup_watermark值，如果向mmap环形缓冲区写入了一定数量的样本或字节，则会触发该值。在这种情况下，指示POLL_IN。

另一种方法是使用PERF_EVENT_IOC_REFRESH ioctl。此ioctl添加到一个计数器，该计数器在每次事件溢出时递减。当非零时，指示POLL_IN，但一旦计数器达到0，则指示POLL_HUP，并禁用基础事件。

刷新事件 group leader 将刷新所有 siblings。

从Linux 3.18开始，如果正在监视的事件附加到另一个进程并且该进程退出，则指示POLL_HUP。

七、perf_event ioctl calls

各种ioctl作用于perf_event_open（）文件描述符：

PERF_EVENT_IOC_ENABLE
       This enables the individual event or event group specified
       by the file descriptor argument.

       If the PERF_IOC_FLAG_GROUP bit is set in the ioctl
       argument, then all events in a group are enabled, even if
       the event specified is not the group leader (but see
       BUGS).

PERF_EVENT_IOC_DISABLE
       This disables the individual counter or event group
       specified by the file descriptor argument.

       Enabling or disabling the leader of a group enables or
       disables the entire group; that is, while the group leader
       is disabled, none of the counters in the group will count.
       Enabling or disabling a member of a group other than the
       leader affects only that counter; disabling a non-leader
       stops that counter from counting but doesn't affect any
       other counter.

       If the PERF_IOC_FLAG_GROUP bit is set in the ioctl
       argument, then all events in a group are disabled, even if
       the event specified is not the group leader (but see
       BUGS).

PERF_EVENT_IOC_REFRESH
       Non-inherited overflow counters can use this to enable a
       counter for a number of overflows specified by the
       argument, after which it is disabled.  Subsequent calls of
       this ioctl add the argument value to the current count.
       An overflow notification with POLL_IN set will happen on
       each overflow until the count reaches 0; when that happens
       a notification with POLL_HUP set is sent and the event is
       disabled.  Using an argument of 0 is considered undefined
       behavior.

PERF_EVENT_IOC_RESET
       Reset the event count specified by the file descriptor
       argument to zero.  This resets only the counts; there is
       no way to reset the multiplexing time_enabled or
       time_running values.

       If the PERF_IOC_FLAG_GROUP bit is set in the ioctl
       argument, then all events in a group are reset, even if
       the event specified is not the group leader (but see
       BUGS).

PERF_EVENT_IOC_PERIOD
       This updates the overflow period for the event.

       Since Linux 3.7 (on ARM) and Linux 3.14 (all other
       architectures), the new period takes effect immediately.
       On older kernels, the new period did not take effect until
       after the next overflow.

       The argument is a pointer to a 64-bit value containing the
       desired new period.

       Prior to Linux 2.6.36, this ioctl always failed due to a
       bug in the kernel.

PERF_EVENT_IOC_SET_OUTPUT
       This tells the kernel to report event notifications to the
       specified file descriptor rather than the default one.
       The file descriptors must all be on the same CPU.

       The argument specifies the desired file descriptor, or -1
       if output should be ignored.

PERF_EVENT_IOC_SET_FILTER (since Linux 2.6.33)
       This adds an ftrace filter to this event.

       The argument is a pointer to the desired ftrace filter.

PERF_EVENT_IOC_ID (since Linux 3.12)
       This returns the event ID value for the given event file
       descriptor.

       The argument is a pointer to a 64-bit unsigned integer to
       hold the result.

PERF_EVENT_IOC_SET_BPF (since Linux 4.1)
       This allows attaching a Berkeley Packet Filter (BPF)
       program to an existing kprobe tracepoint event.  You need
       CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN privileges
       to use this ioctl.

       The argument is a BPF program file descriptor that was
       created by a previous bpf(2) system call.

PERF_EVENT_IOC_PAUSE_OUTPUT (since Linux 4.7)
       This allows pausing and resuming the event's ring-buffer.
       A paused ring-buffer does not prevent generation of
       samples, but simply discards them.  The discarded samples
       are considered lost, and cause a PERF_RECORD_LOST sample
       to be generated when possible.  An overflow signal may
       still be triggered by the discarded sample even though the
       ring-buffer remains empty.

       The argument is an unsigned 32-bit integer.  A nonzero
       value pauses the ring-buffer, while a zero value resumes
       the ring-buffer.

PERF_EVENT_IOC_MODIFY_ATTRIBUTES (since Linux 4.17)
       This allows modifying an existing event without the
       overhead of closing and reopening a new event.  Currently
       this is supported only for breakpoint events.

       The argument is a pointer to a perf_event_attr structure
       containing the updated event settings.

PERF_EVENT_IOC_QUERY_BPF (since Linux 4.16)
       This allows querying which Berkeley Packet Filter (BPF)
       programs are attached to an existing kprobe tracepoint.
       You can only attach one BPF program per event, but you can
       have multiple events attached to a tracepoint.  Querying
       this value on one tracepoint event returns the ID of all
       BPF programs in all events attached to the tracepoint.
       You need CAP_PERFMON (since Linux 5.8) or CAP_SYS_ADMIN
       privileges to use this ioctl.

       The argument is a pointer to a structure
           struct perf_event_query_bpf {
               __u32    ids_len;    /* number of IDs that fit in ids[] */
               __u32    prog_cnt;   /* filled in by the kernel: attached BPF programs */
               __u32    ids[];      /* filled in by the kernel: ID of each program */
           };

       The ids_len field indicates the number of ids that can fit
       in the provided ids array.  The prog_cnt value is filled
       in by the kernel with the number of attached BPF programs.
       The ids array is filled with the ID of each attached BPF
       program.  If there are more programs than will fit in the
       array, then the kernel will return ENOSPC and ids_len will
       indicate the number of program IDs that were successfully
       copied.

/*
 * Ioctls that can be done on a perf event fd:
 *
 * Every command uses ioctl type '$'; the second macro argument is the
 * command number (0..10 here).  The _IOW/_IOR/_IOWR forms additionally
 * name the type of the ioctl argument.
 */
#define PERF_EVENT_IOC_ENABLE		_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE		_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH		_IO ('$', 2)
#define PERF_EVENT_IOC_RESET		_IO ('$', 3)
#define PERF_EVENT_IOC_PERIOD		_IOW('$', 4, __u64)
#define PERF_EVENT_IOC_SET_OUTPUT	_IO ('$', 5)
#define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
#define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
#define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
#define PERF_EVENT_IOC_QUERY_BPF	_IOWR('$', 10, struct perf_event_query_bpf *)

八、demo

下面是硬件类型计数（count）的一个demo：

#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <asm/unistd.h>

/*
 * Thin wrapper around the raw perf_event_open system call, needed because
 * glibc does not provide one.
 *
 * Returns the new event file descriptor, or -1 on failure.
 */
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                int cpu, int group_fd, unsigned long flags)
{
    /* syscall() already yields a long; hand it back without narrowing to int. */
    return syscall(__NR_perf_event_open, hw_event, pid, cpu,
                   group_fd, flags);
}

/*
 * Count the user-space instructions retired by the calling process while it
 * executes a single printf(), then print the result.
 *
 * Exits with EXIT_FAILURE if the event cannot be opened, controlled or read.
 */
int main(int argc, char **argv)
{
    struct perf_event_attr pe;
    long long count = 0;
    ssize_t nread;
    int fd;

    (void)argc;
    (void)argv;

    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HARDWARE;
    pe.size = sizeof(pe);
    pe.config = PERF_COUNT_HW_INSTRUCTIONS;
    pe.disabled = 1;        /* created stopped; enabled explicitly below */
    pe.exclude_kernel = 1;  /* user space only */
    pe.exclude_hv = 1;

    /* pid == 0, cpu == -1: the calling process on any CPU; -1: no group. */
    fd = (int)perf_event_open(&pe, 0, -1, -1, 0);
    if (fd == -1) {
        fprintf(stderr, "Error opening leader %llx\n",
                (unsigned long long)pe.config);
        exit(EXIT_FAILURE);
    }

    if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1 ||
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
        perror("ioctl");
        close(fd);
        exit(EXIT_FAILURE);
    }

    printf("Measuring instruction count for this printf\n");

    if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
        perror("ioctl");
        close(fd);
        exit(EXIT_FAILURE);
    }

    /* Without this check a failed read would leave count meaningless. */
    nread = read(fd, &count, sizeof(count));
    if (nread != (ssize_t)sizeof(count)) {
        if (nread == -1)
            perror("read");
        else
            fprintf(stderr, "Short read from perf event fd\n");
        close(fd);
        exit(EXIT_FAILURE);
    }

    printf("Used %lld instructions\n", count);

    close(fd);
    return 0;
}

结果：

Measuring instruction count for this printf
Used 3489 instructions

参考资料

man perf_event_open
https://pwl999.blog.csdn.net/article/details/81200439