详解 ARM PMU (Performance Monitoring Unit)

上官砚文

2023-12-01

本文会详细讲解ARM PMU模块原理及代码流程

kernel version=4.14.90, arch=arm64

初始化流程：

arch/arm64/kernel/perf_event.c

device_initcall(armv8_pmu_driver_init)

这里使用device_initcall调用进行函数的初始化流程，关于device_initcall的定义及实现流程再额外研究，这里简单列一下initcall系列函数的调用顺序

include/linux/init.h

/*
 * initcall registration macros. Each one forwards fn to
 * __define_initcall() together with a level id: 0-7, "rootfs", or an
 * "s"-suffixed id for the *_sync variants.
 * NOTE(review): the surrounding text presents this list as the call
 * order (lower ids first, rootfs between 5s and 6) -- confirm against
 * the __define_initcall() definition, which is not shown here.
 */
#define pure_initcall(fn)        __define_initcall(fn, 0)

#define core_initcall(fn)        __define_initcall(fn, 1)
#define core_initcall_sync(fn)        __define_initcall(fn, 1s)
#define postcore_initcall(fn)        __define_initcall(fn, 2)
#define postcore_initcall_sync(fn)    __define_initcall(fn, 2s)
#define arch_initcall(fn)        __define_initcall(fn, 3)
#define arch_initcall_sync(fn)        __define_initcall(fn, 3s)
#define subsys_initcall(fn)        __define_initcall(fn, 4)
#define subsys_initcall_sync(fn)    __define_initcall(fn, 4s)
#define fs_initcall(fn)            __define_initcall(fn, 5)
#define fs_initcall_sync(fn)        __define_initcall(fn, 5s)
#define rootfs_initcall(fn)        __define_initcall(fn, rootfs)
/* Level 6: the level used by armv8_pmu_driver_init (device_initcall). */
#define device_initcall(fn)        __define_initcall(fn, 6)
#define device_initcall_sync(fn)    __define_initcall(fn, 6s)
#define late_initcall(fn)        __define_initcall(fn, 7)
#define late_initcall_sync(fn)        __define_initcall(fn, 7s)

在initcall被调用之后就开始进入主初始化函数，这里并不是区分ARM/X86架构，而是根据是否启用ACPI（acpi_disabled）区分了两条路径：未启用ACPI时注册platform driver，启用ACPI时走arm_pmu_acpi_probe

/*
 * Driver entry point, registered through device_initcall().
 *
 * Selects the probe path from the acpi_disabled flag:
 *  - ACPI disabled: register armv8_pmu_driver as a platform driver;
 *  - ACPI enabled:  call arm_pmu_acpi_probe(), passing armv8_pmuv3_init.
 *
 * Returns the result of whichever call was made.
 */
static int __init armv8_pmu_driver_init(void)
{
	if (acpi_disabled)
		return platform_driver_register(&armv8_pmu_driver);
	else
		return arm_pmu_acpi_probe(armv8_pmuv3_init);
}

在未启用ACPI（acpi_disabled为真，通常即通过设备树启动）的arm64平台上，走的是platform_driver_register(&armv8_pmu_driver)：ARM架构的PMU使用platform_driver，挂在虚拟的platform总线上，关于platform_driver再额外研究

接下来我们来看armv8_pmu_driver，这里定义出platform driver所必须的.driver和.probe回调函数；platform driver注册完成后，总线会依次为每个匹配上的设备调用对应driver中的probe回调。

接着就会调用PMU的通用driver，arm_pmu_device_probe函数定义在drivers/perf/arm_pmu_platform.c中

/*
 * Probe callback of the ARMv8 PMU platform driver.
 *
 * Delegates to the shared arm_pmu_device_probe(), passing the platform
 * device, the armv8_pmu_of_device_ids table and NULL as the third
 * argument, and returns its result unchanged.
 */
static int armv8_pmu_device_probe(struct platform_device *pdev)
{
	return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL);
}

这里需要说明一下PMU的驱动结构

分为两个大部分

1) 位于drivers/perf下的通用driver

2) 与架构相关的，定义在arch/arm64/下的，与指令集相关的，perf_event.c / perf_event_v8.c 等等

arm_pmu_device_probe函数主要做了以下几个事情

/*
 * Main calls made by arm_pmu_device_probe(), as listed by the article.
 * Declarations and the checks on each `ret` are not shown in this excerpt.
 */
/* Allocate the struct arm_pmu. */
pmu = armpmu_alloc();
/* NOTE(review): presumably collects the PMU interrupts -- callee not shown, confirm. */
ret = pmu_parse_irqs(pmu);
/* Run the init function selected for this device on the new pmu. */
ret = init_fn(pmu);
/* NOTE(review): presumably requests the interrupts parsed above -- confirm. */
ret = armpmu_request_irqs(pmu);
/* NOTE(review): presumably registers the pmu with the perf core -- confirm. */
ret = armpmu_register(pmu);

armpmu_alloc

/*
 * Excerpt from armpmu_alloc(): allocate a zeroed struct arm_pmu and its
 * per-CPU pmu_hw_events storage.
 * NOTE(review): no allocation-failure checks are shown in this excerpt.
 */
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
pmu->hw_events = alloc_percpu(struct pmu_hw_events);
/* Fill the embedded generic struct pmu with the armpmu_* callbacks. */
pmu->pmu = (struct pmu) {
	.pmu_enable	= armpmu_enable,
	.pmu_disable	= armpmu_disable,
	.event_init	= armpmu_event_init,
	.add		= armpmu_add,
	.del		= armpmu_del,
	.start		= armpmu_start,
	.stop		= armpmu_stop,
	.read		= armpmu_read,
	.filter_match	= armpmu_filter_match,
	/* Points at the attr_groups array stored inside struct arm_pmu. */
	.attr_groups	= pmu->attr_groups,
	/*
	 * This is a CPU PMU potentially in a heterogeneous
	 * configuration (e.g. big.LITTLE). This is not an uncore PMU,
	 * and we have taken ctx sharing into account (e.g. with our
	 * pmu::filter_match callback and pmu::event_init group
	 * validation).
	 */
	.capabilities	= PERF_PMU_CAP_HETEROGENEOUS_CPUS,
};

/*
 * For every possible CPU, initialise that CPU's pmu_hw_events:
 * set up its pmu_lock and store a back-pointer to this arm_pmu.
 */
for_each_possible_cpu(cpu) {
	struct pmu_hw_events *events;

	events = per_cpu_ptr(pmu->hw_events, cpu);
	raw_spin_lock_init(&events->pmu_lock);
	events->percpu_pmu = pmu;
}

这里需要粘贴一下PMU的结构体

/*
 * struct arm_pmu - ARM-side PMU descriptor.
 *
 * Embeds the generic struct pmu as its first member and adds a table of
 * function-pointer hooks (IRQ handling, per-event enable/disable,
 * counter index management, counter read/write, start/stop/reset,
 * event mapping and filtering) together with bookkeeping fields.
 */
struct arm_pmu {
	/* Embedded generic PMU object. */
	struct pmu	pmu;
	cpumask_t	active_irqs;
	cpumask_t	supported_cpus;
	char		*name;
	irqreturn_t	(*handle_irq)(int irq_num, void *dev);
	void		(*enable)(struct perf_event *event);
	void		(*disable)(struct perf_event *event);
	int		(*get_event_idx)(struct pmu_hw_events *hw_events,
					 struct perf_event *event);
	void		(*clear_event_idx)(struct pmu_hw_events *hw_events,
					 struct perf_event *event);
	int		(*set_event_filter)(struct hw_perf_event *evt,
					    struct perf_event_attr *attr);
	/* Counter values are read and written as 32-bit quantities. */
	u32		(*read_counter)(struct perf_event *event);
	void		(*write_counter)(struct perf_event *event, u32 val);
	void		(*start)(struct arm_pmu *);
	void		(*stop)(struct arm_pmu *);
	void		(*reset)(void *);
	int		(*map_event)(struct perf_event *event);
	int		(*filter_match)(struct perf_event *event);
	int		num_events;
	u64		max_period;
	bool		secure_access; /* 32-bit ARM only */
	/* pmceid_bitmap holds ARMV8_PMUV3_MAX_COMMON_EVENTS (0x40) bits. */
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
	DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
	struct platform_device	*plat_device;
	/* Per-CPU event state. */
	struct pmu_hw_events	__percpu *hw_events;
	struct hlist_node	node;
	struct notifier_block	cpu_pm_nb;
	/* the attr_groups array must be NULL-terminated */
	const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1];

	/* Only to be used by ACPI probing code */
	unsigned long acpi_cpuid;
};

上面填充了struct arm_pmu的第一个成员pmu，该成员的类型struct pmu定义为：

include/linux/perf_event.h

/**
 * struct pmu - generic performance monitoring unit
 *
 * Holds identification fields, capability flags, per-CPU state pointers
 * and the table of callbacks a PMU driver fills in.
 */
struct pmu {
	struct list_head		entry;

	struct module			*module;
	struct device			*dev;
	const struct attribute_group	**attr_groups;
	const char			*name;
	int				type;

	/*
	 * various common per-pmu feature flags
	 */
	int				capabilities;

	int * __percpu			pmu_disable_count;
	struct perf_cpu_context * __percpu pmu_cpu_context;
	atomic_t			exclusive_cnt; /* < 0: cpu; > 0: tsk */
	int				task_ctx_nr;
	int				hrtimer_interval_ms;

	/* number of address filters this PMU can do */
	unsigned int			nr_addr_filters;

	/*
	 * Fully disable/enable this PMU, can be used to protect from the PMI
	 * as well as for lazy/batch writing of the MSRs.
	 */
	void (*pmu_enable)		(struct pmu *pmu); /* optional */
	void (*pmu_disable)		(struct pmu *pmu); /* optional */

	/*
	 * Try and initialize the event for this PMU.
	 *
	 * Returns:
	 *  -ENOENT	-- @event is not for this PMU
	 *
	 *  -ENODEV	-- @event is for this PMU but PMU not present
	 *  -EBUSY	-- @event is for this PMU but PMU temporarily unavailable
	 *  -EINVAL	-- @event is for this PMU but @event is not valid
	 *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
	 *  -EACCES	-- @event is for this PMU, @event is valid, but no privileges
	 *
	 *  0		-- @event is for this PMU and valid
	 *
	 * Other error return values are allowed.
	 */
	int (*event_init)		(struct perf_event *event);

	/*
	 * Notification that the event was mapped or unmapped.  Called
	 * in the context of the mapping task.
	 */
	void (*event_mapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */
	void (*event_unmapped)		(struct perf_event *event, struct mm_struct *mm); /* optional */

	/*
	 * Flags for ->add()/->del()/ ->start()/->stop(). There are
	 * matching hw_perf_event::state flags.
	 */
#define PERF_EF_START	0x01		/* start the counter when adding    */
#define PERF_EF_RELOAD	0x02		/* reload the counter when starting */
#define PERF_EF_UPDATE	0x04		/* update the counter when stopping */

	/*
	 * Adds/Removes a counter to/from the PMU, can be done inside a
	 * transaction, see the ->*_txn() methods.
	 *
	 * The add/del callbacks will reserve all hardware resources required
	 * to service the event, this includes any counter constraint
	 * scheduling etc.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on.
	 *
	 * ->add() called without PERF_EF_START should result in the same state
	 *  as ->add() followed by ->stop().
	 *
	 * ->del() must always PERF_EF_UPDATE stop an event. If it calls
	 *  ->stop() that must deal with already being stopped without
	 *  PERF_EF_UPDATE.
	 */
	int  (*add)			(struct perf_event *event, int flags);
	void (*del)			(struct perf_event *event, int flags);

	/*
	 * Starts/Stops a counter present on the PMU.
	 *
	 * The PMI handler should stop the counter when perf_event_overflow()
	 * returns !0. ->start() will be used to continue.
	 *
	 * Also used to change the sample period.
	 *
	 * Called with IRQs disabled and the PMU disabled on the CPU the event
	 * is on -- will be called from NMI context when the PMU generates
	 * NMIs.
	 *
	 * ->stop() with PERF_EF_UPDATE will read the counter and update
	 *  period/count values like ->read() would.
	 *
	 * ->start() with PERF_EF_RELOAD will reprogram the counter
	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
	 */
	void (*start)			(struct perf_event *event, int flags);
	void (*stop)			(struct perf_event *event, int flags);

	/*
	 * Updates the counter value of the event.
	 *
	 * For sampling capable PMUs this will also update the software period
	 * hw_perf_event::period_left field.
	 */
	void (*read)			(struct perf_event *event);

	/*
	 * Group events scheduling is treated as a transaction, add
	 * group events as a whole and perform one schedulability test.
	 * If the test fails, roll back the whole group
	 *
	 * Start the transaction, after this ->add() doesn't need to
	 * do schedulability tests.
	 *
	 * Optional.
	 */
	void (*start_txn)		(struct pmu *pmu, unsigned int txn_flags);
	/*
	 * If ->start_txn() disabled the ->add() schedulability test
	 * then ->commit_txn() is required to perform one. On success
	 * the transaction is closed. On error the transaction is kept
	 * open until ->cancel_txn() is called.
	 *
	 * Optional.
	 */
	int  (*commit_txn)		(struct pmu *pmu);
	/*
	 * Will cancel the transaction, assumes ->del() is called
	 * for each successful ->add() during the transaction.
	 *
	 * Optional.
	 */
	void (*cancel_txn)		(struct pmu *pmu);

	/*
	 * Will return the value for perf_event_mmap_page::index for this event,
	 * if no implementation is provided it will default to: event->hw.idx + 1.
	 */
	int (*event_idx)		(struct perf_event *event); /*optional */

	/*
	 * context-switches callback
	 */
	void (*sched_task)		(struct perf_event_context *ctx,
					bool sched_in);
	/*
	 * PMU specific data size
	 */
	size_t				task_ctx_size;


	/*
	 * Set up pmu-private data structures for an AUX area
	 */
	void *(*setup_aux)		(int cpu, void **pages,
					 int nr_pages, bool overwrite);
					/* optional */

	/*
	 * Free pmu-private AUX data structures
	 */
	void (*free_aux)		(void *aux); /* optional */

	/*
	 * Validate address range filters: make sure the HW supports the
	 * requested configuration and number of filters; return 0 if the
	 * supplied filters are valid, -errno otherwise.
	 *
	 * Runs in the context of the ioctl()ing process and is not serialized
	 * with the rest of the PMU callbacks.
	 */
	int (*addr_filters_validate)	(struct list_head *filters);
					/* optional */

	/*
	 * Synchronize address range filter configuration:
	 * translate hw-agnostic filters into hardware configuration in
	 * event::hw::addr_filters.
	 *
	 * Runs as a part of filter sync sequence that is done in ->start()
	 * callback by calling perf_event_addr_filters_sync().
	 *
	 * May (and should) traverse event::addr_filters::list, for which its
	 * caller provides necessary serialization.
	 */
	void (*addr_filters_sync)	(struct perf_event *event);
					/* optional */

	/*
	 * Filter events for PMU-specific reasons.
	 */
	int (*filter_match)		(struct perf_event *event); /* optional */
};

详解 ARM PMU (Performance Monitoring Unit)

相关阅读

相关文章

相关问答

相关文档