perf is part of the hardware resource monitoring subsystem; it uses the PMU (Performance Monitoring Unit) to record performance events and hardware counter state.
Usage of Performance Counters for Linux (perf_events) can impose a considerable risk of leaking sensitive data accessed by monitored processes. The leakage is possible both when the perf_events system call API is used directly and through the data files generated by the perf user-mode utility (perf); the risk depends on the nature of the data that the perf_events performance monitoring units (PMU) and perf collect and expose for performance analysis. The collected system and performance data can be split into the following categories:
1. System hardware and software configuration data, for example: the CPU model and its cache configuration, the amount of available memory and its topology, the kernel and perf versions in use, and the performance monitoring setup including experiment time, event configuration, perf command-line parameters, and so on.
2. User and kernel module paths and their load addresses with sizes, process and thread names with their PIDs and TIDs, and timestamps for captured hardware and software events.
3. Contents of kernel software counters (e.g. for context switches, page faults, CPU migrations), of architectural hardware performance counters (PMC) and of machine-specific registers (MSR) that provide execution metrics for various monitored parts of the system (e.g. memory controller (IMC), interconnect (QPI/UPI) or peripheral (PCIe) uncore counters), without direct attribution to any execution context state.
4. Contents of architectural execution context registers (e.g. RIP, RSP, RBP on x86_64), process user- and kernel-space memory addresses and data, and the contents of various architectural MSRs that capture data from this category.
Data belonging to the fourth category can potentially contain sensitive process data. If a PMU in some monitoring mode captures values of execution context registers or data from process memory, then access to such monitoring modes must be ordered and secured properly. perf_events performance monitoring and observability operations are therefore subject to security access control management.
Quoted from https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html. The static-call implementation is analyzed later, in the x86_pmu_static_call_update section: static calls avoid indirect branches and thereby help (as far as possible) to keep guests from tampering with predicted call targets, though only at the function-call level.
To perform security checks, the Linux implementation splits processes into two categories: a) privileged processes (whose effective user ID is 0, referred to as superuser or root) and b) unprivileged processes (whose effective UID is non-zero). Privileged processes bypass all kernel security permission checks, so perf_events performance monitoring is fully available to privileged processes without access, scope and resource restrictions.
Unprivileged processes are subject to a full security permission check based on the process's credentials (usually: effective UID, effective GID and supplementary group list).
Linux divides the privileges traditionally associated with the superuser into distinct units, known as capabilities, which can be independently enabled and disabled on a per-thread basis for the processes and files of unprivileged users.
Unprivileged processes with the CAP_PERFMON capability enabled are treated as privileged with respect to perf_events performance monitoring and observability operations, and thus bypass the scope permission checks in the kernel. CAP_PERFMON implements the principle of least privilege (POSIX 1003.1e: 2.2.2.39) for performance monitoring and observability operations in the kernel and provides a secure approach to performance monitoring and observability on the system.
For backward-compatibility reasons, access to perf_events monitoring and observability operations is also open to CAP_SYS_ADMIN privileged processes, but using CAP_SYS_ADMIN for secure monitoring and observability use cases is discouraged in favor of CAP_PERFMON. If the system audit records for a process using the perf_events system call API contain denial records for acquiring both the CAP_PERFMON and CAP_SYS_ADMIN capabilities, then granting the process the CAP_PERFMON capability alone is recommended as the preferred secure approach to resolve the double access-denial logging related to performance monitoring and observability.
Prior to Linux v5.9, unprivileged processes using the perf_events system call were also subject to the PTRACE_MODE_READ_REALCREDS ptrace access mode check, whose outcome determines whether monitoring is permitted; unprivileged processes granted the CAP_SYS_PTRACE capability were therefore effectively permitted to pass the check. Starting from Linux v5.9, the CAP_SYS_PTRACE capability is no longer required and CAP_PERFMON is sufficient to grant a process access to performance monitoring and observability operations.
Other capabilities granted to unprivileged processes can effectively enable capturing additional data required for later performance analysis of monitored processes or of the system. For example, the CAP_SYSLOG capability permits reading kernel-space memory addresses from the /proc/kallsyms file.
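As a concrete illustration of the API these checks apply to, the following is a minimal userspace sketch (not taken from the kernel sources) that opens a hardware instruction counter for the calling thread through the perf_event_open system call; whether it succeeds for an unprivileged caller depends on the CAP_PERFMON/CAP_SYS_ADMIN rules described above (and on the kernel.perf_event_paranoid setting).

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

/* glibc has no wrapper for perf_event_open, so invoke it via syscall(2). */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.disabled = 1;
        attr.exclude_kernel = 1;      /* counting kernel mode needs more privilege */
        attr.exclude_hv = 1;

        /* monitor the calling thread on any CPU */
        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");   /* e.g. EACCES without CAP_PERFMON */
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        for (volatile int i = 0; i < 1000000; i++)
                ;                             /* some work to count */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("instructions retired: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
}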
init_hw_perf_events is implemented in arch/x86/events/core.c and is invoked at boot via early_initcall(init_hw_perf_events);.
boot_cpu_data is a struct cpuinfo_x86 object that stores the CPU information of the current (x86-family) machine.
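Before walking through the function, a small userspace sketch (illustrative, not kernel code) shows the raw CPUID data from which boot_cpu_data.x86_vendor is derived at boot:

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        /* CPUID leaf 0: EBX, EDX, ECX hold the 12-byte vendor string. */
        if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                return 1;
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        /* e.g. "GenuineIntel" or "AuthenticAMD" */
        printf("CPU vendor string: %s\n", vendor);
        return 0;
}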
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
int err;
pr_info("Performance Events: ");
switch (boot_cpu_data.x86_vendor) { // x86 vendor ID
case X86_VENDOR_INTEL: // Intel
err = intel_pmu_init();
break;
case X86_VENDOR_AMD: // AMD
err = amd_pmu_init();
break;
case X86_VENDOR_HYGON: // Hygon
err = amd_pmu_init();
x86_pmu.name = "HYGON";
break;
case X86_VENDOR_ZHAOXIN: // Zhaoxin
case X86_VENDOR_CENTAUR:
err = zhaoxin_pmu_init();
break;
default:
err = -ENOTSUPP;
}
(intel_pmu_init is analyzed in detail further below.)
/* If we have a PMU initialized but no APIC interrupts, we cannot
   sample hardware events; user space has to fall back and sample
   via hrtimer-based software events. */
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
goto out_bad_pmu;
x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */
for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) // run the accumulated PMU quirks
quirk->func();
perf_events_lapic_init(); // always use NMI for the PMU
/* All PMUs/events that share this PMI handler should make sure to
   increment active_events for their events. */
register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI");
unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
0, x86_pmu.num_counters, 0, 0);
x86_pmu_format_group.attrs = x86_pmu.format_attrs;
if (!x86_pmu.events_sysfs_show)
x86_pmu_events_group.attrs = &empty_attrs;
/* Propagate event elapsed time into the generic event:
   can only be executed on the CPU where the event is active,
   returns the delta of events processed. */
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
if (!x86_pmu.guest_get_msrs)
x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
// Model Specific Register
x86_pmu_static_call_update(); // update the static-call targets
(x86_pmu_static_call_update is analyzed further below.)
err = cpuhp_setup_state(CPUHP_PERF_X86_PREPARE, "perf/x86:prepare",
x86_pmu_prepare_cpu, x86_pmu_dead_cpu);
// Set up hotplug state callbacks; the startup callback runs on the CPU that reaches the state.
// The callbacks (x86_pmu_prepare_cpu, x86_pmu_dead_cpu) are registered with the CPU hotplug
// machinery; each CPU has its own hotplug thread (kernel thread "cpuhp") that detects state
// transitions and invokes the callbacks for the corresponding states.
if (err)
return err;
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_STARTING,
"perf/x86:starting", x86_pmu_starting_cpu,
x86_pmu_dying_cpu);
if (err)
goto out;
err = cpuhp_setup_state(CPUHP_AP_PERF_X86_ONLINE, "perf/x86:online",
x86_pmu_online_cpu, NULL);
if (err)
goto out1;
if (!is_hybrid()) {
err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); // register the PMU; allocates per-CPU perf event contexts, etc.
if (err)
goto out2;
} else { // hybrid CPUs
u8 cpu_type = get_this_hybrid_cpu_type(); // get this CPU's hybrid type
for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
hybrid_pmu = &x86_pmu.hybrid_pmu[i];
...
err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (cpu_type == hybrid_pmu->cpu_type)
/* The perf generic code is not hybrid friendly: the hybrid_pmu->pmu of the
   first registered PMU is unconditionally assigned to every possible
   cpuctx->ctx.pmu, so update the correct hybrid PMU into cpuctx->ctx.pmu. */
x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}
if (i < x86_pmu.num_hybrid_pmus) {
for (j = 0; j < i; j++)
perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
pr_warn("Failed to register hybrid PMUs\n");
kfree(x86_pmu.hybrid_pmu);
x86_pmu.hybrid_pmu = NULL;
x86_pmu.num_hybrid_pmus = 0;
goto out2;
}
}
return 0;
...
}
core_pmu
static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = core_pmu_enable_all,
.enable = core_pmu_enable_event,
.disable = x86_pmu_disable_event,
.hw_config = core_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
.large_pebs_flags = LARGE_PEBS_FLAGS,
/*
* Intel PMCs cannot be accessed sanely above 32-bit width,
* so we install an artificial 1<<31 period regardless of
* the generic event period:
*/
.max_period = (1ULL<<31) - 1,
.put_event_constraints = intel_put_event_constraints,
.event_constraints = intel_core_event_constraints,
.guest_get_msrs = core_guest_get_msrs,
.format_attrs = intel_arch_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
/*
* Virtual (or funny metal) CPU can define x86_pmu.extra_regs
* together with PMU version 1 and thus be using core_pmu with
* shared_regs. We need following callbacks here to allocate
* it properly.
*/
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.cpu_dead = intel_pmu_cpu_dead,
.check_period = intel_pmu_check_period,
.lbr_reset = intel_pmu_lbr_reset_64,
.lbr_read = intel_pmu_lbr_read_64,
.lbr_save = intel_pmu_lbr_save,
.lbr_restore = intel_pmu_lbr_restore,
};
intel_pmu
static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
.disable_all = intel_pmu_disable_all,
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
.add = intel_pmu_add_event,
.del = intel_pmu_del_event,
.read = intel_pmu_read_event,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
.large_pebs_flags = LARGE_PEBS_FLAGS,
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
* the generic event period:
*/
.max_period = (1ULL << 31) - 1,
.get_event_constraints = intel_get_event_constraints,
.put_event_constraints = intel_put_event_constraints,
.pebs_aliases = intel_pebs_aliases_core2,
.format_attrs = intel_arch3_formats_attr,
.events_sysfs_show = intel_event_sysfs_show,
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.cpu_dead = intel_pmu_cpu_dead,
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
.swap_task_ctx = intel_pmu_swap_task_ctx,
.check_period = intel_pmu_check_period,
.aux_output_match = intel_pmu_aux_output_match,
.lbr_reset = intel_pmu_lbr_reset_64,
.lbr_read = intel_pmu_lbr_read_64,
.lbr_save = intel_pmu_lbr_save,
.lbr_restore = intel_pmu_lbr_restore,
/*
* SMM has access to all 4 rings and while traditionally SMM code only
* ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM.
*
* Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction
* between SMM or not, this results in what should be pure userspace
* counters including SMM data.
*
* This is a clear privilege issue, therefore globally disable
* counting SMM by default.
*/
.attr_freeze_on_smi = 1,
};
cpu_hw_events
struct x86_pmu x86_pmu __read_mostly;
static struct pmu pmu;
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
.pmu = &pmu,
};
cpuinfo_x86 — CPU type and hardware bug flags, kept separately for each CPU
struct cpuinfo_x86 {
__u8 x86; /* CPU family */
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
__u8 x86_stepping;
#ifdef CONFIG_X86_64
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
#ifdef CONFIG_X86_VMX_FEATURE_NAMES
__u32 vmx_capability[NVMXINTS];
#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
/* CPUID returned core id bits: */
__u8 x86_coreid_bits;
__u8 cu_id;
/* Max extended CPUID function supported: */
__u32 extended_cpuid_level;
/* Maximum supported CPUID level, -1=no CPUID: */
int cpuid_level;
/*
* Align to size of unsigned long because the x86_capability array
* is passed to bitops which require the alignment. Use unnamed
* union to enforce the array is aligned to size of unsigned long.
*/
union {
__u32 x86_capability[NCAPINTS + NBUGINTS];
unsigned long x86_capability_alignment;
};
char x86_vendor_id[16];
char x86_model_id[64];
/* in KB - valid for CPUS which support this call: */
unsigned int x86_cache_size;
int x86_cache_alignment; /* In bytes */
/* Cache QoS architectural values, valid only on the BSP: */
int x86_cache_max_rmid; /* max index */
int x86_cache_occ_scale; /* scale to bytes */
int x86_cache_mbm_width_offset;
int x86_power;
unsigned long loops_per_jiffy;
/* protected processor identification number */
u64 ppin;
/* cpuid returned max cores value: */
u16 x86_max_cores;
u16 apicid;
u16 initial_apicid;
u16 x86_clflush_size;
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
u16 phys_proc_id;
/* Logical processor id: */
u16 logical_proc_id;
/* Core id: */
u16 cpu_core_id;
u16 cpu_die_id;
u16 logical_die_id;
/* Index into per_cpu list: */
u16 cpu_index;
/* Is SMT active on this core? */
bool smt_active;
u32 microcode;
/* Address space bits used by the cache internally */
u8 x86_cache_bits;
unsigned initialized : 1;
} __randomize_layout;
attribute — an attribute of a kobject (kernel object; see the kernel device model)
struct attribute {
const char *name;
umode_t mode;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
bool ignore_lockdep:1;
struct lock_class_key *key;
struct lock_class_key skey;
#endif
};
cpuid10_eax — Intel "Architectural Performance Monitoring" CPUID detection/enumeration details
union cpuid10_eax {
struct {
unsigned int version_id:8;
unsigned int num_counters:8;
unsigned int bit_width:8;
unsigned int mask_length:8;
} split;
unsigned int full;
};
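The same bit layout can be used from user space to preview the values intel_pmu_init will read below; a hedged sketch using GCC/Clang's __get_cpuid_count helper, meaningful only on CPUs that implement the architectural perfmon leaf:

#include <cpuid.h>
#include <stdio.h>

/* Mirrors the kernel's union cpuid10_eax layout for CPUID leaf 0xA. */
union cpuid10_eax {
        struct {
                unsigned int version_id:8;
                unsigned int num_counters:8;
                unsigned int bit_width:8;
                unsigned int mask_length:8;
        } split;
        unsigned int full;
};

int main(void)
{
        unsigned int ebx, ecx, edx;
        union cpuid10_eax eax;

        if (!__get_cpuid_count(0x0a, 0, &eax.full, &ebx, &ecx, &edx)) {
                fprintf(stderr, "CPUID leaf 0xA not supported\n");
                return 1;
        }
        printf("perfmon version: %u\n", eax.split.version_id);
        printf("general-purpose counters per logical CPU: %u\n", eax.split.num_counters);
        printf("counter bit width: %u\n", eax.split.bit_width);
        return 0;
}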
cpu_hw_events — per-CPU hardware event state
struct cpu_hw_events {
/*
* Generic x86 PMC bits
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
int n_added; /* the # last events in the below arrays;
they've never been enabled yet */
int n_txn; /* the # last events in the below arrays;
added in the current transaction */
int n_txn_pair;
int n_txn_metric;
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
struct event_constraint *event_constraint[X86_PMC_IDX_MAX];
int n_excl; /* the number of exclusive events */
unsigned int txn_flags;
int is_fake;
/*
* Intel DebugStore bits
*/
struct debug_store *ds;
void *ds_pebs_vaddr;
void *ds_bts_vaddr;
u64 pebs_enabled;
int n_pebs;
int n_large_pebs;
int n_pebs_via_pt;
int pebs_output;
/* Current super set of events hardware configuration */
u64 pebs_data_cfg;
u64 active_pebs_data_cfg;
int pebs_record_size;
/*
* Intel LBR bits
*/
int lbr_users;
int lbr_pebs_users;
struct perf_branch_stack lbr_stack;
struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
union {
struct er_account *lbr_sel;
struct er_account *lbr_ctl;
};
u64 br_sel;
void *last_task_ctx;
int last_log_id;
int lbr_select;
void *lbr_xsave;
/*
* Intel host/guest exclude bits
*/
u64 intel_ctrl_guest_mask;
u64 intel_ctrl_host_mask;
struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
/*
* Intel checkpoint mask
*/
u64 intel_cp_status;
/*
* manage shared (per-core, per-cpu) registers
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
/*
* manage exclusive counter access between hyperthread
*/
struct event_constraint *constraint_list; /* in enable order */
struct intel_excl_cntrs *excl_cntrs;
int excl_thread_id; /* 0 or 1 */
/*
* SKL TSX_FORCE_ABORT shadow
*/
u64 tfa_shadow;
/*
* Perf Metrics
*/
/* number of accepted metrics events */
int n_metric;
/*
* AMD specific bits
*/
struct amd_nb *amd_nb;
int brs_active; /* BRS is enabled */
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
int n_pair; /* Large increment events */
void *kfree_on_online[X86_PERF_KFREE_MAX];
struct pmu *pmu;
};
Intel Architectural LBR CPUID detection/enumeration details
union cpuid28_eax {
struct {
/* Supported LBR depth values */
unsigned int lbr_depth_mask:8;
unsigned int reserved:22;
/* Deep C-state Reset */
unsigned int lbr_deep_c_reset:1;
/* IP values contain LIP */
unsigned int lbr_lip:1;
} split;
unsigned int full;
};
union cpuid28_ebx {
struct {
/* CPL Filtering Supported */
unsigned int lbr_cpl:1;
/* Branch Filtering Supported */
unsigned int lbr_filter:1;
/* Call-stack Mode Supported */
unsigned int lbr_call_stack:1;
} split;
unsigned int full;
};
union cpuid28_ecx {
struct {
/* Mispredict Bit Supported */
unsigned int lbr_mispred:1;
/* Timed LBRs Supported */
unsigned int lbr_timed_lbr:1;
/* Branch Type Field Supported */
unsigned int lbr_br_type:1;
} split;
unsigned int full;
};
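intel_pmu_arch_lbr_init (shown later) turns lbr_depth_mask into the LBR depth: bit n being set means a depth of 8*(n+1) entries is supported, so the maximum depth is fls(mask) * 8. A small standalone sketch of that calculation, with fls emulated via __builtin_clz (an assumption outside the kernel):

#include <stdio.h>

/* Userspace stand-in for the kernel's fls(): position of the most
 * significant set bit, 1-based, 0 for an all-zero word. */
static int fls_emul(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
        unsigned int lbr_depth_mask = 0x4;   /* example: bit 2 set */
        unsigned int lbr_nr = fls_emul(lbr_depth_mask) * 8;

        /* bit 2 set -> fls == 3 -> maximum supported LBR depth of 24 entries */
        printf("max LBR depth: %u\n", lbr_nr);
        return 0;
}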
x86_perf_task_context_arch_lbr_xsave — supports LBR context switching via XSAVES/XRSTORS
struct x86_perf_task_context_arch_lbr_xsave {
struct x86_perf_task_context_opt opt;
union {
struct xregs_state xsave;
struct {
struct fxregs_state i387;
struct xstate_header header;
struct arch_lbr_state lbr;
} __attribute__ ((packed, aligned (XSAVE_ALIGNMENT)));
};
};
apic — the generic APIC (Advanced Programmable Interrupt Controller) sub-architecture data structure
struct apic {
/* Hotpath functions first */
void (*eoi_write)(u32 reg, u32 v);
void (*native_eoi_write)(u32 reg, u32 v);
void (*write)(u32 reg, u32 v);
u32 (*read)(u32 reg);
/* IPI related functions */
void (*wait_icr_idle)(void);
u32 (*safe_wait_icr_idle)(void);
void (*send_IPI)(int cpu, int vector);
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
void (*send_IPI_allbutself)(int vector);
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
u32 disable_esr;
enum apic_delivery_modes delivery_mode;
bool dest_mode_logical;
u32 (*calc_dest_apicid)(unsigned int cpu);
/* ICR related functions */
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
bool (*check_apicid_used)(physid_mask_t *map, int apicid);
void (*init_apic_ldr)(void);
void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
void (*setup_apic_routing)(void);
int (*cpu_present_to_apicid)(int mps_cpu);
void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
int (*check_phys_apicid_present)(int phys_apicid);
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
u32 (*get_apic_id)(unsigned long x);
u32 (*set_apic_id)(unsigned int id);
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
/* wakeup secondary CPU using 64-bit wakeup point */
int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip);
void (*inquire_remote_apic)(int apicid);
#ifdef CONFIG_X86_32
/*
* Called very early during boot from get_smp_config(). It should
* return the logical apicid. x86_[bios]_cpu_to_apicid is
* initialized before this function is called.
*
* If logical apicid can't be determined that early, the function
* may return BAD_APICID. Logical apicid will be configured after
* init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
* won't be applied properly during early boot in this case.
*/
int (*x86_32_early_logical_apicid)(int cpu);
#endif
char *name;
};
The example machine uses an Intel CPU, so this is the init path analyzed here:
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
struct attribute **extra_attr = &empty_attrs;
struct attribute **td_attr = &empty_attrs;
struct attribute **mem_attr = &empty_attrs;
struct attribute **tsx_attr = &empty_attrs;
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
unsigned int fixed_mask;
bool pmem = false;
int version, i;
char *name;
struct x86_hybrid_pmu *pmu;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { // no architectural perfmon support
// #define X86_FEATURE_ARCH_PERFMON (3*32+11): Intel Architectural PerfMon
switch (boot_cpu_data.x86) { // older processor families
case 0x6:
return p6_pmu_init(); // Pentium II / III / M
case 0xb:
return knc_pmu_init(); // Knights Corner (KNC)
case 0xf:
return p4_pmu_init(); // Pentium 4 (NetBurst)
}
return -ENODEV;
}
/* check whether the performance monitor supports the hw_event */
/* The generic CPUID helpers clear %ecx, because some CPUs (Cyrix MII)
   neither set nor clear %ecx, which would leave stale register contents. */
cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); // CPU Identification
// Architectural Performance Monitoring leaf (EAX = 0AH, ECX = 0)
if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
// ARCH_PERFMON_EVENTS_COUNT 7
return -ENODEV;
version = eax.split.version_id;
if (version < 2)
x86_pmu = core_pmu;
else
x86_pmu = intel_pmu;
x86_pmu.version = version;
x86_pmu.num_counters = eax.split.num_counters;
x86_pmu.cntval_bits = eax.split.bit_width;
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
x86_pmu.events_maskl = ebx.full;
x86_pmu.events_mask_len = eax.split.mask_length;
x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
x86_pmu.pebs_capable = PEBS_COUNTER_MASK;
/* The v2 perfmon does not report fixed-purpose events, so assume
   at least 3 fixed events when not running under a hypervisor: */
if (version > 1 && version < 5) {
int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR);
// #define X86_FEATURE_HYPERVISOR (4*32+31): running under a hypervisor
x86_pmu.num_counters_fixed =
max((int)edx.split.num_counters_fixed, assume);
fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1;
} else if (version >= 5)
x86_pmu.num_counters_fixed = fls(fixed_mask);
if (boot_cpu_has(X86_FEATURE_PDCM)) { // Perf/Debug Capabilities MSR present
u64 capabilities;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); // MSR_IA32_PERF_CAPABILITIES is MSR 0x345
x86_pmu.intel_cap.capabilities = capabilities;
}
...
if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) // Architectural Last Branch Records (LBR)
// #define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
intel_pmu_arch_lbr_init();
(intel_pmu_arch_lbr_init is analyzed further below.)
intel_ds_init(); // BTS and PEBS probe and setup
x86_add_quirk(intel_arch_events_quirk); // register the arch-events quirk (installed first, so it runs last)
if (version >= 5) {
x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated;
if (x86_pmu.intel_cap.anythread_deprecated)
pr_cont(" AnyThread deprecated, ");
}
/* Install the hw-cache-events table: */
switch (boot_cpu_data.x86_model) {
case INTEL_FAM6_CORE_YONAH:
pr_cont("Core events, ");
name = "core";
break;
case INTEL_FAM6_CORE2_MEROM:
x86_add_quirk(intel_clovertown_quirk);
fallthrough;
case INTEL_FAM6_CORE2_MEROM_L:
case INTEL_FAM6_CORE2_PENRYN:
case INTEL_FAM6_CORE2_DUNNINGTON:
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
...
}
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
if (!is_hybrid()) {
group_events_td.attrs = td_attr;
group_events_mem.attrs = mem_attr;
group_events_tsx.attrs = tsx_attr;
group_format_extra.attrs = extra_attr;
group_format_extra_skl.attrs = extra_skl_attr;
x86_pmu.attr_update = attr_update;
} else {
hybrid_group_events_td.attrs = td_attr;
hybrid_group_events_mem.attrs = mem_attr;
hybrid_group_events_tsx.attrs = tsx_attr;
hybrid_group_format_extra.attrs = extra_attr;
x86_pmu.attr_update = hybrid_attr_update;
}
/* Accessing the LBR MSRs may cause a #GP under certain circumstances;
   check all LBR MSRs here and disable LBR access if any of them
   cannot be accessed. */
if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL))
x86_pmu.lbr_nr = 0;
for (i = 0; i < x86_pmu.lbr_nr; i++) {
if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
x86_pmu.lbr_nr = 0;
}
/* Accessing the extra MSRs may cause a #GP under certain circumstances,
   e.g. KVM does not support offcore events;
   check all extra_regs here. */
intel_pmu_check_extra_regs(x86_pmu.extra_regs);
/* Support full-width counters using the alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask >> 1;
x86_pmu.perfctr = MSR_IA32_PMC0;
pr_cont("full-width counters, ");
}
...
intel_aux_output_init(); // -> perf_report_aux_output_id(event, idx);
return 0;
}
Intel CPUs have long supported Last Branch Records (LBR) as a way of recording the branches taken by software and exposing other control-flow information. This has relied on model-specific registers; on newer Intel CPUs it is being folded into a more general, architectural CPU feature.
The Linux kernel has been using the existing MSR-based (model-specific register) LBR capability, where subsystems such as perf can record the branches taken and other exposed control-flow information for profiling and analysis. Architectural LBR brings faster context switches and faster LBR reads thanks to XSAVES support and the new MSRs, better support for LBR features without requiring model-specific knowledge of the CPU, the ability to expose LBRs to guests that lack the model-specific feature, and lower overhead.
void __init intel_pmu_arch_lbr_init(void)
{
struct pmu *pmu = x86_get_pmu(smp_processor_id()); // fetched from per-CPU data
union cpuid28_eax eax;
union cpuid28_ebx ebx;
union cpuid28_ecx ecx;
unsigned int unused_edx;
bool arch_lbr_xsave;
size_t size;
u64 lbr_nr;
/* Arch LBR Capabilities */
cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx);
// Last Branch Records Information Leaf (EAX = 1CH, ECX = 0); 28 decimal == 0x1C
lbr_nr = fls(eax.split.lbr_depth_mask) * 8; // maximum supported LBR depth
if (!lbr_nr)
goto clear_arch_lbr;
if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) // write the maximum depth as the number of LBR entries into the new MSR_ARCH_LBR_DEPTH
goto clear_arch_lbr;
...
arch_lbr_xsave = is_arch_lbr_xsave_available();
if (arch_lbr_xsave) {
size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) +
get_lbr_state_size();
pmu->task_ctx_cache = create_lbr_kmem_cache(size,
XSAVE_ALIGNMENT);
}
if (!pmu->task_ctx_cache) {
arch_lbr_xsave = false;
size = sizeof(struct x86_perf_task_context_arch_lbr) +
lbr_nr * sizeof(struct lbr_entry);
pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0);
}
x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0;
/* LBR callstack requires both CPL and branch filtering support */
if (!x86_pmu.lbr_cpl ||
!x86_pmu.lbr_filter ||
!x86_pmu.lbr_call_stack)
arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP;
#define LBR_NOT_SUPP -1 /* LBR filter not supported */
...
pr_cont("Architectural LBR, ");
return;
clear_arch_lbr:
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR);
}
The functions in the x86_pmu structure are invoked through static calls. A static call is a way of calling what is conceptually a global function pointer while avoiding indirect branches, whose predicted targets could otherwise be tampered with.
static void x86_pmu_static_call_update(void)
{
static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
static_call_update(x86_pmu_enable, x86_pmu.enable);
static_call_update(x86_pmu_disable, x86_pmu.disable);
...
}
Taking x86_pmu_handle_irq as an example:
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
||
\/
#define DEFINE_STATIC_CALL_NULL(name, _func) \
DECLARE_STATIC_CALL(name, _func); \
// extern struct static_call_key __SCK__##name
// extern typeof(func) __SCT__##name
struct static_call_key STATIC_CALL_KEY(name) = { \
// struct static_call_key __SCK__##name
.func = NULL, \
.type = 1, \
}; \
ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)
||
\/
// #ifdef CONFIG_RETHUNK /* built with -mfunction-return=thunk-extern */
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "jmp __x86_return_thunk")
#else
#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; int3; nop; nop; nop")
#endif
DEFINE_STATIC_CALL_NULL defines the static-call trampoline for the target function (placed in the .static_call.text section); static_call_update is then used to point the call site at the real function.
static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
||
\/
#define static_call_update(name, func) \
({ \
typeof(&STATIC_CALL_TRAMP(name)) __F = (func); \
// typeof(&STATIC_CALL_TRAMP(name)) __SCT__##name
__static_call_update(&STATIC_CALL_KEY(name), \
STATIC_CALL_TRAMP_ADDR(name), __F); \
})
__static_call_update receives the key defined by DEFINE_STATIC_CALL_NULL and the address of the trampoline; it records the newly supplied function and, depending on the configuration options, patches the trampoline/call-site machine code accordingly.
static_call(x86_pmu_handle_irq)(regs);
static_call resolves the call through the key and invokes the target function directly, avoiding an indirect call that would go through the indirect branch predictor.
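On x86 the static-call machinery patches the trampoline in .static_call.text into a direct jump to the target, so no indirect branch is executed; on configurations without arch support the kernel falls back to an ordinary function pointer. The following standalone userspace sketch (names are illustrative, not the kernel macros) only models that fallback flow, i.e. a key holding the current target plus an update helper, to make the update/call sequence above visible:

#include <stdio.h>

/* Illustrative stand-ins for the kernel's static_call machinery. The real
 * x86 implementation rewrites a trampoline into a direct "jmp target";
 * this sketch only models the generic function-pointer fallback. */
struct demo_static_call_key {
        void (*func)(int);
};

static void default_handler(int irq)  { printf("default handler, irq=%d\n", irq); }
static void pmu_handler(int irq)      { printf("pmu handler, irq=%d\n", irq); }

static struct demo_static_call_key demo_handle_irq = { .func = default_handler };

static void demo_static_call_update(struct demo_static_call_key *key, void (*fn)(int))
{
        key->func = fn;   /* the kernel additionally rewrites the trampoline/call sites */
}

#define demo_static_call(key)  ((key).func)

int main(void)
{
        demo_static_call(demo_handle_irq)(1);                   /* default target */
        demo_static_call_update(&demo_handle_irq, pmu_handler); /* like x86_pmu_static_call_update() */
        demo_static_call(demo_handle_irq)(2);                   /* now the PMU handler */
        return 0;
}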