
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Ingo Molnar:
 "Misc race fixes uncovered by fuzzing efforts, a Sparse fix, two PMU
  driver fixes, plus miscellaneous tooling fixes"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86: Reject non sampling events with precise_ip
  perf/x86/intel: Account interrupts for PEBS errors
  perf/core: Fix concurrent sys_perf_event_open() vs. 'move_group' race
  perf/core: Fix sys_perf_event_open() vs. hotplug
  perf/x86/intel: Use ULL constant to prevent undefined shift behaviour
  perf/x86/intel/uncore: Fix hardcoded socket 0 assumption in the Haswell init code
  perf/x86: Set pmu->module in Intel PMU modules
  perf probe: Fix to probe on gcc generated symbols for offline kernel
  perf probe: Fix --funcs to show correct symbols for offline module
  perf symbols: Robustify reading of build-id from sysfs
  perf tools: Install tools/lib/traceevent plugins with install-bin
  tools lib traceevent: Fix prev/next_prio for deadline tasks
  perf record: Fix --switch-output documentation and comment
  perf record: Make __record_options static
  tools lib subcmd: Add OPT_STRING_OPTARG_SET option
  perf probe: Fix to get correct modname from elf header
  samples/bpf trace_output_user: Remove duplicate sys/ioctl.h include
  samples/bpf sock_example: Avoid getting ethhdr from two includes
  perf sched timehist: Show total scheduling time
torvalds committed Jan 15, 2017
2 parents 255e614 + 18e7a45 commit 79078c5
Showing 20 changed files with 257 additions and 92 deletions.
4 changes: 4 additions & 0 deletions arch/x86/events/core.c
@@ -505,6 +505,10 @@ int x86_pmu_hw_config(struct perf_event *event)

if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;

+/* There's no sense in having PEBS for non sampling events: */
+if (!is_sampling_event(event))
+return -EINVAL;
}
/*
* check that PEBS LBR correction does not conflict with
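For context, the check above turns a silently meaningless configuration into a hard error: PEBS (precise_ip > 0) only makes sense for sampling events, and is_sampling_event() is false whenever attr.sample_period is 0. A minimal user-space sketch of the now-rejected case on x86 with PEBS (the expected errno, EINVAL per the hunk above, is the only assumption here):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		struct perf_event_attr attr;
		long fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.precise_ip = 2;	/* request PEBS... */
		/* ...but leave sample_period == 0: a pure counting event */

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0)
			perror("perf_event_open");	/* EINVAL with this fix applied */
		else
			close(fd);
		return 0;
	}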
2 changes: 1 addition & 1 deletion arch/x86/events/intel/core.c
@@ -3987,7 +3987,7 @@ __init int intel_pmu_init(void)
x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
}
-x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;

if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
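This one-character fix matters because the literal 1 has type int: if num_counters reaches 32, `1 << 32` is undefined behaviour in C, while intel_ctrl is a 64-bit mask. A stand-alone illustration (ordinary user-space C, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		int num_counters = 32;

		/* Undefined behaviour: shift count equals the width of int.
		 * uint64_t mask = (1 << num_counters) - 1; */

		/* Well-defined: the shift is performed in 64 bits. */
		uint64_t mask = (1ULL << num_counters) - 1;

		printf("%#llx\n", (unsigned long long)mask);	/* 0xffffffff */
		return 0;
	}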
2 changes: 2 additions & 0 deletions arch/x86/events/intel/cstate.c
@@ -434,6 +434,7 @@ static struct pmu cstate_core_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+.module = THIS_MODULE,
};

static struct pmu cstate_pkg_pmu = {
@@ -447,6 +448,7 @@ static struct pmu cstate_pkg_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+.module = THIS_MODULE,
};

static const struct cstate_model nhm_cstates __initconst = {
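Setting .module (here and in the rapl/uncore hunks below) lets the perf core pin the driver module for the lifetime of an event, so the module cannot be unloaded while one of its PMUs still has live events. Roughly, the core-side pattern looks like this sketch (simplified, not the exact core code):

	/* Sketch: event init takes a reference on the PMU's owning module. */
	static int perf_bind_pmu_sketch(struct pmu *pmu, struct perf_event *event)
	{
		if (!try_module_get(pmu->module))	/* NULL module is a no-op */
			return -ENODEV;			/* module is on its way out */
		event->pmu = pmu;
		return 0;
	}
	/* Event teardown does the matching module_put(pmu->module). */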
6 changes: 5 additions & 1 deletion arch/x86/events/intel/ds.c
@@ -1389,9 +1389,13 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;

/* log dropped samples number */
-if (error[bit])
+if (error[bit]) {
perf_log_lost_samples(event, error[bit]);

+if (perf_event_account_interrupt(event))
+x86_pmu_stop(event, 0);
+}

if (counts[bit]) {
__intel_pmu_pebs_event(event, iregs, base,
top, bit, counts[bit]);
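The driver-side contract for the new helper (declared in the include/linux/perf_event.h hunk below): a PMI that produced only error records still gets charged to the event, and a non-zero return means the event has hit its throttle limit and should be stopped. A sketch of the pattern, restating the hunk above with the intent spelled out:

	/* Even when a PMI yields no usable sample, account the interrupt so a
	 * mis-configured event cannot generate an unthrottled PMI storm. */
	if (error[bit]) {
		perf_log_lost_samples(event, error[bit]);	/* tell user space */

		if (perf_event_account_interrupt(event))	/* over the limit? */
			x86_pmu_stop(event, 0);			/* throttle it */
	}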
1 change: 1 addition & 0 deletions arch/x86/events/intel/rapl.c
@@ -697,6 +697,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.start = rapl_pmu_event_start;
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
+rapl_pmus->pmu.module = THIS_MODULE;
return 0;
}

1 change: 1 addition & 0 deletions arch/x86/events/intel/uncore.c
@@ -733,6 +733,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
.start = uncore_pmu_event_start,
.stop = uncore_pmu_event_stop,
.read = uncore_pmu_event_read,
+.module = THIS_MODULE,
};
} else {
pmu->pmu = *pmu->type->pmu;
2 changes: 1 addition & 1 deletion arch/x86/events/intel/uncore_snbep.c
@@ -2686,7 +2686,7 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {

void hswep_uncore_cpu_init(void)
{
-int pkg = topology_phys_to_logical_pkg(0);
+int pkg = boot_cpu_data.logical_proc_id;

if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
1 change: 1 addition & 0 deletions include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
#else /* !CONFIG_PERF_EVENTS: */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
175 changes: 133 additions & 42 deletions kernel/events/core.c
@@ -2249,35 +2249,34 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
-bool activate = true;
+bool reprogram = true;
int ret = 0;

raw_spin_lock(&cpuctx->ctx.lock);
if (ctx->task) {
raw_spin_lock(&ctx->lock);
task_ctx = ctx;

-/* If we're on the wrong CPU, try again */
-if (task_cpu(ctx->task) != smp_processor_id()) {
-ret = -ESRCH;
-goto unlock;
-}
+reprogram = (ctx->task == current);

/*
-* If we're on the right CPU, see if the task we target is
-* current, if not we don't have to activate the ctx, a future
-* context switch will do that for us.
+* If the task is running, it must be running on this CPU,
+* otherwise we cannot reprogram things.
+*
+* If its not running, we don't care, ctx->lock will
+* serialize against it becoming runnable.
*/
-if (ctx->task != current)
-activate = false;
-else
-WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+if (task_curr(ctx->task) && !reprogram) {
+ret = -ESRCH;
+goto unlock;
+}

+WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}

-if (activate) {
+if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2328,13 +2327,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+*
+* Instead we use task_curr(), which tells us if the task is running.
+* However, since we use task_curr() outside of rq::lock, we can race
+* against the actual state. This means the result can be wrong.
+*
+* If we get a false positive, we retry, this is harmless.
+*
+* If we get a false negative, things are complicated. If we are after
+* perf_event_context_sched_in() ctx::lock will serialize us, and the
+* value must be correct. If we're before, it doesn't matter since
+* perf_event_context_sched_in() will program the counter.
+*
+* However, this hinges on the remote context switch having observed
+* our task->perf_event_ctxp[] store, such that it will in fact take
+* ctx::lock in perf_event_context_sched_in().
+*
+* We do this by task_function_call(), if the IPI fails to hit the task
+* we know any future context switch of task must see the
+* perf_event_ctpx[] store.
*/
-again:

/*
-* Cannot use task_function_call() because we need to run on the task's
-* CPU regardless of whether its current or not.
+* This smp_mb() orders the task->perf_event_ctxp[] store with the
+* task_cpu() load, such that if the IPI then does not find the task
+* running, a future context switch of that task must observe the
+* store.
*/
-if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+smp_mb();
+again:
+if (!task_function_call(task, __perf_install_in_context, event))
return;

raw_spin_lock_irq(&ctx->lock);
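The comment block added above compresses a subtle pairing; laid out side by side, the two participants it orders are (a sketch of the argument, not new code):

	/*
	 *   installer (this function)           remote context switch
	 *   -------------------------           ---------------------
	 *   task->perf_event_ctxp[n] = ctx;
	 *   smp_mb();
	 *   task_function_call(task, ...);      finish switching to 'task'
	 *     - IPI hits the task:              perf_event_context_sched_in()
	 *       __perf_install_in_context()       takes ctx->lock and programs
	 *       runs on the task's CPU            the new event
	 *     - IPI misses the task: the
	 *       smp_mb() guarantees any later
	 *       switch-in of 'task' observes
	 *       the perf_event_ctxp[] store
	 */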
@@ -2348,12 +2370,16 @@ perf_install_in_context(struct perf_event_context *ctx,
raw_spin_unlock_irq(&ctx->lock);
return;
}
-raw_spin_unlock_irq(&ctx->lock);
/*
-* Since !ctx->is_active doesn't mean anything, we must IPI
-* unconditionally.
+* If the task is not running, ctx->lock will avoid it becoming so,
+* thus we can safely install the event.
*/
-goto again;
+if (task_curr(task)) {
+raw_spin_unlock_irq(&ctx->lock);
+goto again;
+}
+add_event_to_ctx(event, ctx);
+raw_spin_unlock_irq(&ctx->lock);
}

/*
@@ -7034,25 +7060,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}

-/*
-* Generic event overflow handling, sampling.
-*/
-
-static int __perf_event_overflow(struct perf_event *event,
-int throttle, struct perf_sample_data *data,
-struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
-int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
-u64 seq;
int ret = 0;

-/*
-* Non-sampling counters might still use the PMI to fold short
-* hardware counters, ignore those.
-*/
-if (unlikely(!is_sampling_event(event)))
-return 0;
+u64 seq;

seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7093,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}

+return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+* Generic event overflow handling, sampling.
+*/
+
+static int __perf_event_overflow(struct perf_event *event,
+int throttle, struct perf_sample_data *data,
+struct pt_regs *regs)
+{
+int events = atomic_read(&event->event_limit);
+int ret = 0;
+
+/*
+* Non-sampling counters might still use the PMI to fold short
+* hardware counters, ignore those.
+*/
+if (unlikely(!is_sampling_event(event)))
+return 0;
+
+ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
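For orientation, the factored-out helper boils down to the following compressed sketch; the real __perf_event_account_interrupt() also unthrottles when the per-CPU perf_throttled_seq advances and calls perf_adjust_period() for frequency-based events, as the hunks above show. max_samples_per_tick and MAX_INTERRUPTS are the kernel's existing names:

	static int account_interrupt_sketch(struct perf_event *event, int throttle)
	{
		struct hw_perf_event *hwc = &event->hw;
		int ret = 0;

		/* ...unthrottle here if a new throttle period has started... */

		if (unlikely(throttle &&
			     hwc->interrupts++ >= max_samples_per_tick)) {
			hwc->interrupts = MAX_INTERRUPTS;	/* mark as throttled */
			ret = 1;				/* caller: stop the event */
		}

		/* ...re-adjust the sampling period for freq events here... */

		return ret;
	}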
@@ -9503,6 +9544,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}

+/*
+* Variation on perf_event_ctx_lock_nested(), except we take two context
+* mutexes.
+*/
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+struct perf_event_context *ctx)
+{
+struct perf_event_context *gctx;
+
+again:
+rcu_read_lock();
+gctx = READ_ONCE(group_leader->ctx);
+if (!atomic_inc_not_zero(&gctx->refcount)) {
+rcu_read_unlock();
+goto again;
+}
+rcu_read_unlock();
+
+mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+if (group_leader->ctx != gctx) {
+mutex_unlock(&ctx->mutex);
+mutex_unlock(&gctx->mutex);
+put_ctx(gctx);
+goto again;
+}
+
+return gctx;
+}
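__perf_event_ctx_lock_double() is an instance of the stabilize-and-recheck idiom: pin a pointer that may change under you (take a reference under RCU), take the sleeping lock, then verify the pointer is still current, retrying otherwise. In generic form (hypothetical struct foo and foo_put(), for illustration only):

	struct foo {
		atomic_t refcount;
		struct mutex mutex;
	};

	static struct foo *lock_stable_foo(struct foo **ptr)
	{
		struct foo *f;
	again:
		rcu_read_lock();
		f = READ_ONCE(*ptr);
		if (!atomic_inc_not_zero(&f->refcount)) {	/* object already dying */
			rcu_read_unlock();
			goto again;
		}
		rcu_read_unlock();

		mutex_lock(&f->mutex);		/* may sleep, so outside RCU */
		if (READ_ONCE(*ptr) != f) {	/* replaced while we slept? */
			mutex_unlock(&f->mutex);
			foo_put(f);		/* hypothetical ref drop */
			goto again;
		}
		return f;	/* pinned and still current; caller must unlock and put */
	}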

/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
@@ -9746,12 +9818,31 @@ SYSCALL_DEFINE5(perf_event_open,
}

if (move_group) {
-gctx = group_leader->ctx;
-mutex_lock_double(&gctx->mutex, &ctx->mutex);
+gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}

+/*
+* Check if we raced against another sys_perf_event_open() call
+* moving the software group underneath us.
+*/
+if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+/*
+* If someone moved the group out from under us, check
+* if this new event wound up on the same ctx, if so
+* its the regular !move_group case, otherwise fail.
+*/
+if (gctx != ctx) {
+err = -EINVAL;
+goto err_locked;
+} else {
+perf_event_ctx_unlock(group_leader, gctx);
+move_group = 0;
+}
+}
} else {
mutex_lock(&ctx->mutex);
}
@@ -9853,7 +9944,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);

if (move_group)
-mutex_unlock(&gctx->mutex);
+perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);

if (task) {
@@ -9879,7 +9970,7 @@ SYSCALL_DEFINE5(perf_event_open,

err_locked:
if (move_group)
-mutex_unlock(&gctx->mutex);
+perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);
2 changes: 1 addition & 1 deletion samples/bpf/sock_example.h
@@ -4,7 +4,7 @@
#include <unistd.h>
#include <string.h>
#include <errno.h>
-#include <net/ethernet.h>
+#include <linux/if_ether.h>
#include <net/if.h>
#include <linux/if_packet.h>
#include <arpa/inet.h>
1 change: 0 additions & 1 deletion samples/bpf/trace_output_user.c
@@ -9,7 +9,6 @@
#include <string.h>
#include <fcntl.h>
#include <poll.h>
-#include <sys/ioctl.h>
#include <linux/perf_event.h>
#include <linux/bpf.h>
#include <errno.h>
3 changes: 3 additions & 0 deletions tools/lib/subcmd/parse-options.c
@@ -213,6 +213,9 @@ static int get_value(struct parse_opt_ctx_t *p,
else
err = get_arg(p, opt, flags, (const char **)opt->value);

+if (opt->set)
+*(bool *)opt->set = true;
+
/* PARSE_OPT_NOEMPTY: Allow NULL but disallow empty string. */
if (opt->flags & PARSE_OPT_NOEMPTY) {
const char *val = *(const char **)opt->value;
5 changes: 5 additions & 0 deletions tools/lib/subcmd/parse-options.h
@@ -137,6 +137,11 @@ struct option {
{ .type = OPTION_STRING, .short_name = (s), .long_name = (l), \
.value = check_vtype(v, const char **), (a), .help = (h), \
.flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d) }
+#define OPT_STRING_OPTARG_SET(s, l, v, os, a, h, d) \
+{ .type = OPTION_STRING, .short_name = (s), .long_name = (l), \
+.value = check_vtype(v, const char **), (a), .help = (h), \
+.flags = PARSE_OPT_OPTARG, .defval = (intptr_t)(d), \
+.set = check_vtype(os, bool *)}
#define OPT_STRING_NOEMPTY(s, l, v, a, h) { .type = OPTION_STRING, .short_name = (s), .long_name = (l), .value = check_vtype(v, const char **), (a), .help = (h), .flags = PARSE_OPT_NOEMPTY}
#define OPT_DATE(s, l, v, h) \
{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
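The new macro records, in a separate bool, whether the option appeared at all; per the get_value() hunk above, the bool is set whenever the option is parsed, which matters because an optional-argument string can legitimately stay at its default value. A hypothetical usage sketch (option name and variables invented for illustration; perf record's --switch-output is the intended consumer per the changelog above):

	#include <stdbool.h>
	#include <subcmd/parse-options.h>

	static const char *output_str = NULL;	/* optional argument value */
	static bool output_set = false;		/* did the user pass the option? */

	static const struct option options[] = {
		OPT_STRING_OPTARG_SET('o', "switch-output", &output_str, &output_set,
				      "signal", "switch output on SIGUSR2", "signal"),
		OPT_END()
	};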
