Skip to content

Commit

Permalink
fs/epoll: make nesting accounting safe for -rt kernel
Browse files Browse the repository at this point in the history
Davidlohr Bueso pointed out that when CONFIG_DEBUG_LOCK_ALLOC is set
ep_poll_safewake() can take several non-raw spinlocks after disabling
interrupts.  Since a spinlock can block in the -rt kernel, we can't take a
spinlock after disabling interrupts.  So let's re-work how we determine
the nesting level such that it plays nicely with the -rt kernel.

Let's introduce a 'nests' field in struct eventpoll that records the
current nesting level during ep_poll_callback().  Then, if we nest again
we can find the previous struct eventpoll that we were called from and
increase our count by 1.  The 'nests' field is protected by
ep->poll_wait.lock.

I've also moved the visited field to reduce the size of struct eventpoll
from 184 bytes to 176 bytes on x86_64 for !CONFIG_DEBUG_LOCK_ALLOC, which
is typical for a production config.

Reported-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Jason Baron <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Reviewed-by: Davidlohr Bueso <[email protected]>
Cc: Roman Penyaev <[email protected]>
Cc: Eric Wong <[email protected]>
Cc: Al Viro <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
almostivan authored and torvalds committed Apr 7, 2020
1 parent 282144e commit efcdd35
Showing 1 changed file with 43 additions and 21 deletions.
64 changes: 43 additions & 21 deletions fs/eventpoll.c
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,18 @@ struct eventpoll {
struct file *file;

/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
int visited;

#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
unsigned int napi_id;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* tracks wakeup nests for lockdep validation */
u8 nests;
#endif
};

/* Wait structure used by the poll hooks */
Expand Down Expand Up @@ -545,30 +550,47 @@ static int ep_call_nested(struct nested_calls *ncalls,
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static DEFINE_PER_CPU(int, wakeup_nest);

static void ep_poll_safewake(wait_queue_head_t *wq)
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
struct eventpoll *ep_src;
unsigned long flags;
int subclass;
u8 nests = 0;

local_irq_save(flags);
preempt_disable();
subclass = __this_cpu_read(wakeup_nest);
spin_lock_nested(&wq->lock, subclass + 1);
__this_cpu_inc(wakeup_nest);
wake_up_locked_poll(wq, POLLIN);
__this_cpu_dec(wakeup_nest);
spin_unlock(&wq->lock);
local_irq_restore(flags);
preempt_enable();
/*
* To set the subclass or nesting level for spin_lock_irqsave_nested()
* it might be natural to create a per-cpu nest count. However, since
* we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
* schedule() in the -rt kernel, the per-cpu variables are no longer
* protected. Thus, we are introducing a per eventpoll nest field.
* If we are not being called from ep_poll_callback(), epi is NULL and
* we are at the first level of nesting, 0. Otherwise, we are being
* called from ep_poll_callback() and if a previous wakeup source is
* not an epoll file itself, we are at depth 1 since the wakeup source
* is depth 0. If the wakeup source is a previous epoll file in the
* wakeup chain then we use its nests value and record ours as
* nests + 1. The previous epoll file's nests value is stable since it is
* already holding its own poll_wait.lock.
*/
if (epi) {
if ((is_file_epoll(epi->ffd.file))) {
ep_src = epi->ffd.file->private_data;
nests = ep_src->nests;
} else {
nests = 1;
}
}
spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
ep->nests = nests + 1;
wake_up_locked_poll(&ep->poll_wait, EPOLLIN);
ep->nests = 0;
spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

static void ep_poll_safewake(wait_queue_head_t *wq)
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi)
{
wake_up_poll(wq, EPOLLIN);
wake_up_poll(&ep->poll_wait, EPOLLIN);
}

#endif
Expand Down Expand Up @@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep)

/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
ep_poll_safewake(&ep->poll_wait);
ep_poll_safewake(ep, NULL);

/*
* We need to lock this because we could be hit by
Expand Down Expand Up @@ -1258,7 +1280,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
ep_poll_safewake(ep, epi);

if (!(epi->event.events & EPOLLEXCLUSIVE))
ewake = 1;
Expand Down Expand Up @@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
ep_poll_safewake(ep, NULL);

return 0;

Expand Down Expand Up @@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,

/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
ep_poll_safewake(ep, NULL);

return 0;
}
Expand Down

0 comments on commit efcdd35

Please sign in to comment.