Skip to content

Commit

Permalink
scsi_transport_srp: Fix a race condition
Browse files Browse the repository at this point in the history
The rport timers must be stopped before the SRP initiator destroys the
resources associated with the SCSI host. This is necessary because
otherwise the callback functions invoked from the SRP transport layer
could trigger a use-after-free. Stopping the rport timers before
invoking scsi_remove_host() can trigger long delays in the SCSI error
handler if a transport layer failure occurs while scsi_remove_host()
is in progress. Hence move the code for stopping the rport timers from
srp_rport_release() into a new function and invoke that function after
scsi_remove_host() has finished. This patch fixes the following
sporadic kernel crash:

     kernel BUG at include/asm-generic/dma-mapping-common.h:64!
     invalid opcode: 0000 [#1] SMP
     RIP: 0010:[<ffffffffa03b20b1>]  [<ffffffffa03b20b1>] srp_unmap_data+0x121/0x130 [ib_srp]
     Call Trace:
     [<ffffffffa03b20fc>] srp_free_req+0x3c/0x80 [ib_srp]
     [<ffffffffa03b2188>] srp_finish_req+0x48/0x70 [ib_srp]
     [<ffffffffa03b21fb>] srp_terminate_io+0x4b/0x60 [ib_srp]
     [<ffffffffa03a6fb5>] __rport_fail_io_fast+0x75/0x80 [scsi_transport_srp]
     [<ffffffffa03a7438>] rport_fast_io_fail_timedout+0x88/0xc0 [scsi_transport_srp]
     [<ffffffff8108b370>] worker_thread+0x170/0x2a0
     [<ffffffff81090876>] kthread+0x96/0xa0
     [<ffffffff8100c0ca>] child_rip+0xa/0x20

Signed-off-by: Bart Van Assche <[email protected]>
Signed-off-by: Roland Dreier <[email protected]>
  • Loading branch information
bvanassche authored and rolandd committed Jan 21, 2014
1 parent 18cc4e0 commit 9307916
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 42 deletions.
1 change: 1 addition & 0 deletions drivers/infiniband/ulp/srp/ib_srp.c
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ static void srp_remove_target(struct srp_target_port *target)
srp_rport_get(target->rport);
srp_remove_host(target->scsi_host);
scsi_remove_host(target->scsi_host);
srp_stop_rport_timers(target->rport);
srp_disconnect_target(target);
ib_destroy_cm_id(target->cm_id);
srp_free_target_ib(target);
Expand Down
83 changes: 43 additions & 40 deletions drivers/scsi/scsi_transport_srp.c
Original file line number Diff line number Diff line change
Expand Up @@ -456,37 +456,29 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport)

lockdep_assert_held(&rport->mutex);

if (!rport->deleted) {
delay = rport->reconnect_delay;
fast_io_fail_tmo = rport->fast_io_fail_tmo;
dev_loss_tmo = rport->dev_loss_tmo;
pr_debug("%s current state: %d\n",
dev_name(&shost->shost_gendev), rport->state);
delay = rport->reconnect_delay;
fast_io_fail_tmo = rport->fast_io_fail_tmo;
dev_loss_tmo = rport->dev_loss_tmo;
pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev),
rport->state);

if (delay > 0)
if (rport->state == SRP_RPORT_LOST)
return;
if (delay > 0)
queue_delayed_work(system_long_wq, &rport->reconnect_work,
1UL * delay * HZ);
if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev),
rport->state);
scsi_target_block(&shost->shost_gendev);
if (fast_io_fail_tmo >= 0)
queue_delayed_work(system_long_wq,
&rport->reconnect_work,
1UL * delay * HZ);
if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) {
pr_debug("%s new state: %d\n",
dev_name(&shost->shost_gendev),
rport->state);
scsi_target_block(&shost->shost_gendev);
if (fast_io_fail_tmo >= 0)
queue_delayed_work(system_long_wq,
&rport->fast_io_fail_work,
1UL * fast_io_fail_tmo * HZ);
if (dev_loss_tmo >= 0)
queue_delayed_work(system_long_wq,
&rport->dev_loss_work,
1UL * dev_loss_tmo * HZ);
}
} else {
pr_debug("%s has already been deleted\n",
dev_name(&shost->shost_gendev));
srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST);
scsi_target_unblock(&shost->shost_gendev,
SDEV_TRANSPORT_OFFLINE);
&rport->fast_io_fail_work,
1UL * fast_io_fail_tmo * HZ);
if (dev_loss_tmo >= 0)
queue_delayed_work(system_long_wq,
&rport->dev_loss_work,
1UL * dev_loss_tmo * HZ);
}
}

Expand Down Expand Up @@ -560,7 +552,7 @@ int srp_reconnect_rport(struct srp_rport *rport)
scsi_target_block(&shost->shost_gendev);
while (scsi_request_fn_active(shost))
msleep(20);
res = i->f->reconnect(rport);
res = rport->state != SRP_RPORT_LOST ? i->f->reconnect(rport) : -ENODEV;
pr_debug("%s (state %d): transport.reconnect() returned %d\n",
dev_name(&shost->shost_gendev), rport->state, res);
if (res == 0) {
Expand Down Expand Up @@ -626,10 +618,6 @@ static void srp_rport_release(struct device *dev)
{
struct srp_rport *rport = dev_to_rport(dev);

cancel_delayed_work_sync(&rport->reconnect_work);
cancel_delayed_work_sync(&rport->fast_io_fail_work);
cancel_delayed_work_sync(&rport->dev_loss_work);

put_device(dev->parent);
kfree(rport);
}
Expand Down Expand Up @@ -784,12 +772,6 @@ void srp_rport_del(struct srp_rport *rport)
device_del(dev);
transport_destroy_device(dev);

mutex_lock(&rport->mutex);
if (rport->state == SRP_RPORT_BLOCKED)
__rport_fail_io_fast(rport);
rport->deleted = true;
mutex_unlock(&rport->mutex);

put_device(dev);
}
EXPORT_SYMBOL_GPL(srp_rport_del);
Expand All @@ -814,6 +796,27 @@ void srp_remove_host(struct Scsi_Host *shost)
}
EXPORT_SYMBOL_GPL(srp_remove_host);

/**
* srp_stop_rport_timers - stop the transport layer recovery timers
*
* Must be called after srp_remove_host() and scsi_remove_host(). The caller
* must hold a reference on the rport (rport->dev) and on the SCSI host
* (rport->dev.parent).
*/
void srp_stop_rport_timers(struct srp_rport *rport)
{
mutex_lock(&rport->mutex);
if (rport->state == SRP_RPORT_BLOCKED)
__rport_fail_io_fast(rport);
srp_rport_set_state(rport, SRP_RPORT_LOST);
mutex_unlock(&rport->mutex);

cancel_delayed_work_sync(&rport->reconnect_work);
cancel_delayed_work_sync(&rport->fast_io_fail_work);
cancel_delayed_work_sync(&rport->dev_loss_work);
}
EXPORT_SYMBOL_GPL(srp_stop_rport_timers);

static int srp_tsk_mgmt_response(struct Scsi_Host *shost, u64 nexus, u64 tm_id,
int result)
{
Expand Down
4 changes: 2 additions & 2 deletions include/scsi/scsi_transport_srp.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ struct srp_rport_identifiers {
* @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer
* is running and I/O has been blocked.
* @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast.
* @SRP_RPORT_LOST: Device loss timer has expired; port is being removed.
* @SRP_RPORT_LOST: Port is being removed.
*/
enum srp_rport_state {
SRP_RPORT_RUNNING,
Expand Down Expand Up @@ -48,7 +48,6 @@ struct srp_rport {

struct mutex mutex;
enum srp_rport_state state;
bool deleted;
int reconnect_delay;
int failed_reconnects;
struct delayed_work reconnect_work;
Expand Down Expand Up @@ -101,6 +100,7 @@ extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo,
extern int srp_reconnect_rport(struct srp_rport *rport);
extern void srp_start_tl_fail_timers(struct srp_rport *rport);
extern void srp_remove_host(struct Scsi_Host *);
extern void srp_stop_rport_timers(struct srp_rport *rport);

/**
* srp_chkready() - evaluate the transport layer state before I/O
Expand Down

0 comments on commit 9307916

Please sign in to comment.