Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe:
 "A later pull request with some followup items. I had some vacation
  coming up to the merge window, so certain items were delayed a
  bit. This pull request also contains fixes that came in within the
  last few days of the merge window, which I didn't want to push right
  before sending you a pull request.

  This contains:

   - NVMe pull request, mostly fixes, but also a few minor items on the
     feature side that were timing constrained (Christoph et al)

   - Report zones fixes (Damien)

   - Removal of dead code (Damien)

   - Turn on cgroup psi memstall (Josef)

   - block cgroup MAINTAINERS entry (Konstantin)

   - Flush init fix (Josef)

   - blk-throttle low iops timing fix (Konstantin)

   - nbd resize fixes (Mike)

   - nbd 0 blocksize crash fix (Xiubo)

   - block integrity error leak fix (Wenwen)

   - blk-cgroup writeback and priority inheritance fixes (Tejun)"

* tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits)
  MAINTAINERS: add entry for block io cgroup
  null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED
  block: Limit zone array allocation size
  sd_zbc: Fix report zones buffer allocation
  block: Kill gfp_t argument of blkdev_report_zones()
  block: Allow mapping of vmalloc-ed buffers
  block/bio-integrity: fix a memory leak bug
  nvme: fix NULL deref for fabrics options
  nbd: add netlink reconfigure resize support
  nbd: fix crash when the blksize is zero
  block: Disable write plugging for zoned block devices
  block: Fix elevator name declaration
  block: Remove unused definitions
  nvme: fix regression upon hot device removal and insertion
  blk-throttle: fix zero wait time for iops throttled group
  block: Fix potential overflow in blk_report_zones()
  blkcg: implement REQ_CGROUP_PUNT
  blkcg, writeback: Implement wbc_blkcg_css()
  blkcg, writeback: Add wbc->no_cgroup_owner
  blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner()
  ...
torvalds committed Jul 16, 2019
2 parents 273cbf6 + 787c79d commit 9637d51
Showing 50 changed files with 660 additions and 210 deletions.
2 changes: 1 addition & 1 deletion Documentation/admin-guide/cgroup-v2.rst
@@ -2124,7 +2124,7 @@ following two functions.
a queue (device) has been associated with the bio and
before submission.

-  wbc_account_io(@wbc, @page, @bytes)
+  wbc_account_cgroup_owner(@wbc, @page, @bytes)
Should be called for each data segment being written out.
While this function doesn't care exactly when it's called
during the writeback session, it's the easiest and most
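A usage sketch of the renamed helper (not part of this diff): a filesystem's
writeback path pairs it with wbc_init_bio(). The function my_fs_write_page()
and the bio plumbing around it are hypothetical.

    /* Sketch: charge each written segment to the cgroup owning the page. */
    static void my_fs_write_page(struct page *page,
                                 struct writeback_control *wbc,
                                 struct bio *bio)
    {
            wbc_init_bio(wbc, bio);        /* associate bio with wbc's blkcg */
            bio_add_page(bio, page, PAGE_SIZE, 0);
            wbc_account_cgroup_owner(wbc, page, PAGE_SIZE);
            submit_bio(bio);
    }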
5 changes: 0 additions & 5 deletions Documentation/block/biodoc.txt
@@ -843,11 +843,6 @@ elevator_latter_req_fn These return the request before or after the

elevator_completed_req_fn called when a request is completed.

-elevator_may_queue_fn  returns true if the scheduler wants to allow the
-                       current context to queue a new request even if
-                       it is over the queue limit. This must be used
-                       very carefully!!

elevator_set_req_fn
elevator_put_req_fn Must be used to allocate and free any elevator
specific storage for a request.
13 changes: 13 additions & 0 deletions MAINTAINERS
@@ -4183,6 +4183,19 @@ S: Maintained
F: mm/memcontrol.c
F: mm/swap_cgroup.c

+CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)
+M: Tejun Heo <tj@kernel.org>
+M: Jens Axboe <axboe@kernel.dk>
+L: cgroups@vger.kernel.org
+L: linux-block@vger.kernel.org
+T: git git://git.kernel.dk/linux-block
+F: Documentation/cgroup-v1/blkio-controller.rst
+F: block/blk-cgroup.c
+F: include/linux/blk-cgroup.h
+F: block/blk-throttle.c
+F: block/blk-iolatency.c
+F: block/bfq-cgroup.c

CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu@intel.com>
L: linux-hwmon@vger.kernel.org
8 changes: 6 additions & 2 deletions block/bio-integrity.c
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);

-        if (ret == 0)
-                return false;
+        if (ret == 0) {
+                printk(KERN_ERR "could not attach integrity payload\n");
+                kfree(buf);
+                status = BLK_STS_RESOURCE;
+                goto err_end_io;
+        }

if (ret < bytes)
break;
28 changes: 27 additions & 1 deletion block/bio.c
@@ -16,6 +16,7 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
+#include <linux/highmem.h>

#include <trace/events/block.h>
#include "blk.h"
@@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
bio_put(bio);
}

+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+        if (bio->bi_private && !op_is_write(bio_op(bio))) {
+                unsigned long i, len = 0;
+
+                for (i = 0; i < bio->bi_vcnt; i++)
+                        len += bio->bi_io_vec[i].bv_len;
+                invalidate_kernel_vmap_range(bio->bi_private, len);
+        }
+#endif
+}

static void bio_map_kern_endio(struct bio *bio)
{
+        bio_invalidate_vmalloc_pages(bio);
bio_put(bio);
}

@@ -1463,13 +1478,20 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
+        bool is_vmalloc = is_vmalloc_addr(data);
+        struct page *page;
int offset, i;
struct bio *bio;

bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);

+        if (is_vmalloc) {
+                flush_kernel_vmap_range(data, len);
+                bio->bi_private = data;
+        }

offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
@@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (bytes > len)
bytes = len;

-                if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+                if (!is_vmalloc)
+                        page = virt_to_page(data);
+                else
+                        page = vmalloc_to_page(data);
+                if (bio_add_pc_page(q, bio, page, bytes,
offset) < bytes) {
/* we don't support partial mappings */
bio_put(bio);
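A hedged usage sketch of the new capability: callers (e.g. the zone-reporting
path fixed elsewhere in this pull) can now hand vmalloc-ed memory to
bio_map_kern(), where previously only virt_to_page()-able (kmalloc-ed)
buffers were safe. The queue q and bufsize here are assumptions.

    /* Sketch: mapping a vmalloc-ed buffer for a kernel-internal bio. */
    void *buf = vmalloc(bufsize);
    struct bio *bio = bio_map_kern(q, buf, bufsize, GFP_KERNEL);

    if (IS_ERR(bio))
            return PTR_ERR(bio);
    /*
     * bio_map_kern() has already flushed the vmap range and saved buf in
     * bi_private, so reads are invalidated in the endio path.
     */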
66 changes: 61 additions & 5 deletions block/blk-cgroup.c
@@ -29,6 +29,7 @@
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include "blk.h"

#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);

static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */

static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;

static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);

+        WARN_ON(!bio_list_empty(&blkg->async_bios));

/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}

+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+                                             async_bio_work);
+        struct bio_list bios = BIO_EMPTY_LIST;
+        struct bio *bio;
+
+        /* as long as there are pending bios, @blkg can't go away */
+        spin_lock_bh(&blkg->async_bio_lock);
+        bio_list_merge(&bios, &blkg->async_bios);
+        bio_list_init(&blkg->async_bios);
+        spin_unlock_bh(&blkg->async_bio_lock);
+
+        while ((bio = bio_list_pop(&bios)))
+                submit_bio(bio);
+}

/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,

blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
+        spin_lock_init(&blkg->async_bio_lock);
+        bio_list_init(&blkg->async_bios);
+        INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;

for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+        struct blkcg_gq *blkg = bio->bi_blkg;
+
+        /* consume the flag first */
+        bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+        /* never bounce for the root cgroup */
+        if (!blkg->parent)
+                return false;
+
+        spin_lock_bh(&blkg->async_bio_lock);
+        bio_list_add(&blkg->async_bios, bio);
+        spin_unlock_bh(&blkg->async_bio_lock);
+
+        queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+        return true;
+}
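A sketch of the submitter side (hedged; the real call sites live in the
writeback patches of this series): the bio owner sets REQ_CGROUP_PUNT and
submits as usual, and submit_bio() reroutes the bio to the owning cgroup's
workqueue instead of issuing it from the current context.

    /* Sketch: punt bio issue to the bio's owning cgroup. */
    bio->bi_opf |= REQ_CGROUP_PUNT;
    submit_bio(bio);    /* blkcg_punt_bio_submit() queues the bio on
                         * blkcg_punt_bio_wq; blkg_async_bio_workfn()
                         * later re-submits it from the owning blkcg. */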

/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
+        unsigned long pflags;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

-        /*
-         * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
-         * that hasn't landed upstream yet. Once that stuff is in place we need
-         * to do a psi_memstall_enter/leave if memdelay is set.
-         */
+        if (use_memdelay)
+                psi_memstall_enter(&pflags);

exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break;
} while (!fatal_signal_pending(current));
io_schedule_finish(tok);

+        if (use_memdelay)
+                psi_memstall_leave(&pflags);
}
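The enter/leave pair above is the stock psi annotation pattern; a minimal
standalone sketch, simplified from the killable wait loop above
(delay_jiffies is assumed):

    /* Sketch: mark a blocking window as a memory stall for psi. */
    unsigned long pflags;

    psi_memstall_enter(&pflags);        /* task now counts as memstalled */
    schedule_timeout_killable(delay_jiffies);
    psi_memstall_leave(&pflags);        /* stall window ends */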

/**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}

+static int __init blkcg_init(void)
+{
+        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+                                            WQ_MEM_RECLAIM | WQ_FREEZABLE |
+                                            WQ_UNBOUND | WQ_SYSFS, 0);
+        if (!blkcg_punt_bio_wq)
+                return -ENOMEM;
+        return 0;
+}
+subsys_initcall(blkcg_init);

module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
6 changes: 5 additions & 1 deletion block/blk-core.c
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+        refcount_set(&rq->ref, 1);
}
EXPORT_SYMBOL(blk_rq_init);

@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct request *rq;
struct list_head *plug_list;

-        plug = current->plug;
+        plug = blk_mq_plug(q, bio);
if (!plug)
return false;

@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
+        if (blkcg_punt_bio_submit(bio))
+                return BLK_QC_T_NONE;

/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
2 changes: 1 addition & 1 deletion block/blk-mq.c
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)

blk_mq_bio_to_request(rq, bio, nr_segs);

-        plug = current->plug;
+        plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* bypass scheduler for flush rq */
blk_insert_flush(rq);
32 changes: 32 additions & 0 deletions block/blk-mq.h
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}

+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio: the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these contexts are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+                                           struct bio *bio)
+{
+        /*
+         * For regular block devices or read operations, use the context plug
+         * which may be NULL if blk_start_plug() was not executed.
+         */
+        if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+                return current->plug;
+
+        /* Zoned block device write operation case: do not plug the BIO */
+        return NULL;
+}

#endif
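
For context, a sketch of the submitter-side plugging pattern that this helper
now bypasses for zoned writes (bio setup omitted; bio1 and bio2 are assumed):

    /* Sketch: batching submissions under a plug on a regular device. */
    struct blk_plug plug;

    blk_start_plug(&plug);
    submit_bio(bio1);           /* may sit in the plug list and be merged */
    submit_bio(bio2);
    blk_finish_plug(&plug);     /* flush whatever is still plugged */

For a write aimed at a zoned queue, blk_mq_plug() returns NULL, so each bio
skips the plug list and reaches the queue in strict submission order.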
9 changes: 3 additions & 6 deletions block/blk-throttle.c
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;

-        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
+        jiffy_elapsed = jiffies - tg->slice_start[rw];

-        /* Slice has just started. Consider one slice interval */
-        if (!jiffy_elapsed)
-                jiffy_elapsed_rnd = tg->td->throtl_slice;
-
-        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+        /* Round up to the next throttle slice, wait time must be nonzero */
+        jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);

/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
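A worked example of why the "+ 1" matters (assuming throtl_slice = 100
jiffies): for a bio arriving exactly one slice after slice_start,
jiffy_elapsed = 100 and the old roundup(100, 100) = 100, so the computed wait
was 100 - 100 = 0 and an iops-throttled group could dispatch past its limit
at every slice boundary. With the fix, roundup(100 + 1, 100) = 200, giving a
nonzero wait of 200 - 100 = 100 jiffies.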
