Skip to content

Commit

Permalink
dax,ext2: replace XIP read and write with DAX I/O
Browse files Browse the repository at this point in the history
Use the generic AIO infrastructure instead of custom read and write
methods.  In addition to giving us support for AIO, this adds the missing
locking between read() and truncate().

Signed-off-by: Matthew Wilcox <[email protected]>
Reviewed-by: Ross Zwisler <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Cc: Andreas Dilger <[email protected]>
Cc: Boaz Harrosh <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Mathieu Desnoyers <[email protected]>
Cc: Randy Dunlap <[email protected]>
Cc: Theodore Ts'o <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
Matthew Wilcox authored and torvalds committed Feb 17, 2015
1 parent fbbbad4 commit d475c63
Show file tree
Hide file tree
Showing 8 changed files with 214 additions and 245 deletions.
6 changes: 6 additions & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -3151,6 +3151,12 @@ L: [email protected]
S: Maintained
F: drivers/i2c/busses/i2c-diolan-u2c.c

DIRECT ACCESS (DAX)
M: Matthew Wilcox <[email protected]>
L: [email protected]
S: Supported
F: fs/dax.c

DIRECTORY NOTIFICATION (DNOTIFY)
M: Eric Paris <[email protected]>
S: Maintained
Expand Down
1 change: 1 addition & 0 deletions fs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
obj-$(CONFIG_FS_XIP) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
Expand Down
186 changes: 186 additions & 0 deletions fs/dax.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
/*
* fs/dax.c - Direct Access filesystem code
* Copyright (c) 2013-2014 Intel Corporation
* Author: Matthew Wilcox <[email protected]>
* Author: Ross Zwisler <[email protected]>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/mutex.h>
#include <linux/uio.h>

/*
 * Ask the block device for a directly addressable pointer to the storage
 * described by @bh and store it through @addr.
 *
 * @bh->b_blocknr is counted in filesystem blocks of (1 << @blkbits) bytes;
 * it is rescaled to 512-byte sectors before the lookup.  The page frame
 * number that bdev_direct_access() also reports is not used.
 *
 * The result of bdev_direct_access() is returned unchanged.  dax_io() treats
 * a negative value as an error and any other value as the number of bytes
 * usable at *@addr.
 */
static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
{
	unsigned long unused_pfn;
	unsigned sector_shift = blkbits - 9;
	sector_t first_sector = bh->b_blocknr << sector_shift;

	return bdev_direct_access(bh->b_bdev, first_sector, addr, &unused_pfn,
				  bh->b_size);
}

/*
 * Zero the parts of a buffer that the pending I/O will not overwrite.
 *
 * @addr:  start of the buffer
 * @size:  length of the buffer in bytes
 * @first: offset inside the buffer at which the I/O begins
 * @pos:   file offset at which the I/O begins
 * @end:   file offset one past the last byte of the I/O
 *
 * Bytes [0, @first) ahead of the I/O and bytes from the end of the I/O up to
 * @size are cleared; the range the I/O itself covers is left untouched.
 */
static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
			loff_t end)
{
	/* Buffer offset one past the last byte the I/O will touch. */
	loff_t tail = first + (end - pos);

	if (first != 0)
		memset(addr, 0, first);
	if (size > tail)
		memset(addr + tail, 0, size - tail);
}

/*
 * A buffer counts as "written" when it is mapped and is not flagged
 * unwritten.  dax_io() treats reads of anything else as holes.
 */
static bool buffer_written(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return false;

	return !buffer_unwritten(bh);
}

/*
 * ext4 leaves the buffer_head untouched when get_block lands in a hole, so
 * b_size cannot be trusted in that case.  The caller works around this by
 * clearing b_state before calling get_block: if any state bit is set
 * afterwards, get_block filled in the buffer_head and b_size is meaningful.
 *
 * This is a pity, because ext4 knows exactly how long the hole is and could
 * spare us from calling get_block over and over.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return !!bh->b_state;
}

/*
 * dax_io - copy data between an iov_iter and the storage behind a DAX inode
 * @rw:        WRITE to copy from @iter into the file; any other value reads
 * @inode:     the file being accessed
 * @iter:      source (write) or destination (read) of the data
 * @start:     file offset of the first byte to transfer
 * @end:       file offset one past the last byte to transfer
 * @get_block: filesystem callback that maps a file block into @bh
 * @bh:        caller-provided buffer_head, reused for every mapping
 *
 * The transfer proceeds in chunks.  Three offsets track progress:
 *   pos    - next file offset to transfer
 *   max    - end of the chunk currently reachable (through addr, or as a hole)
 *   bh_max - end of the range described by the most recent get_block() call
 *
 * Returns the number of bytes transferred if at least one byte was moved.
 * Otherwise returns 0 when there was nothing to do, the error reported by
 * get_block() or dax_get_addr(), or -EFAULT when the very first copy made
 * no progress.
 */
static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;
	loff_t bh_max = start;
	void *addr;
	bool hole = false;

	/* Reads are clamped to the current file size; writes are not. */
	if (rw != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		/* The iov_iter helpers return size_t; do not truncate it. */
		size_t len;

		if (pos == max) {
			/* The current chunk is used up: set up the next one. */
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				/* Nothing left of the old mapping: ask again. */
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   rw == WRITE);
				if (retval)
					break;
				/* See buffer_size_valid(): assume one block. */
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				/*
				 * The mapping extends beyond what the previous
				 * chunk reached.  Skip the blocks already
				 * consumed and keep using the remainder.
				 */
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = (rw != WRITE) && !buffer_written(bh);
			if (hole) {
				/* Nothing to map; the read is served as zeroes. */
				addr = NULL;
				size = bh->b_size - first;
			} else {
				/*
				 * Keep the mapped length out of retval so a
				 * later failure cannot report it as a byte
				 * count.
				 */
				long avail = dax_get_addr(bh, &addr, blkbits);

				if (avail < 0) {
					retval = avail;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, avail, first, pos,
						    end);
				addr += first;
				size = avail - first;
			}
			max = min(pos + size, end);
		}

		if (rw == WRITE)
			len = copy_from_iter(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		/*
		 * No progress means the user buffer could not be accessed.
		 * Record the error so it is reported when nothing at all was
		 * transferred; a partial transfer still returns its count.
		 */
		if (!len) {
			retval = -EFAULT;
			break;
		}

		pos += len;
		addr += len;
	}

	return (pos == start) ? retval : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @rw: READ to read or WRITE to write
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion; may be NULL.  It is
 *	only invoked when a positive number of bytes was transferred.
 * @flags: See below
 *
 * The locking scheme is the one used by do_blockdev_direct_IO().  With
 * DIO_LOCKING set in @flags, the caller is assumed to hold i_mutex for
 * writes, while for reads this function takes i_mutex, writes back and
 * waits on the affected range, and drops the mutex once the copy is done.
 * Without DIO_LOCKING the filesystem is responsible for its own locking.
 * As in do_blockdev_direct_IO(), i_dio_count is raised for the duration of
 * the I/O.
 *
 * Returns the result of the transfer, or the error from the initial
 * writeback if that failed.
 */
ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos,
		  get_block_t get_block, dio_iodone_t end_io, int flags)
{
	const bool locked_read = (flags & DIO_LOCKING) && (rw == READ);
	loff_t end = pos + iov_iter_count(iter);
	struct buffer_head bh;
	ssize_t retval;

	memset(&bh, 0, sizeof(bh));

	if (locked_read) {
		mutex_lock(&inode->i_mutex);
		retval = filemap_write_and_wait_range(inode->i_mapping, pos,
						      end - 1);
		if (retval) {
			/* Writeback failed: bail out before touching i_dio_count. */
			mutex_unlock(&inode->i_mutex);
			return retval;
		}
	}

	/* Protects against truncate */
	atomic_inc(&inode->i_dio_count);

	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);

	if (locked_read)
		mutex_unlock(&inode->i_mutex);

	if (end_io && retval > 0)
		end_io(iocb, pos, retval, bh.b_private);

	inode_dio_done(inode);
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
6 changes: 4 additions & 2 deletions fs/ext2/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_EXT2_FS_XIP
const struct file_operations ext2_xip_file_operations = {
.llseek = generic_file_llseek,
.read = xip_file_read,
.write = xip_file_write,
.read = new_sync_read,
.write = new_sync_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
Expand Down
8 changes: 7 additions & 1 deletion fs/ext2/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;

ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
if (IS_DAX(inode))
ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
NULL, DIO_LOCKING);
else
ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
ext2_get_block);
if (ret < 0 && (rw & WRITE))
ext2_write_failed(mapping, offset + count);
return ret;
Expand Down Expand Up @@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = {
/*
 * Address-space operations table for the XIP configuration of ext2
 * (NOTE(review): where this table is installed is not visible here).
 * It supplies only block mapping, XIP memory lookup and direct I/O.
 */
const struct address_space_operations ext2_aops_xip = {
.bmap = ext2_bmap,
.get_xip_mem = ext2_get_xip_mem,
/* ext2_direct_IO() routes DAX inodes to dax_do_io(). */
.direct_IO = ext2_direct_IO,
};

const struct address_space_operations ext2_nobh_aops = {
Expand Down
12 changes: 8 additions & 4 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);

ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
loff_t, get_block_t, dio_iodone_t, int flags);

#ifdef CONFIG_FS_XIP
extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
loff_t *ppos);
extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
size_t len, loff_t *ppos);
extern int xip_truncate_page(struct address_space *mapping, loff_t from);
#else
static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
Expand Down Expand Up @@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
extern void save_mount_options(struct super_block *sb, char *options);
extern void replace_mount_options(struct super_block *sb, char *options);

/*
 * Report whether I/O on @filp should take the direct path: either the file
 * was opened with O_DIRECT or its inode is a DAX inode.
 */
static inline bool io_is_direct(struct file *filp)
{
	if (filp->f_flags & O_DIRECT)
		return true;

	return IS_DAX(file_inode(filp));
}

static inline ino_t parent_ino(struct dentry *dentry)
{
ino_t res;
Expand Down
6 changes: 2 additions & 4 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;

/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (file->f_flags & O_DIRECT) {
if (io_is_direct(file)) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
Expand Down Expand Up @@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;

/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
if (io_is_direct(file)) {
loff_t endbyte;

written = generic_file_direct_write(iocb, from, pos);
Expand Down
Loading

0 comments on commit d475c63

Please sign in to comment.