Skip to content

Commit

Permalink
btrfs: disable strict file flushes for renames and truncates
Browse files Browse the repository at this point in the history
Truncates and renames are often used to replace old versions of a file
with new versions.  Applications often expect this to be an atomic
replacement, even if they haven't done anything to make sure the new
version is fully on disk.

Btrfs has strict flushing in place to make sure that renaming over an
old file with a new file will fully flush out the new file before
allowing the transaction commit with the rename to complete.

This ordering means the commit code needs to be able to lock file pages,
and there are a few paths in the filesystem where we will try to end a
transaction with the page lock held.  It's rare, but these things can
deadlock.

This patch removes the ordered flushes and switches to a best effort
filemap_flush like ext4 uses. It's not perfect, but it should fix the
deadlocks.

Signed-off-by: Chris Mason <[email protected]>
  • Loading branch information
masoncl committed Aug 15, 2014
1 parent 27b9a81 commit 8d875f9
Show file tree
Hide file tree
Showing 8 changed files with 6 additions and 267 deletions.
6 changes: 0 additions & 6 deletions fs/btrfs/btrfs_inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,6 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;

/*
* list for tracking inodes that must be sent to disk before a
* rename or truncate commit
*/
struct list_head ordered_operations;

/* node for the red-black tree that links inodes in subvolume root */
struct rb_node rb_node;

Expand Down
32 changes: 0 additions & 32 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
int read_only);
static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
Expand Down Expand Up @@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
btrfs_cleanup_transaction(root);
}

static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
struct list_head splice;

INIT_LIST_HEAD(&splice);

mutex_lock(&root->fs_info->ordered_operations_mutex);
spin_lock(&root->fs_info->ordered_root_lock);

list_splice_init(&t->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);

list_del_init(&btrfs_inode->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);

btrfs_invalidate_inodes(btrfs_inode->root);

spin_lock(&root->fs_info->ordered_root_lock);
}

spin_unlock(&root->fs_info->ordered_root_lock);
mutex_unlock(&root->fs_info->ordered_operations_mutex);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
struct btrfs_ordered_extent *ordered;
Expand Down Expand Up @@ -4093,8 +4063,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
btrfs_destroy_ordered_operations(cur_trans, root);

btrfs_destroy_delayed_refs(cur_trans, root);

cur_trans->state = TRANS_STATE_COMMIT_START;
Expand Down
26 changes: 1 addition & 25 deletions fs/btrfs/file.c
Original file line number Diff line number Diff line change
Expand Up @@ -1838,33 +1838,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,

int btrfs_release_file(struct inode *inode, struct file *filp)
{
/*
* ordered_data_close is set by settattr when we are about to truncate
* a file from a non-zero size to a zero size. This tries to
* flush down new bytes that may have been written if the
* application were using truncate to replace a file in place.
*/
if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags)) {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;

/*
* We need to block on a committing transaction to keep us from
* throwing a ordered operation on to the list and causing
* something like sync to deadlock trying to flush out this
* inode.
*/
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
btrfs_end_transaction(trans, root);
if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(inode->i_mapping);
}
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
filemap_flush(inode->i_mapping);
return 0;
}

Expand Down
47 changes: 3 additions & 44 deletions fs/btrfs/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode)
min_size);
BUG_ON(ret);

/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
* could happen well after the next commit, leaving a great big
* window where new writes may get lost if someone chooses to write
* to this file after truncating to zero
*
* The inode doesn't have any dirty data here, and so if we commit
* this is a noop. If someone immediately starts writing to the inode
* it is very likely we'll catch some of their writes in this
* transaction, and the commit will find this file on the ordered
* data list with good things to send down.
*
* This is a best effort solution, there is still a window where
* using truncate to replace the contents of the file will
* end up with a zero length file after a crash.
*/
if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags))
btrfs_add_ordered_operation(trans, root, inode);

/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
Expand Down Expand Up @@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
mutex_init(&ei->delalloc_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
RB_CLEAR_NODE(&ei->rb_node);

return inode;
Expand Down Expand Up @@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode)
if (!root)
goto free;

/*
* Make sure we're properly removed from the ordered operation
* lists.
*/
smp_mb();
if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
}

if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags)) {
btrfs_info(root->fs_info, "inode %llu still on the orphan list",
Expand Down Expand Up @@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = 0;

/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
* we don't add too much work to the end of the transaction
* we're using rename to replace one file with another. Start IO on it
* now so we don't add too much work to the end of the transaction
*/
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
filemap_flush(old_inode->i_mapping);

/* close the racy window with snapshot create/destroy ioctl */
Expand Down Expand Up @@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
*/
btrfs_pin_log_trans(root);
}
/*
* make sure the inode gets flushed if it is replacing
* something.
*/
if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);

inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
Expand Down
123 changes: 0 additions & 123 deletions fs/btrfs/ordered-data.c
Original file line number Diff line number Diff line change
Expand Up @@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,

trace_btrfs_ordered_extent_remove(inode, entry);

/*
* we have no more ordered extents for this inode and
* no dirty pages. We can safely remove it from the
* list of ordered extents
*/
if (RB_EMPTY_ROOT(&tree->tree) &&
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
}

if (!root->nr_ordered_extents) {
spin_lock(&root->fs_info->ordered_root_lock);
BUG_ON(list_empty(&root->ordered_root));
Expand Down Expand Up @@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
mutex_unlock(&fs_info->ordered_operations_mutex);
}

/*
* this is used during transaction commit to write all the inodes
* added to the ordered operation list. These files must be fully on
* disk before the transaction commits.
*
* we have two modes here, one is to just start the IO via filemap_flush
* and the other is to wait for all the io. When we wait, we have an
* extra check to make sure the ordered operation list really is empty
* before we return
*/
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int wait)
{
struct btrfs_inode *btrfs_inode;
struct inode *inode;
struct btrfs_transaction *cur_trans = trans->transaction;
struct list_head splice;
struct list_head works;
struct btrfs_delalloc_work *work, *next;
int ret = 0;

INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);

mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
spin_lock(&root->fs_info->ordered_root_lock);
list_splice_init(&cur_trans->ordered_operations, &splice);
while (!list_empty(&splice)) {
btrfs_inode = list_entry(splice.next, struct btrfs_inode,
ordered_operations);
inode = &btrfs_inode->vfs_inode;

list_del_init(&btrfs_inode->ordered_operations);

/*
* the inode may be getting freed (in sys_unlink path).
*/
inode = igrab(inode);
if (!inode)
continue;

if (!wait)
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);

work = btrfs_alloc_delalloc_work(inode, wait, 1);
if (!work) {
spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations))
list_add_tail(&btrfs_inode->ordered_operations,
&splice);
list_splice_tail(&splice,
&cur_trans->ordered_operations);
spin_unlock(&root->fs_info->ordered_root_lock);
ret = -ENOMEM;
goto out;
}
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);

cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
}
spin_unlock(&root->fs_info->ordered_root_lock);
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
return ret;
}

/*
* Used to start IO or wait for a given ordered extent to finish.
*
Expand Down Expand Up @@ -1120,42 +1033,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
return index;
}


/*
* add a given inode to the list of inodes that must be fully on
* disk before a transaction commit finishes.
*
* This basically gives us the ext3 style data=ordered mode, and it is mostly
* used to make sure renamed files are fully on disk.
*
* It is a noop if the inode is already fully on disk.
*
* If trans is not null, we'll do a friendly check for a transaction that
* is already flushing things and force the IO down ourselves.
*/
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode)
{
struct btrfs_transaction *cur_trans = trans->transaction;
u64 last_mod;

last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);

/*
* if this file hasn't been changed since the last transaction
* commit, we can safely return without doing anything
*/
if (last_mod <= root->fs_info->last_trans_committed)
return;

spin_lock(&root->fs_info->ordered_root_lock);
if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
list_add_tail(&BTRFS_I(inode)->ordered_operations,
&cur_trans->ordered_operations);
}
spin_unlock(&root->fs_info->ordered_root_lock);
}

int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
Expand Down
5 changes: 0 additions & 5 deletions fs/btrfs/ordered-data.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
u32 *sum, int len);
int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int wait);
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
Expand Down
Loading

0 comments on commit 8d875f9

Please sign in to comment.