Skip to content

Commit

Permalink
ext3: avoid false EIO errors
Browse files Browse the repository at this point in the history
Sometimes block_write_begin() can map buffers in a page but later we
fail to copy data into those buffers (because the source page has been
paged out in the meantime).  We then end up with !uptodate mapped
buffers.  To add a bit more to the confusion, block_write_end() does
not commit any data (and thus does not mark any buffers as uptodate) if
we didn't succeed with copying all the data.

Commit f4fc66a (ext3: convert to new
aops) missed these cases and thus we were inserting non-uptodate
buffers into the transaction's list, which confuses the JBD code: it
reports IO errors, aborts the transaction and generally makes users
afraid for their data ;-P.

This patch fixes the problem by reorganizing the ext3_..._write_end()
code so that it first calls block_write_end() to mark buffers with valid
data uptodate; after that we file only uptodate buffers to the
transaction's lists.

We also fix a problem where we could leave blocks allocated beyond i_size
(i_disksize in fact) because of a failed write. We now add the inode to the
orphan list when a write fails (to be safe in case we crash) and then
truncate blocks beyond i_size in a separate transaction.

Signed-off-by: Jan Kara <[email protected]>
Reviewed-by: Aneesh Kumar K.V <[email protected]>
Cc: Nick Piggin <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
jankara authored and torvalds committed Apr 3, 2009
1 parent de18f3b commit 695f6ae
Showing 1 changed file with 74 additions and 65 deletions.
139 changes: 74 additions & 65 deletions fs/ext3/inode.c
Original file line number Diff line number Diff line change
Expand Up @@ -1149,12 +1149,15 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
int ret;
handle_t *handle;
int retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
/* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason */
int needed_blocks = ext3_writepage_trans_blocks(inode) + 1;

index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
Expand Down Expand Up @@ -1184,14 +1187,19 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
}
write_begin_failed:
if (ret) {
ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
/*
* block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold i_mutex.
*
* Add inode to orphan list in case we crash before truncate
* finishes.
*/
if (pos + len > inode->i_size)
ext3_orphan_add(handle, inode);
ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
}
Expand All @@ -1211,6 +1219,18 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
return err;
}

/* For ordered writepage and write_end functions */
static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
/*
* Write could have mapped the buffer but it didn't copy the data in
* yet. So avoid filing such buffer into a transaction.
*/
if (buffer_mapped(bh) && buffer_uptodate(bh))
return ext3_journal_dirty_data(handle, bh);
return 0;
}

/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
Expand All @@ -1221,26 +1241,20 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}

/*
* Generic write_end handler for ordered and writeback ext3 journal modes.
* We can't use generic_write_end, because that unlocks the page and we need to
* unlock the page after ext3_journal_stop, but ext3_journal_stop must run
* after block_write_end.
* This is nasty and subtle: ext3_write_begin() could have allocated blocks
* for the whole page but later we failed to copy the data in. Update inode
* size according to what we managed to copy. The rest is going to be
* truncated in write_end function.
*/
static int ext3_generic_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
static void update_file_sizes(struct inode *inode, loff_t pos, unsigned copied)
{
struct inode *inode = file->f_mapping->host;

copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
/* What matters to us is i_disksize. We don't write i_size anywhere */
if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
if (pos + copied > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = pos + copied;
mark_inode_dirty(inode);
}

return copied;
}

/*
Expand All @@ -1260,35 +1274,29 @@ static int ext3_ordered_write_end(struct file *file,
unsigned from, to;
int ret = 0, ret2;

from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

from = pos & (PAGE_CACHE_SIZE - 1);
to = from + copied;
ret = walk_page_buffers(handle, page_buffers(page),
from, to, NULL, ext3_journal_dirty_data);
from, to, NULL, journal_dirty_data_fn);

if (ret == 0) {
/*
* generic_write_end() will run mark_inode_dirty() if i_size
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
loff_t new_i_size;

new_i_size = pos + copied;
if (new_i_size > EXT3_I(inode)->i_disksize)
EXT3_I(inode)->i_disksize = new_i_size;
ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
ret = ret2;
}
if (ret == 0)
update_file_sizes(inode, pos, copied);
/*
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
if (pos + len > inode->i_size)
ext3_orphan_add(handle, inode);
ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
unlock_page(page);
page_cache_release(page);

if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}

Expand All @@ -1299,25 +1307,22 @@ static int ext3_writeback_write_end(struct file *file,
{
handle_t *handle = ext3_journal_current_handle();
struct inode *inode = file->f_mapping->host;
int ret = 0, ret2;
loff_t new_i_size;

new_i_size = pos + copied;
if (new_i_size > EXT3_I(inode)->i_disksize)
EXT3_I(inode)->i_disksize = new_i_size;

ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
ret = ret2;
int ret;

ret2 = ext3_journal_stop(handle);
if (!ret)
ret = ret2;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
update_file_sizes(inode, pos, copied);
/*
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
if (pos + len > inode->i_size)
ext3_orphan_add(handle, inode);
ret = ext3_journal_stop(handle);
unlock_page(page);
page_cache_release(page);

if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}

Expand All @@ -1338,15 +1343,23 @@ static int ext3_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
page_zero_new_buffers(page, from+copied, to);
page_zero_new_buffers(page, from + copied, to);
to = from + copied;
}

ret = walk_page_buffers(handle, page_buffers(page), from,
to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
if (pos+copied > inode->i_size)
i_size_write(inode, pos+copied);

if (pos + copied > inode->i_size)
i_size_write(inode, pos + copied);
/*
* There may be allocated blocks outside of i_size because
* we failed to copy some data. Prepare for truncate.
*/
if (pos + len > inode->i_size)
ext3_orphan_add(handle, inode);
EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
if (inode->i_size > EXT3_I(inode)->i_disksize) {
EXT3_I(inode)->i_disksize = inode->i_size;
Expand All @@ -1361,6 +1374,8 @@ static int ext3_journalled_write_end(struct file *file,
unlock_page(page);
page_cache_release(page);

if (pos + len > inode->i_size)
vmtruncate(inode, inode->i_size);
return ret ? ret : copied;
}

Expand Down Expand Up @@ -1428,17 +1443,11 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}

static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
if (buffer_mapped(bh))
return ext3_journal_dirty_data(handle, bh);
return 0;
}

static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
{
return !buffer_mapped(bh);
}

/*
* Note that we always start a transaction even if we're not journalling
* data. This is to preserve ordering: any hole instantiation within
Expand Down

0 comments on commit 695f6ae

Please sign in to comment.