From: Oliver Xymoron

These patches add the missing infrastructure for reporting asynchronous write
errors on block devices to userspace.  Errors are reported at the next fsync,
fdatasync, or msync on the given file, and on close if the error occurs in
time.

Record async write errors and report them at a subsequent fsync/fdatasync.

DESC
async write errors core: fixes
EDESC

- code simplifications

- No need to lock the page to stabilise ->mapping in mpage.c


 fs/buffer.c                 |   43 +++++++++++++++++++++++++++++++++----------
 fs/ext3/inode.c             |    2 +-
 fs/inode.c                  |    1 +
 fs/mpage.c                  |    8 ++++++++
 fs/ntfs/compress.c          |    2 +-
 fs/open.c                   |   27 ++++++++++++++++++++++-----
 include/linux/buffer_head.h |   10 +++++++---
 include/linux/fs.h          |    1 +
 kernel/ksyms.c              |    3 ++-
 mm/filemap.c                |    8 ++++++++
 mm/vmscan.c                 |   24 +++++++++++++++++++++++-
 11 files changed, 107 insertions(+), 22 deletions(-)

diff -puN fs/buffer.c~awe-core fs/buffer.c
--- 25/fs/buffer.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/buffer.c	2003-08-06 20:20:56.000000000 -0700
@@ -170,15 +170,29 @@ static void buffer_io_error(struct buffe
  * Default synchronous end-of-IO handler.. Just mark it up-to-date and
  * unlock the buffer. This is what ll_rw_block uses too.
  */
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
 {
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		/*
-		 * This happens, due to failed READA attempts.
-		 * buffer_io_error(bh);
-		 */
+		/* This happens, due to failed READA attempts. */
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		buffer_io_error(bh);
+		printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+		       bdevname(bh->b_bdev, b));
+		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
 	unlock_buffer(bh);
@@ -550,6 +564,7 @@ still_busy:
  */
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
+	char b[BDEVNAME_SIZE];
 	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
 	unsigned long flags;
 	struct buffer_head *tmp;
@@ -562,6 +577,9 @@ void end_buffer_async_write(struct buffe
 		set_buffer_uptodate(bh);
 	} else {
 		buffer_io_error(bh);
+		printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+		       bdevname(bh->b_bdev, b));
+		page->mapping->error = -EIO;
 		clear_buffer_uptodate(bh);
 		SetPageError(page);
 	}
@@ -1288,7 +1306,7 @@ static struct buffer_head *__bread_slow(
 	if (buffer_dirty(bh))
 		buffer_error();
 	get_bh(bh);
-	bh->b_end_io = end_buffer_io_sync;
+	bh->b_end_io = end_buffer_read_sync;
 	submit_bh(READ, bh);
 	wait_on_buffer(bh);
 	if (buffer_uptodate(bh))
@@ -2646,8 +2664,10 @@ int submit_bh(int rw, struct buffer_head
 		buffer_error();
 	if (rw == READ && buffer_dirty(bh))
 		buffer_error();
-
-	set_buffer_req(bh);
+
+	/* Only clear out a write error when rewriting */
+	if (test_set_buffer_req(bh) && rw == WRITE)
+		clear_buffer_write_io_error(bh);
 
 	/*
 	 * from here on down, it's all bio -- do the initial mapping,
@@ -2707,13 +2727,14 @@ void ll_rw_block(int rw, int nr, struct 
 			continue;
 
 		get_bh(bh);
-		bh->b_end_io = end_buffer_io_sync;
 		if (rw == WRITE) {
+			bh->b_end_io = end_buffer_write_sync;
 			if (test_clear_buffer_dirty(bh)) {
 				submit_bh(WRITE, bh);
 				continue;
 			}
 		} else {
+			bh->b_end_io = end_buffer_read_sync;
 			if (!buffer_uptodate(bh)) {
 				submit_bh(rw, bh);
 				continue;
@@ -2734,7 +2755,7 @@ void sync_dirty_buffer(struct buffer_hea
 	lock_buffer(bh);
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
-		bh->b_end_io = end_buffer_io_sync;
+		bh->b_end_io = end_buffer_write_sync;
 		submit_bh(WRITE, bh);
 		wait_on_buffer(bh);
 	} else {
@@ -2793,6 +2814,8 @@ drop_buffers(struct page *page, struct b
 	bh = head;
 	do {
 		check_ttfb_buffer(page, bh);
+		if (buffer_write_io_error(bh))
+			page->mapping->error = -EIO;
 		if (buffer_busy(bh))
 			goto failed;
 		if (!buffer_uptodate(bh) && !buffer_req(bh))
diff -puN fs/ext3/inode.c~awe-core fs/ext3/inode.c
--- 25/fs/ext3/inode.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/ext3/inode.c	2003-08-06 20:20:56.000000000 -0700
@@ -2431,7 +2431,7 @@ make_io:
 		 * read the block from disk
 		 */
 		get_bh(bh);
-		bh->b_end_io = end_buffer_io_sync;
+		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
diff -puN fs/inode.c~awe-core fs/inode.c
--- 25/fs/inode.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/inode.c	2003-08-06 20:20:56.000000000 -0700
@@ -145,6 +145,7 @@ static struct inode *alloc_inode(struct 
 		mapping->dirtied_when = 0;
 		mapping->assoc_mapping = NULL;
 		mapping->backing_dev_info = &default_backing_dev_info;
+		mapping->error = 0;
 		if (sb->s_bdev)
 			mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
 		memset(&inode->u, 0, sizeof(inode->u));
diff -puN fs/mpage.c~awe-core fs/mpage.c
--- 25/fs/mpage.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/mpage.c	2003-08-06 20:20:56.000000000 -0700
@@ -388,6 +388,7 @@ static struct bio *
 mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
 	sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc)
 {
+	struct address_space *mapping = page->mapping;
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
 	unsigned long end_index;
@@ -562,6 +563,11 @@ confused:
 	if (bio)
 		bio = mpage_bio_submit(WRITE, bio);
 	*ret = page->mapping->a_ops->writepage(page, wbc);
+	/*
+	 * The caller has a ref on the inode, so *mapping is stable
+	 */
+	if (*ret < 0)
+		mapping->error = *ret;
 out:
 	return bio;
 }
@@ -663,6 +669,8 @@ mpage_writepages(struct address_space *m
 					test_clear_page_dirty(page)) {
 			if (writepage) {
 				ret = (*writepage)(page, wbc);
+				if (ret < 0)
+					mapping->error = ret;
 			} else {
 				bio = mpage_writepage(bio, page, get_block,
 						&last_block_in_bio, &ret, wbc);
diff -puN fs/ntfs/compress.c~awe-core fs/ntfs/compress.c
--- 25/fs/ntfs/compress.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/ntfs/compress.c	2003-08-06 20:20:56.000000000 -0700
@@ -643,7 +643,7 @@ lock_retry_remap:
 			continue;
 		}
 		atomic_inc(&tbh->b_count);
-		tbh->b_end_io = end_buffer_io_sync;
+		tbh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, tbh);
 	}
 
diff -puN fs/open.c~awe-core fs/open.c
--- 25/fs/open.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/fs/open.c	2003-08-06 20:20:56.000000000 -0700
@@ -944,15 +944,32 @@ asmlinkage long sys_creat(const char __u
  */
 int filp_close(struct file *filp, fl_owner_t id)
 {
-	int retval;
+	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+	int retval = 0, err;
+
+	/* Report and clear outstanding errors */
+	err = filp->f_error;
+	if (err) {
+		filp->f_error = 0;
+		retval = err;
+	}
+
+	err = mapping->error;
+	if (!retval)
+		retval = err;
+	mapping->error = 0;
 
 	if (!file_count(filp)) {
 		printk(KERN_ERR "VFS: Close: file count is 0\n");
-		return 0;
+		return retval;
 	}
-	retval = 0;
-	if (filp->f_op && filp->f_op->flush)
-		retval = filp->f_op->flush(filp);
+
+	if (filp->f_op && filp->f_op->flush) {
+		err = filp->f_op->flush(filp);
+		if (!retval)
+			retval = err;
+	}
+
 	dnotify_flush(filp, id);
 	locks_remove_posix(filp, id);
 	fput(filp);
diff -puN include/linux/buffer_head.h~awe-core include/linux/buffer_head.h
--- 25/include/linux/buffer_head.h~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/include/linux/buffer_head.h	2003-08-06 20:20:56.000000000 -0700
@@ -24,8 +24,9 @@ enum bh_state_bits {
 	BH_Async_Read,	/* Is under end_buffer_async_read I/O */
 	BH_Async_Write,	/* Is under end_buffer_async_write I/O */
 	BH_Delay,	/* Buffer is not yet allocated on disk */
-
 	BH_Boundary,	/* Block is followed by a discontiguity */
+	BH_Write_EIO,	/* I/O error on write */
+
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
 			 */
@@ -109,12 +110,14 @@ TAS_BUFFER_FNS(Dirty, dirty)
 BUFFER_FNS(Lock, locked)
 TAS_BUFFER_FNS(Lock, locked)
 BUFFER_FNS(Req, req)
+TAS_BUFFER_FNS(Req, req)
 BUFFER_FNS(Mapped, mapped)
 BUFFER_FNS(New, new)
 BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
-BUFFER_FNS(Delay, delay);
+BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
+BUFFER_FNS(Write_EIO,write_io_error)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
@@ -139,7 +142,8 @@ void set_bh_page(struct buffer_head *bh,
 int try_to_free_buffers(struct page *);
 void create_empty_buffers(struct page *, unsigned long,
 			unsigned long b_state);
-void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
+void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
 void end_buffer_async_write(struct buffer_head *bh, int uptodate);
 
 /* Things to do with buffers at mapping->private_list */
diff -puN include/linux/fs.h~awe-core include/linux/fs.h
--- 25/include/linux/fs.h~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/include/linux/fs.h	2003-08-06 20:20:56.000000000 -0700
@@ -332,6 +332,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	int			error;		/* write error for fsync */
 };
 
 struct block_device {
diff -puN kernel/ksyms.c~awe-core kernel/ksyms.c
--- 25/kernel/ksyms.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/kernel/ksyms.c	2003-08-06 20:20:56.000000000 -0700
@@ -178,7 +178,8 @@ EXPORT_SYMBOL(d_splice_alias);
 EXPORT_SYMBOL(d_lookup);
 EXPORT_SYMBOL(d_path);
 EXPORT_SYMBOL(mark_buffer_dirty);
-EXPORT_SYMBOL(end_buffer_io_sync);
+EXPORT_SYMBOL(end_buffer_read_sync);
+EXPORT_SYMBOL(end_buffer_write_sync);
 EXPORT_SYMBOL(end_buffer_async_write);
 EXPORT_SYMBOL(__mark_inode_dirty);
 EXPORT_SYMBOL(get_empty_filp);
diff -puN mm/filemap.c~awe-core mm/filemap.c
--- 25/mm/filemap.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/mm/filemap.c	2003-08-06 20:20:56.000000000 -0700
@@ -197,6 +197,14 @@ restart:
 		spin_lock(&mapping->page_lock);
 	}
 	spin_unlock(&mapping->page_lock);
+
+	/* Check for outstanding write errors */
+	if (mapping->error) {
+		if (!ret)
+			ret = mapping->error;
+		mapping->error = 0;
+	}
+
 	return ret;
 }
 
diff -puN mm/vmscan.c~awe-core mm/vmscan.c
--- 25/mm/vmscan.c~awe-core	2003-08-06 20:20:56.000000000 -0700
+++ 25-akpm/mm/vmscan.c	2003-08-06 20:20:56.000000000 -0700
@@ -236,6 +236,27 @@ static int may_write_to_queue(struct bac
 }
 
 /*
+ * We detected a synchronous write error writing a page out.  Probably
+ * -ENOSPC.  We need to propagate that into the address_space for a subsequent
+ * fsync(), msync() or close().
+ *
+ * The tricky part is that after writepage we cannot touch the mapping: nothing
+ * prevents it from being freed up.  But we have a ref on the page and once
+ * that page is locked, the mapping is pinned.
+ *
+ * We're allowed to run sleeping lock_page() here because we know the caller has
+ * __GFP_FS.
+ */
+static void handle_write_error(struct address_space *mapping,
+				struct page *page, int error)
+{
+	lock_page(page);
+	if (page->mapping == mapping)
+		mapping->error = error;
+	unlock_page(page);
+}
+
+/*
 * shrink_list returns the number of reclaimed pages
 */
 static int
@@ -362,7 +383,8 @@ shrink_list(struct list_head *page_list,
 			SetPageReclaim(page);
 
 			res = mapping->a_ops->writepage(page, &wbc);
-
+			if (res < 0)
+				handle_write_error(mapping, page, res);
 			if (res == WRITEPAGE_ACTIVATE) {
 				ClearPageReclaim(page);
 				goto activate_locked;
_
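
For reference, here is a minimal userspace sketch of how the reporting added
above is intended to surface.  It is illustrative only and not part of the
patch: once an asynchronous writeback failure has been recorded in
mapping->error, the next fsync()/fdatasync() on the file should return -1 with
errno set (typically EIO, or ENOSPC from the vmscan path), and close() should
report it if nothing synced first.  The path and failure scenario are made up;
only standard POSIX calls are used.

/* Illustrative userspace example -- not part of the patch. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical file on a device that later fails its writeback */
	int fd = open("/mnt/flaky/data", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		exit(1);
	}

	/* usually succeeds: the data is only dirtied in the page cache */
	if (write(fd, "hello\n", 6) != 6)
		perror("write");

	/* an async write error recorded in the mapping is reported here... */
	if (fsync(fd) < 0)
		perror("fsync");	/* e.g. EIO */

	/* ...or here, if fsync()/fdatasync() was never called */
	if (close(fd) < 0)
		perror("close");

	return 0;
}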