author	OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>	2014-05-08 17:46:56 +0900
committer	Daniel Phillips <daniel@tux3.org>	2014-05-08 17:46:56 +0900
commit	4365b0d9857a5808ce70191f703984ee58544358 (patch)
tree	4b0a9e9988b3c80b8ccb095c794205329139a319
parent	54b3c445821b2546763094bf2f629e7902665024 (diff)
download	linux-tux3-4365b0d9857a5808ce70191f703984ee58544358.tar.gz
tux3: Support mmap write: Add initial mmap write support
To support mmap write, we have to take care of stable pages (pages on the flushing delta). That is, once pages have been chosen for flushing to backing storage, modifications or truncations of those pages in the frontend must not leak into the flushing delta (this provides reliability similar to data=journal).

To guarantee this, we page-fork a page that belongs to the previous delta before modifying it in the frontend. Before issuing I/O, the PTEs of the page are made read-only, so we notice an mmap write via page fault (the ->page_mkwrite() hook). In ->page_mkwrite(), if the page is stable (dirtied for the previous delta, or under I/O), we page-fork it to a new page, switch the PTE to point at the new page, and make the PTE writable. With this, the stable page is never modified, and the frontend can keep modifying the new page without waiting for I/O.

[FIXME: ->page_mkwrite() can't support page fork directly (->page_mkwrite() assumes the page in the PTE is not replaced). So this patch switches the PTE to the new page and makes it writable in ->page_mkwrite(), then retries the page fault; the second fault is resolved by the already-switched PTE pointing at the new page.]

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
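As a rough illustration of the fault flow described above, a minimal sketch (the helpers page_is_stable() and page_fork_and_switch_ptes() are hypothetical names for this sketch only; the real entry point is tux3_page_mkwrite() in the diff below):

	/* Sketch only: the fork-and-retry protocol, not the actual tux3 code. */
	static int sketch_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (!page_is_stable(page)) {
			/* Not on the flushing delta: safe to dirty in place. */
			set_page_dirty(page);
			return VM_FAULT_LOCKED;	/* return with the page still locked */
		}
		/*
		 * Stable page: copy it, replace it in the page cache, and
		 * repoint the (still read-only) PTEs at the copy.
		 */
		page_fork_and_switch_ptes(vma, page);
		unlock_page(page);
		return 0;	/* fault is retried; the switched PTE resolves it */
	}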
-rw-r--r--  fs/tux3/Kconfig                7
-rw-r--r--  fs/tux3/Makefile               2
-rw-r--r--  fs/tux3/buffer.c              21
-rw-r--r--  fs/tux3/buffer_fork.c         10
-rw-r--r--  fs/tux3/commit.c               2
-rw-r--r--  fs/tux3/filemap.c              6
-rw-r--r--  fs/tux3/filemap_mmap.c       247
-rw-r--r--  fs/tux3/inode.c                2
-rw-r--r--  fs/tux3/mmap_builtin_hack.c   95
-rw-r--r--  fs/tux3/mmap_builtin_hack.h   13
-rw-r--r--  fs/tux3/tux3.h                 3
11 files changed, 385 insertions(+), 23 deletions(-)
diff --git a/fs/tux3/Kconfig b/fs/tux3/Kconfig
index a952804d70eeb7..212e4e60ee4964 100644
--- a/fs/tux3/Kconfig
+++ b/fs/tux3/Kconfig
@@ -5,3 +5,10 @@ config TUX3
be called tux3.
If unsure, see you again in six months.
+
+config TUX3_MMAP
+ bool "Tux3 mmap support"
+ depends on TUX3
+ help
+ Adds EXPORT_SYMBOL_GPL(), etc., to provide the functionality needed for mmap.
+ This is required to build tux3 as a module with mmap support.
diff --git a/fs/tux3/Makefile b/fs/tux3/Makefile
index 36233d8c309bf7..9623a540dcb2d4 100644
--- a/fs/tux3/Makefile
+++ b/fs/tux3/Makefile
@@ -14,4 +14,6 @@ EXTRA_CFLAGS += -Werror -std=gnu99 -Wno-declaration-after-statement
#EXTRA_CFLAGS += -DTUX3_FLUSHER=TUX3_FLUSHER_SYNC
#EXTRA_CFLAGS += -DTUX3_FLUSHER=TUX3_FLUSHER_ASYNC_OWN
EXTRA_CFLAGS += -DTUX3_FLUSHER=TUX3_FLUSHER_ASYNC_HACK
+
+obj-$(CONFIG_TUX3_MMAP) += mmap_builtin_hack.o
endif
diff --git a/fs/tux3/buffer.c b/fs/tux3/buffer.c
index fe7681466878b9..3d057c33417a9b 100644
--- a/fs/tux3/buffer.c
+++ b/fs/tux3/buffer.c
@@ -87,25 +87,6 @@ int buffer_can_modify(struct buffer_head *buffer, unsigned delta)
tux3_bufsta_get_delta(state) == tux3_delta(delta);
}
-/*
- * Copy of __set_page_dirty() without __mark_inode_dirty(). Caller
- * decides whether mark inode dirty or not.
- */
-static void __tux3_set_page_dirty(struct page *page,
- struct address_space *mapping, int warn)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&mapping->tree_lock, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
- spin_unlock_irqrestore(&mapping->tree_lock, flags);
-}
-
/* Set our delta dirty bits, then add to our dirty buffers list */
static inline void __tux3_set_buffer_dirty_list(struct address_space *mapping,
struct buffer_head *buffer, int delta,
@@ -147,7 +128,7 @@ int tux3_set_buffer_dirty_list(struct address_space *mapping,
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page->mapping;
if (mapping)
- __tux3_set_page_dirty(page, mapping, 0);
+ __tux3_set_page_dirty_account(page, mapping, 0);
return 1;
}
}
diff --git a/fs/tux3/buffer_fork.c b/fs/tux3/buffer_fork.c
index 64d1d0f6dc92fb..fd3508e0f1b1d4 100644
--- a/fs/tux3/buffer_fork.c
+++ b/fs/tux3/buffer_fork.c
@@ -195,6 +195,8 @@ void free_forked_buffers(struct sb *sb, struct inode *inode, int force)
* Block fork core
*/
+#include "mmap_builtin_hack.h"
+
/*
* This replaces the oldpage on radix-tree with newpage atomically.
*
@@ -460,7 +462,8 @@ struct buffer_head *blockdirty(struct buffer_head *buffer, unsigned newdelta)
lock_page(oldpage);
/* This happens on partially dirty page. */
-// assert(PageUptodate(page));
+// assert(PageUptodate(oldpage));
+ assert(!page_mapped(oldpage));
switch ((ret_needfork = need_fork(oldpage, buffer, newdelta))) {
case RET_FORKED:
@@ -635,6 +638,10 @@ struct page *pagefork_for_blockdirty(struct page *oldpage, unsigned newdelta)
* newpage is available on radix-tree here.
*/
SetPageForked(oldpage);
+ /*
+ * Update PTEs for forked page.
+ */
+ page_cow_file(oldpage, newpage);
unlock_page(oldpage);
/* Register forked buffer to free forked page later */
@@ -660,6 +667,7 @@ int bufferfork_to_invalidate(struct address_space *mapping, struct page *page)
unsigned delta = tux3_inode_delta(mapping->host);
assert(PageLocked(page));
+ assert(!page_mapped(page));
switch (need_fork(page, NULL, delta)) {
case RET_NEED_FORK:
diff --git a/fs/tux3/commit.c b/fs/tux3/commit.c
index a01fd9ef0f51f7..fc7f85e6f912ce 100644
--- a/fs/tux3/commit.c
+++ b/fs/tux3/commit.c
@@ -695,7 +695,7 @@ void change_end_atomic(struct sb *sb)
* This is used for nested change_begin/end. We should not use this
* usually (nesting change_begin/end is wrong for normal operations).
*
- * For now, this is only used for ->evict_inode() debugging.
+ * For now, this is only used for ->evict_inode() debugging and the page fault path.
*/
void change_begin_atomic_nested(struct sb *sb, void **ptr)
{
diff --git a/fs/tux3/filemap.c b/fs/tux3/filemap.c
index a53c71595106a8..612ac41e2490bc 100644
--- a/fs/tux3/filemap.c
+++ b/fs/tux3/filemap.c
@@ -960,6 +960,8 @@ static sector_t tux3_bmap(struct address_space *mapping, sector_t iblock)
return blocknr;
}
+#include "filemap_mmap.c"
+
const struct address_space_operations tux_file_aops = {
.readpage = tux3_readpage,
.readpages = tux3_readpages,
@@ -970,6 +972,7 @@ const struct address_space_operations tux_file_aops = {
.write_begin = tux3_file_write_begin,
.write_end = tux3_file_write_end,
.bmap = tux3_bmap,
+ .set_page_dirty = tux3_set_page_dirty_assert,
.invalidatepage = tux3_invalidatepage,
// .releasepage = ext4_releasepage,
#ifdef TUX3_DIRECT_IO
@@ -1000,6 +1003,7 @@ const struct address_space_operations tux_symlink_aops = {
.write_begin = tux3_symlink_write_begin,
.write_end = __tux3_file_write_end,
.bmap = tux3_bmap,
+ .set_page_dirty = tux3_set_page_dirty_bug,
.invalidatepage = tux3_invalidatepage,
// .releasepage = ext4_releasepage,
#ifdef TUX3_DIRECT_IO
@@ -1042,6 +1046,7 @@ const struct address_space_operations tux_blk_aops = {
.writepages = tux3_disable_writepages,
.write_begin = tux3_blk_write_begin,
.bmap = tux3_bmap,
+ .set_page_dirty = tux3_set_page_dirty_bug,
.invalidatepage = tux3_invalidatepage,
// .migratepage = buffer_migrate_page, /* FIXME */
// .is_partially_uptodate = block_is_partially_uptodate,
@@ -1087,6 +1092,7 @@ const struct address_space_operations tux_vol_aops = {
.writepage = tux3_disable_writepage,
.writepages = tux3_disable_writepages,
.write_begin = tux3_vol_write_begin,
+ .set_page_dirty = tux3_set_page_dirty_bug,
.invalidatepage = tux3_invalidatepage,
// .is_partially_uptodate = block_is_partially_uptodate,
// .is_dirty_writeback = buffer_check_dirty_writeback,
diff --git a/fs/tux3/filemap_mmap.c b/fs/tux3/filemap_mmap.c
new file mode 100644
index 00000000000000..93b9f9cdf40c72
--- /dev/null
+++ b/fs/tux3/filemap_mmap.c
@@ -0,0 +1,247 @@
+/*
+ * mmap(2) handlers to support page fork.
+ */
+
+/*
+ * Copy of __set_page_dirty() without __mark_inode_dirty(). Caller
+ * decides whether to mark the inode dirty or not.
+ */
+void __tux3_set_page_dirty_account(struct page *page,
+ struct address_space *mapping, int warn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(warn && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ }
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+}
+
+static void __tux3_set_page_dirty(struct page *page,
+ struct address_space *mapping, int warn)
+{
+ __tux3_set_page_dirty_account(page, mapping, warn);
+ __tux3_mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+}
+
+static int tux3_set_page_dirty_buffers(struct page *page)
+{
+#if 0
+ struct address_space *mapping = page->mapping;
+ int newly_dirty;
+
+ spin_lock(&mapping->private_lock);
+ if (page_has_buffers(page)) {
+ struct buffer_head *head = page_buffers(page);
+ struct buffer_head *bh = head;
+
+ do {
+ set_buffer_dirty(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+ newly_dirty = !TestSetPageDirty(page);
+ spin_unlock(&mapping->private_lock);
+
+ if (newly_dirty)
+ __set_page_dirty(page, mapping, 1);
+
+ return newly_dirty;
+#else
+ struct address_space *mapping = page->mapping;
+ unsigned delta = tux3_get_current_delta();
+ struct buffer_head *head, *buffer;
+ int newly_dirty;
+
+ /* This should be a tux3 page, and it should be locked */
+ assert(mapping);
+ assert(PageLocked(page));
+ /* This page should have buffers (caller should allocate) */
+ assert(page_has_buffers(page));
+
+ /*
+ * FIXME: we dirty all buffers on this page, so optimize this
+ * by avoiding checking page-dirty/inode-dirty multiple times.
+ */
+ newly_dirty = 0;
+ if (!TestSetPageDirty(page)) {
+ __tux3_set_page_dirty(page, mapping, 1);
+ newly_dirty = 1;
+ }
+ buffer = head = page_buffers(page);
+ do {
+ __tux3_mark_buffer_dirty(buffer, delta);
+ buffer = buffer->b_this_page;
+ } while (buffer != head);
+#endif
+ return newly_dirty;
+}
+
+/* Copy of set_page_dirty() */
+static int tux3_set_page_dirty(struct page *page)
+{
+ /*
+ * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim
+ * set due to a race with end_page_writeback().
+ * For readahead, if the page is written to, the flags will be
+ * reset, so there is no problem.
+ * For lru_deactivate_page, if the page is redirtied, the flag will
+ * be reset, so no problem there either; but if the page is used by
+ * readahead it will confuse readahead and make it restart the size
+ * rampup process. That is only a trivial problem, though.
+ */
+ ClearPageReclaim(page);
+
+ return tux3_set_page_dirty_buffers(page);
+}
+
+static int tux3_set_page_dirty_assert(struct page *page)
+{
+ struct buffer_head *head, *buffer;
+
+ /* See comment of tux3_set_page_dirty() */
+ ClearPageReclaim(page);
+
+ /* Are there any cases where this is called for the old page of a forked page? */
+ WARN_ON(PageForked(page));
+
+ /* This page should be dirty already, otherwise we would lose data. */
+ assert(PageDirty(page));
+ /* All buffers should be dirty already, otherwise we would lose data. */
+ assert(page_has_buffers(page));
+ head = buffer = page_buffers(page);
+ do {
+ assert(buffer_dirty(buffer));
+ buffer = buffer->b_this_page;
+ } while (buffer != head);
+
+ return 0;
+}
+
+static int tux3_set_page_dirty_bug(struct page *page)
+{
+ /* See comment of tux3_set_page_dirty() */
+ ClearPageReclaim(page);
+
+ assert(0);
+ /* This page should not be mmapped */
+ assert(!page_mapped(page));
+ /* This page should be dirty already, otherwise we would lose data. */
+ assert(PageDirty(page));
+ return 0;
+}
+
+static int tux3_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct sb *sb = tux_sb(inode->i_sb);
+ struct page *clone, *page = vmf->page;
+ void *ptr;
+ int ret;
+
+ sb_start_pagefault(inode->i_sb);
+
+retry:
+ lock_page(page);
+ if (page->mapping != mapping(inode)) {
+ unlock_page(page);
+ ret = VM_FAULT_NOPAGE;
+ goto out;
+ }
+
+ /*
+ * A page fault can happen while we are holding change_begin/end()
+ * (e.g. while copying user data between ->write_begin and
+ * ->write_end for write(2)).
+ *
+ * So, we use the nested version here.
+ */
+ change_begin_atomic_nested(sb, &ptr);
+
+ /*
+ * FIXME: The caller releases vmf->page (old_page) unconditionally.
+ * So, take an additional refcount here to work around that.
+ */
+ if (vmf->page == page)
+ page_cache_get(page);
+
+ clone = pagefork_for_blockdirty(page, tux3_get_current_delta());
+ if (IS_ERR(clone)) {
+ /* Someone else already forked this page */
+ pgoff_t index = page->index;
+
+ change_end_atomic_nested(sb, ptr);
+ unlock_page(page);
+ page_cache_release(page);
+
+ switch (PTR_ERR(clone)) {
+ case -EAGAIN:
+ page = find_get_page(inode->i_mapping, index);
+ assert(page);
+ goto retry;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ break;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ break;
+ }
+
+ goto out;
+ }
+
+ file_update_time(vma->vm_file);
+
+ /* Assign buffers to dirty */
+ if (!page_has_buffers(clone))
+ create_empty_buffers(clone, sb->blocksize, 0);
+
+ /*
+ * We mark the page dirty already here so that when freeze is in
+ * progress, we are guaranteed that writeback during freezing will
+ * see the dirty page and writeprotect it again.
+ */
+ tux3_set_page_dirty(clone);
+#if 1
+ /* FIXME: Caller doesn't see the changed vmf->page */
+ vmf->page = clone;
+
+ change_end_atomic_nested(sb, ptr);
+ /* FIXME: caller doesn't know about pagefork */
+ unlock_page(clone);
+ page_cache_release(clone);
+ ret = 0;
+// ret = VM_FAULT_LOCKED;
+#endif
+out:
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+static const struct vm_operations_struct tux3_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = tux3_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+int tux3_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+#ifdef CONFIG_TUX3_MMAP
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+
+ file_accessed(file);
+ vma->vm_ops = &tux3_file_vm_ops;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
diff --git a/fs/tux3/inode.c b/fs/tux3/inode.c
index 644deabc2cd22a..f759f8781d7a63 100644
--- a/fs/tux3/inode.c
+++ b/fs/tux3/inode.c
@@ -858,7 +858,7 @@ static const struct file_operations tux_file_fops = {
#ifdef CONFIG_COMPAT
// .compat_ioctl = fat_compat_dir_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = tux3_file_mmap,
.open = generic_file_open,
.fsync = tux3_sync_file,
.splice_read = generic_file_splice_read,
diff --git a/fs/tux3/mmap_builtin_hack.c b/fs/tux3/mmap_builtin_hack.c
new file mode 100644
index 00000000000000..dade54f8fd41dd
--- /dev/null
+++ b/fs/tux3/mmap_builtin_hack.c
@@ -0,0 +1,95 @@
+/*
+ * mmap support helpers. The core kernel doesn't provide the
+ * functionality that page fork needs.
+ *
+ * So, this hack adds EXPORT_SYMBOL_GPL() and inline functions, and
+ * is linked with the kernel statically.
+ *
+ * FIXME: we should patch the kernel instead.
+ */
+
+#include "tux3.h"
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+
+extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
+
+static int page_cow_one(struct page *oldpage, struct page *newpage,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t oldptval, ptval, *pte;
+ spinlock_t *ptl;
+ int ret = 0;
+
+ pte = page_check_address(oldpage, mm, address, &ptl, 1);
+ if (!pte)
+ goto out;
+
+ flush_cache_page(vma, address, pte_pfn(*pte));
+ oldptval = ptep_clear_flush(vma, address, pte);
+
+ /* Take refcount for PTE */
+ page_cache_get(newpage);
+
+ /*
+ * vm_page_prot doesn't have the writable bit, so a page fault
+ * will occur again immediately after returning from this page
+ * fault. The second page fault will then be resolved via the
+ * forked page that was set here.
+ *
+ * FIXME: we should resolve the fault with a single page
+ * fault. Maybe we will have to modify the callers of
+ * ->page_mkwrite().
+ */
+ ptval = mk_pte(newpage, vma->vm_page_prot);
+#if 0
+ if (pte_dirty(oldptval))
+ ptval = pte_mkdirty(ptval);
+ if (pte_young(oldptval))
+ ptval = pte_mkyoung(ptval);
+#endif
+ set_pte_at(mm, address, pte, ptval);
+
+ /* Update rmap accounting */
+ assert(!PageMlocked(oldpage)); /* Caller should migrate mlock flag */
+ page_remove_rmap(oldpage);
+ page_add_file_rmap(newpage);
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, address, pte);
+
+ pte_unmap_unlock(pte, ptl);
+
+ mmu_notifier_invalidate_page(mm, address);
+
+ /* Release refcount for PTE */
+ page_cache_release(oldpage);
+out:
+ return ret;
+}
+
+int page_cow_file(struct page *oldpage, struct page *newpage)
+{
+ struct address_space *mapping = page_mapping(oldpage);
+ pgoff_t pgoff = oldpage->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ struct vm_area_struct *vma;
+ int ret = 0;
+
+ BUG_ON(!PageLocked(oldpage));
+ BUG_ON(!PageLocked(newpage));
+ BUG_ON(PageAnon(oldpage));
+ BUG_ON(mapping == NULL);
+
+ mutex_lock(&mapping->i_mmap_mutex);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_flags & VM_SHARED) {
+ unsigned long address = vma_address(oldpage, vma);
+ ret += page_cow_one(oldpage, newpage, vma, address);
+ }
+ }
+ mutex_unlock(&mapping->i_mmap_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(page_cow_file);
diff --git a/fs/tux3/mmap_builtin_hack.h b/fs/tux3/mmap_builtin_hack.h
new file mode 100644
index 00000000000000..b313506ae61b0a
--- /dev/null
+++ b/fs/tux3/mmap_builtin_hack.h
@@ -0,0 +1,13 @@
+#ifndef _MMAP_HACK_H
+#define _MMAP_HACK_H
+
+#ifdef CONFIG_TUX3_MMAP
+int page_cow_file(struct page *oldpage, struct page *newpage);
+#else
+static inline int page_cow_file(struct page *oldpage, struct page *newpage)
+{
+ return 0;
+}
+#endif
+
+#endif /* !_MMAP_HACK_H */
diff --git a/fs/tux3/tux3.h b/fs/tux3/tux3.h
index eef92d4cac9f19..04fde94779ba08 100644
--- a/fs/tux3/tux3.h
+++ b/fs/tux3/tux3.h
@@ -674,6 +674,9 @@ int tux3_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
struct buffer_head *__get_buffer(struct page *page, int offset);
void tux3_try_cancel_dirty_page(struct page *page);
+void __tux3_set_page_dirty_account(struct page *page,
+ struct address_space *mapping, int warn);
+int tux3_file_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct address_space_operations tux_file_aops;
extern const struct address_space_operations tux_symlink_aops;
extern const struct address_space_operations tux_blk_aops;
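For a quick functional check of the new path, a trivial userspace test (assuming a tux3 filesystem mounted at the example path /mnt/tux3): the first store to a shared writable mapping takes a write fault and runs ->page_mkwrite(), which is where the page fork happens if the page is stable.

	/* mmap-write smoke test: the memcpy() below triggers ->page_mkwrite(). */
	#include <fcntl.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/tux3/testfile", O_RDWR | O_CREAT, 0644);
		char *p;

		if (fd < 0 || ftruncate(fd, 4096) < 0)
			return 1;
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		memcpy(p, "hello", 5);	/* write fault -> tux3_page_mkwrite() */
		munmap(p, 4096);
		close(fd);
		return 0;
	}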