Tao Ma
2010-Jun-29 08:27 UTC
[Ocfs2-devel] [PATCH 0/4] Add readahead support in CoW for reflinked files.
Hi all, When we try to CoW some clusters for a reflinked file, we have to read the contents first, allocate some clusters and then map these pages to the new clusters. Currently, we use block_read_full_page to read it, but it is a little bit slower. So this patch set try to add readahead support for CoW. Before we start, we call readahead first so that the pages can be read at the very first. And during CoW, when we find a readahead page, we know that we need to move the readahead window, so a new asyncreadahead is called. I have a small test to show how readahead speed up CoW. readahead_test() { MNT_DIR=/mnt/ocfs2 DEVICE=/dev/sda8 echo 'y'|mkfs.ocfs2 --fs-features=local,refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE=$MNT_DIR/$RANDOM REFLINK=$MNT_DIR/$RANDOM dd if=/dev/zero of=$FILE bs=1M count=1000 reflink $FILE $REFLINK dd if=/dev/zero of=$REFLINK bs=1M count=1000 conv=notrunc umount $MNT_DIR } Without these patch set, the 2nd dd has a i/o speed of 22MB/s. with the patch set, the i/o speed is increased to about 40MB/s. Any comments are welcomed. Regards, Tao
Tao Ma
2010-Jun-29 08:35 UTC
[Ocfs2-devel] [PATCH 1/4] ocfs2: Add struct file to ocfs2_refcount_cow.
Add a new parameter 'struct file *' to ocfs2_refcount_cow so that we can add readahead support later. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/file.c | 13 ++++++++----- fs/ocfs2/refcounttree.c | 4 +++- fs/ocfs2/refcounttree.h | 3 ++- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4dfaa6e..5f94923 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1691,7 +1691,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, di_bh, + ret = ocfs2_refcount_cow(inode, NULL, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ca527ca..06780f0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -361,7 +361,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); out: return status; @@ -1864,6 +1864,7 @@ out: } static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, loff_t pos, size_t count, int *meta_level) { @@ -1881,7 +1882,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: @@ -1889,7 +1890,7 @@ out: return ret; } -static int ocfs2_prepare_inode_for_write(struct dentry *dentry, +static int ocfs2_prepare_inode_for_write(struct file *file, loff_t *ppos, size_t count, int appending, @@ -1897,6 +1898,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, int *has_refcount) { int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos, end; @@ -1952,6 +1954,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, meta_level = -1; ret = ocfs2_prepare_inode_for_refcount(inode, + file, saved_pos, count, &meta_level); @@ -2066,7 +2069,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, &can_do_direct, &has_refcount); if (ret < 0) { @@ -2193,7 +2196,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, { int ret; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 4793f36..7636174 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3398,6 +3398,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3475,6 +3476,7 @@ out: * clusters between cpos and cpos+write_len are safe to modify. */ int ocfs2_refcount_cow(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3494,7 +3496,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1..29cba0e 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u32 clusters, int *credits, int *ref_blocks); -int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_refcount_cow(struct inode *inode, + struct file *filep, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, -- 1.7.1.GIT
Tao Ma
2010-Jun-29 08:35 UTC
[Ocfs2-devel] [PATCH 2/4] ocfs2: Add readahead support for CoW.
Add a new function ocfs2_readahead_for_cow so that we start readahead before we start our CoW. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 24 ++++++++++++++++++++++++ 1 files changed, 24 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7636174..03ec6ac 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3392,6 +3392,28 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an @@ -3427,6 +3449,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; -- 1.7.1.GIT
In CoW, when we meet with a readahead page, we know it is time to move the readahead window. So carry out a new readahead. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 15 ++++++++++++++- 1 files changed, 14 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 03ec6ac..df47182 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,6 +49,7 @@ struct ocfs2_cow_context { struct inode *inode; + struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -2922,13 +2923,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to; + unsigned int from, to, readahead_pages; loff_t offset, end, map_end; struct address_space *mapping = context->inode->i_mapping; mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, new_cluster, new_len, cpos); + readahead_pages + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); @@ -2953,6 +2957,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); + if (PageReadahead(page) && context->file) { + page_cache_async_readahead(mapping, + &context->file->f_ra, + context->file, + page, page_index, + readahead_pages); + } + if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3472,6 +3484,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; + context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); -- 1.7.1.GIT
Tao Ma
2010-Jun-29 08:35 UTC
[Ocfs2-devel] [PATCH 4/4] ocfs2: make mmap CoW work with readahead.
refcount_cow need file * to do the readahead. So we just add 'struct file *' to ocfs2_write_begin_nolock and __ocfs2_page_mkwrite and let them pass this parameter to ocfs2_refcount_cow. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/aops.c | 7 ++++--- fs/ocfs2/aops.h | 3 ++- fs/ocfs2/mmap.c | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5f94923..337e874 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1645,7 +1645,8 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *file, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) @@ -1691,7 +1692,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, NULL, di_bh, + ret = ocfs2_refcount_cow(inode, file, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); @@ -1853,7 +1854,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93f..e99ac0f 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *file, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe..7fcb107 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -59,8 +59,8 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, - struct page *page) +static int __ocfs2_page_mkwrite(struct file *file, struct inode *inode, + struct buffer_head *di_bh, struct page *page) { int ret; struct address_space *mapping = inode->i_mapping; @@ -109,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, if (page->index == last_index) len = size & ~PAGE_CACHE_MASK; - ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) @@ -157,7 +157,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(inode, di_bh, page); + ret = __ocfs2_page_mkwrite(vma->vm_file, inode, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); -- 1.7.1.GIT