Tao Ma
2010-Jul-14 07:52 UTC
[Ocfs2-devel] [PATCH 0/5 v2] Add readahead support in CoW for reflinked files.
Hi all, When we try to CoW some clusters for a reflinked file, we have to read the contents first, allocate some clusters and then map these pages to the new clusters. Currently, we use block_read_full_page to read it, but it is a little bit slower. So this patch set tries to add readahead support for CoW. Before we start, we call readahead first so that the pages can be read at the very start. And during CoW, when we find a readahead page, we know that we need to move the readahead window, so a new async readahead is called. I have a small test to show how readahead speeds up CoW. readahead_test() { MNT_DIR=/mnt/ocfs2 DEVICE=/dev/sda8 echo 'y'|mkfs.ocfs2 --fs-features=local,refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE=$MNT_DIR/$RANDOM REFLINK=$MNT_DIR/$RANDOM dd if=/dev/zero of=$FILE bs=1M count=1000 reflink $FILE $REFLINK dd if=/dev/zero of=$REFLINK bs=1M count=1000 conv=notrunc umount $MNT_DIR } Without this patch set, the 2nd dd has an I/O speed of 22MB/s. With the patch set, the I/O speed is increased to about 40MB/s. Any comments are welcome. Regards, Tao
Tao Ma
2010-Jul-14 07:53 UTC
[Ocfs2-devel] [PATCH 1/5] ocfs2: pass struct file* to ocfs2_write_begin_nolock.
struct file * has file_ra_state to store the readahead state and data. So pass this to ocfs2_write_begin_nolock so that it can be used in ocfs2_refcount_cow. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/aops.c | 5 +++-- fs/ocfs2/aops.h | 3 ++- fs/ocfs2/mmap.c | 7 ++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca2..13a41ef 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1644,7 +1644,8 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) @@ -1852,7 +1853,7 @@ static int ocfs2_write_begin(struct file *file, struct address_space *mapping, */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, flags, pagep, fsdata, di_bh, NULL); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index c48e93f..7606f66 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -48,7 +48,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -int ocfs2_write_begin_nolock(struct address_space *mapping, +int ocfs2_write_begin_nolock(struct file *filp, + struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe..b04d696 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct inode 
*inode, struct buffer_head *di_bh, +static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { int ret; + struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); unsigned int len = PAGE_CACHE_SIZE; @@ -109,7 +110,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, if (page->index == last_index) len = size & ~PAGE_CACHE_MASK; - ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); if (ret) { if (ret != -ENOSPC) @@ -157,7 +158,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) */ down_write(&OCFS2_I(inode)->ip_alloc_sem); - ret = __ocfs2_page_mkwrite(inode, di_bh, page); + ret = __ocfs2_page_mkwrite(vma->vm_file, di_bh, page); up_write(&OCFS2_I(inode)->ip_alloc_sem); -- 1.7.1.GIT
Tao Ma
2010-Jul-14 07:53 UTC
[Ocfs2-devel] [PATCH 2/5] ocfs2: pass struct file* to ocfs2_prepare_inode_for_write.
struct file * has file_ra_state to store the readahead state and data. So pass this to ocfs2_prepare_inode_for_write so that it can be used in ocfs2_refcount_cow. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/file.c | 9 ++++++--- 1 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6a13ea6..d61137e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1864,6 +1864,7 @@ out: } static int ocfs2_prepare_inode_for_refcount(struct inode *inode, + struct file *file, loff_t pos, size_t count, int *meta_level) { @@ -1889,7 +1890,7 @@ out: return ret; } -static int ocfs2_prepare_inode_for_write(struct dentry *dentry, +static int ocfs2_prepare_inode_for_write(struct file *file, loff_t *ppos, size_t count, int appending, @@ -1897,6 +1898,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, int *has_refcount) { int ret = 0, meta_level = 0; + struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos, end; @@ -1952,6 +1954,7 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, meta_level = -1; ret = ocfs2_prepare_inode_for_refcount(inode, + file, saved_pos, count, &meta_level); @@ -2066,7 +2069,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + ret = ocfs2_prepare_inode_for_write(file, ppos, iocb->ki_left, appending, &can_do_direct, &has_refcount); if (ret < 0) { @@ -2196,7 +2199,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, { int ret; - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, + ret = ocfs2_prepare_inode_for_write(out, &sd->pos, sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); -- 1.7.1.GIT
Tao Ma
2010-Jul-14 07:53 UTC
[Ocfs2-devel] [PATCH 3/5] ocfs2: Add struct file to ocfs2_refcount_cow.
Add a new parameter 'struct file *' to ocfs2_refcount_cow so that we can add readahead support later. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/file.c | 4 ++-- fs/ocfs2/refcounttree.c | 4 +++- fs/ocfs2/refcounttree.h | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 13a41ef..968ae3d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1691,7 +1691,7 @@ int ocfs2_write_begin_nolock(struct file *filp, mlog_errno(ret); goto out; } else if (ret == 1) { - ret = ocfs2_refcount_cow(inode, di_bh, + ret = ocfs2_refcount_cow(inode, filp, di_bh, wc->w_cpos, wc->w_clen, UINT_MAX); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d61137e..ab1d0ce 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -361,7 +361,7 @@ static int ocfs2_cow_file_pos(struct inode *inode, if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) goto out; - return ocfs2_refcount_cow(inode, fe_bh, cpos, 1, cpos+1); + return ocfs2_refcount_cow(inode, NULL, fe_bh, cpos, 1, cpos+1); out: return status; @@ -1882,7 +1882,7 @@ static int ocfs2_prepare_inode_for_refcount(struct inode *inode, *meta_level = 1; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); + ret = ocfs2_refcount_cow(inode, file, di_bh, cpos, clusters, UINT_MAX); if (ret) mlog_errno(ret); out: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 4793f36..7636174 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3398,6 +3398,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) * unrefcounted extent. */ static int ocfs2_refcount_cow_hunk(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3475,6 +3476,7 @@ out: * clusters between cpos and cpos+write_len are safe to modify. 
*/ int ocfs2_refcount_cow(struct inode *inode, + struct file *file, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos) { @@ -3494,7 +3496,7 @@ int ocfs2_refcount_cow(struct inode *inode, num_clusters = write_len; if (ext_flags & OCFS2_EXT_REFCOUNTED) { - ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, + ret = ocfs2_refcount_cow_hunk(inode, file, di_bh, cpos, num_clusters, max_cpos); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 9983ba1..29cba0e 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -52,7 +52,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u32 clusters, int *credits, int *ref_blocks); -int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_refcount_cow(struct inode *inode, + struct file *filep, struct buffer_head *di_bh, u32 cpos, u32 write_len, u32 max_cpos); typedef int (ocfs2_post_refcount_func)(struct inode *inode, -- 1.7.1.GIT
Tao Ma
2010-Jul-14 07:53 UTC
[Ocfs2-devel] [PATCH 4/5] ocfs2: Add readahead support for CoW.
Add a new function ocfs2_readahead_for_cow so that we start readahead before we start our CoW. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 24 ++++++++++++++++++++++++ 1 files changed, 24 insertions(+), 0 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7636174..03ec6ac 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3392,6 +3392,28 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) return ret; } +static void ocfs2_readahead_for_cow(struct inode *inode, + struct file *file, + u32 start, u32 len) +{ + struct address_space *mapping; + pgoff_t index; + unsigned long num_pages; + int cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits; + + if (!file) + return; + + mapping = file->f_mapping; + num_pages = (len << cs_bits) >> PAGE_CACHE_SHIFT; + if (!num_pages) + num_pages = 1; + + index = ((loff_t)start << cs_bits) >> PAGE_CACHE_SHIFT; + page_cache_sync_readahead(mapping, &file->f_ra, file, + index, num_pages); +} + /* * Starting at cpos, try to CoW write_len clusters. Don't CoW * past max_cpos. This will stop when it runs into a hole or an @@ -3427,6 +3449,8 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, BUG_ON(cow_len == 0); + ocfs2_readahead_for_cow(inode, file, cow_start, cow_len); + context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); if (!context) { ret = -ENOMEM; -- 1.7.1.GIT
In CoW, when we meet a readahead page, we know it is time to move the readahead window. So carry out a new readahead. Signed-off-by: Tao Ma <tao.ma at oracle.com> --- fs/ocfs2/refcounttree.c | 15 ++++++++++++++- 1 files changed, 14 insertions(+), 1 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 03ec6ac..df47182 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -49,6 +49,7 @@ struct ocfs2_cow_context { struct inode *inode; + struct file *file; u32 cow_start; u32 cow_len; struct ocfs2_extent_tree data_et; @@ -2922,13 +2923,16 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); struct page *page; pgoff_t page_index; - unsigned int from, to; + unsigned int from, to, readahead_pages; loff_t offset, end, map_end; struct address_space *mapping = context->inode->i_mapping; mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, new_cluster, new_len, cpos); + readahead_pages = + (ocfs2_cow_contig_clusters(sb) << + OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT; offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); @@ -2953,6 +2957,14 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize) BUG_ON(PageDirty(page)); + if (PageReadahead(page) && context->file) { + page_cache_async_readahead(mapping, + &context->file->f_ra, + context->file, + page, page_index, + readahead_pages); + } + if (!PageUptodate(page)) { ret = block_read_full_page(page, ocfs2_get_block); if (ret) { @@ -3472,6 +3484,7 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, context->ref_root_bh = ref_root_bh; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; context->get_clusters = ocfs2_di_get_clusters; + context->file = file; ocfs2_init_dinode_extent_tree(&context->data_et, INODE_CACHE(inode), di_bh); -- 
1.7.1.GIT
Joel Becker
2010-Jul-15 02:31 UTC
[Ocfs2-devel] [PATCH 0/5 v2] Add readahead support in CoW for reflinked files.
On Wed, Jul 14, 2010 at 03:52:26PM +0800, Tao Ma wrote: > When we try to CoW some clusters for a reflinked file, we have to > read the contents first, allocate some clusters and then map these > pages to the new clusters. Currently, we use block_read_full_page to > read it, but it is a little bit slower. > So this patch set try to add readahead support for CoW. Before we > start, we call readahead first so that the pages can be read at the > very first. And during CoW, when we find a readahead page, we know > that we need to move the readahead window, so a new asyncreadahead > is called. I think this looks good. Can you create a branch in your git tree for it? I'll pull when I build merge-window. Joel -- "Conservative, n. A statesman who is enamoured of existing evils, as distinguished from the Liberal, who wishes to replace them with others." - Ambrose Bierce, The Devil's Dictionary Joel Becker Consulting Software Developer Oracle E-mail: joel.becker at oracle.com Phone: (650) 506-8127