Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 0/6] ocfs2: wire up {clone, copy, dedupe}_range
Hi all, These patches wire up the existing ocfs2 reflinking capabilities to the new(ish) VFS {copy,clone,dedupe}_range interface. The first few patches clean up some minor bugs that I found; the last kernel patch contains the new code. A few minor fixes to xfstests are needed to make more of the tests run. I'll tack that patch on the end. --D [1] https://github.com/djwong/linux/tree/ocfs2-vfs-reflink
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 1/6] ocfs2: convert inode refcount test to a helper
Replace the open-coded inode refcount flag test with a helper function to reduce the potential for bugs. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/refcounttree.c | 28 +++++++++++++++------------- fs/ocfs2/refcounttree.h | 2 ++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 1923851..59be8f4 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -48,6 +48,12 @@ #include <linux/mount.h> #include <linux/posix_acl.h> +/* Does this inode have the reflink flag set? */ +bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + struct ocfs2_cow_context { struct inode *inode; u32 cow_start; @@ -410,7 +416,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -570,7 +576,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -708,7 +714,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -775,7 +781,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; 
BUG_ON(!ref_blkno); @@ -2299,11 +2305,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2533,7 +2538,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); @@ -2544,7 +2548,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -3412,14 +3416,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3629,11 +3632,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, 
UINT_MAX, @@ -3807,7 +3809,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 6422bbc..553edfb 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -17,6 +17,8 @@ #ifndef OCFS2_REFCOUNTTREE_H #define OCFS2_REFCOUNTTREE_H +bool ocfs2_is_refcount_inode(struct inode *inode); + struct ocfs2_refcount_tree { struct rb_node rf_node; u64 rf_blkno;
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 2/6] ocfs2: add newlines to some error messages
These two error messages are missing the trailing newline. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f..bb2d207 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out;
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 3/6] ocfs2: prohibit refcounted swapfiles
The swapfile mechanism calls bmap once to find all the swap file mappings, which means that we cannot properly support CoW remapping. Therefore, error out if the swap code tries to call bmap on a refcounted file. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/aops.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b97..4d037db 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasses the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. */
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 4/6] ocfs2: budget for extent tree splits when adding refcount flag
When we're adding the refcount flag to an extent, we have to budget enough space to handle a full extent btree split in addition to whatever modifications have to be made to the refcount btree. We don't currently do this, with the result that generic/186 crashes when we need an extent split but not a refcount split because meta_ac never gets allocated. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/refcounttree.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 59be8f4..d92b6c6 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3698,6 +3698,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters,
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 5/6] ocfs2: don't eat io errors during _dio_end_io_write
ocfs2_dio_end_io_write eats whatever errors may happen, which means that write errors do not propagate to userspace. Fix that. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/aops.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4d037db..136a49c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2263,10 +2263,10 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, return ret; } -static void ocfs2_dio_end_io_write(struct inode *inode, - struct ocfs2_dio_write_ctxt *dwc, - loff_t offset, - ssize_t bytes) +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; @@ -2374,6 +2374,8 @@ static void ocfs2_dio_end_io_write(struct inode *inode, if (locked) inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; } /* @@ -2388,6 +2390,7 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; + int ret = 0; if (bytes <= 0) return 0; @@ -2396,13 +2399,13 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); if (private) - ocfs2_dio_end_io_write(inode, private, offset, bytes); + ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - return 0; + return ret; } static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
Darrick J. Wong
2016-Nov-09 22:51 UTC
[Ocfs2-devel] [PATCH 6/6] ocfs2: implement the VFS clone_range, copy_range, and dedupe_range features
Connect the new VFS clone_range, copy_range, and dedupe_range features to the existing reflink capability of ocfs2. Compared to the existing ocfs2 reflink ioctl, we have to do things a little differently to support the VFS semantics (we can clone subranges of a file but we don't clone xattrs), but the VFS ioctls are more broadly supported. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- fs/ocfs2/file.c | 62 ++++- fs/ocfs2/file.h | 3 fs/ocfs2/refcounttree.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/refcounttree.h | 7 + 4 files changed, 688 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234..d5a022d 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -2440,6 +2440,56 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) return offset; } +static ssize_t ocfs2_file_copy_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + size_t len, + unsigned int flags) +{ + int error; + + error = ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); + if (error) + return error; + return len; +} + +static int ocfs2_file_clone_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +#define OCFS2_MAX_DEDUPE_LEN (16 * 1024 * 1024) +static ssize_t ocfs2_file_dedupe_range(struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + 
/* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > OCFS2_MAX_DEDUPE_LEN) + len = OCFS2_MAX_DEDUPE_LEN; + + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2479,6 +2529,9 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .copy_file_range = ocfs2_file_copy_range, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops = { @@ -2524,6 +2577,9 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .copy_file_range = ocfs2_file_copy_range, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e8c62f2..897fd9a 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d92b6c6..3e2198c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -4447,3 +4448,621 @@ int ocfs2_reflink_ioctl(struct inode 
*inode, return error; } + +/* Update destination inode size, if necessary. */ +static int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)d_bh->b_data; + int ret; + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_journal_access_di(handle, INODE_CACHE(dest), d_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) { + i_size_write(dest, newlen); + di->i_size = newlen; + } + spin_unlock(&OCFS2_I(dest)->ip_lock); + + ocfs2_journal_dirty(handle, d_bh); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. 
*/ +static int ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. */ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. */ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. 
*/ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + } + +out: + return ret; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +static int ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + int ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. 
*/ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. */ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. + */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. */ +static int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) +{ + struct inode *inode1; + struct inode *inode2; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + bool same_inode = (s_inode == t_inode); + int status; + + /* First grab the VFS and rw locks. 
*/ + inode1 = s_inode; + inode2 = t_inode; + if (inode1->i_ino > inode2->i_ino) + swap(inode1, inode2); + + inode_lock(inode1); + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + inode_lock_nested(inode2, I_MUTEX_CHILD); + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else + *bh2 = *bh1; + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(*bh1); + *bh1 = NULL; +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + inode_unlock(inode2); + ocfs2_rw_unlock(inode1, 1); +out_i1: + inode_unlock(inode1); + return status; +} + +/* Unlock both inodes and release buffers. 
*/ +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + inode_unlock(s_inode); + brelse(s_bh); + + if (s_inode == t_inode) + return; + + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + inode_unlock(t_inode); + brelse(t_bh); +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *ocfs2_reflink_get_page(struct inode *inode, + loff_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + */ +static int ocfs2_reflink_compare_extents(struct inode *src, + loff_t srcoff, + struct inode *dest, + loff_t destoff, + loff_t len, + bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) { + mlog_errno(-EUCLEAN); + goto out_error; + } + + src_page = ocfs2_reflink_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = ocfs2_reflink_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + 
flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} + +/* Link a range of blocks from one file to another. */ +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + loff_t bs = 1 << OCFS2_SB(inode_in->i_sb)->s_clustersize_bits; + bool same_inode = (inode_in == inode_out); + bool is_same = false; + loff_t isize; + ssize_t ret; + loff_t blen; + + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + /* Don't touch certain kinds of inodes */ + ret = -EPERM; + if (IS_IMMUTABLE(inode_out)) + goto out_unlock; + + ret = -ETXTBSY; + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + goto out_unlock; + + /* Don't reflink dirs, pipes, sockets... 
*/ + ret = -EISDIR; + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + goto out_unlock; + ret = -EINVAL; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + goto out_unlock; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + goto out_unlock; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) { + ret = 0; + goto out_unlock; + } + + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + goto out_unlock; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + goto out_unlock; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + goto out_unlock; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + goto out_unlock; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + len - 1); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + len - 1); + if (ret) + goto out_unlock; + + /* + * Check that the extents are the same. 
+ */ + if (is_dedupe) { + ret = ocfs2_reflink_compare_extents(inode_in, pos_in, + inode_out, pos_out, + len, &is_same); + if (ret) + goto out_unlock; + if (!is_same) { + ret = -EBADE; + goto out_unlock; + } + } + + /* Lock out changes to the allocation maps */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + /* + * Invalidate the page cache so that we can clear any CoW mappings + * in the destination file. + */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, + out_bh, pos_out, len); + + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return 0; + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 553edfb..c023e88 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -117,4 +117,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe); + #endif /* OCFS2_REFCOUNTTREE_H */
Darrick J. Wong
2016-Nov-09 23:00 UTC
[Ocfs2-devel] [PATCH 7/6] xfstests: fix some minor problems testing ocfs2
There are a few things about ocfs2 tools that need special-casing in xfstests, so fix them so that we can start testing ocfs2. Signed-off-by: Darrick J. Wong <darrick.wong at oracle.com> --- common/quota | 2 +- common/rc | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/common/quota b/common/quota index 678bc43..d9bb8d9 100644 --- a/common/quota +++ b/common/quota @@ -34,7 +34,7 @@ _require_quota() _notrun "Installed kernel does not support quotas" fi ;; - gfs2) + gfs2|ocfs2) ;; xfs) if [ ! -f /proc/fs/xfs/xqmstat ]; then diff --git a/common/rc b/common/rc index 8e078da..c75b614 100644 --- a/common/rc +++ b/common/rc @@ -978,7 +978,7 @@ _scratch_mkfs_sized() xfs) def_blksz=`echo $MKFS_OPTIONS|sed -rn 's/.*-b ?size= ?+([0-9]+).*/\1/p'` ;; - ext2|ext3|ext4|ext4dev|udf|btrfs|reiser4) + ext2|ext3|ext4|ext4dev|udf|btrfs|reiser4|ocfs2) def_blksz=`echo $MKFS_OPTIONS| sed -rn 's/.*-b ?+([0-9]+).*/\1/p'` ;; esac @@ -1015,6 +1015,9 @@ _scratch_mkfs_sized() ext2|ext3|ext4|ext4dev) ${MKFS_PROG}.$FSTYP -F $MKFS_OPTIONS -b $blocksize $SCRATCH_DEV $blocks ;; + ocfs2) + yes | ${MKFS_PROG}.$FSTYP -F $MKFS_OPTIONS -b $blocksize $SCRATCH_DEV $blocks + ;; udf) $MKFS_UDF_PROG $MKFS_OPTIONS -b $blocksize $SCRATCH_DEV $blocks ;; @@ -1087,9 +1090,12 @@ _scratch_mkfs_blocksized() xfs) _scratch_mkfs_xfs $MKFS_OPTIONS -b size=$blocksize ;; - ext2|ext3|ext4|ocfs2) + ext2|ext3|ext4) ${MKFS_PROG}.$FSTYP -F $MKFS_OPTIONS -b $blocksize $SCRATCH_DEV ;; + ocfs2) + yes | ${MKFS_PROG}.$FSTYP -F $MKFS_OPTIONS -b $blocksize $SCRATCH_DEV + ;; *) _notrun "Filesystem $FSTYP not supported in _scratch_mkfs_blocksized" ;;
Eric Ren
2016-Nov-11 03:15 UTC
[Ocfs2-devel] [PATCH 0/6] ocfs2: wire up {clone, copy, dedupe}_range
Hi, On 11/10/2016 06:51 AM, Darrick J. Wong wrote: > Hi all, > > These patches wire up the existing ocfs2 reflinking capabilities to > the new(ish) VFS {copy,clone,dedupe}_range interface. The first few > patches clean up some minor bugs that I found; the last kernel patch > contains the new code. > > A few minor fixes to xfstests are needed to make more of the tests > run. I'll tack that patch on the end. FYI, reflink testcases from ocfs2-test both on single and multiple node(s) all passed with your patches. At least, it shows that no obvious regression issue is observed so far ;-) Eric > > --D > > [1] https://github.com/djwong/linux/tree/ocfs2-vfs-reflink > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majordomo at vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html >